From 7ac504472263180745ac94242f1d253eb7284e48 Mon Sep 17 00:00:00 2001 From: Tomasz Majchrzak Date: Mon, 13 Jun 2016 15:51:19 +0200 Subject: raid1/raid10: slow down resync if there is non-resync activity pending A performance drop of mkfs has been observed on RAID10 during resync since commit 09314799e4f0 ("md: remove 'go_faster' option from ->sync_request()"). Resync sends so many IOs it slows down non-resync IOs significantly (few times). Add a short delay to a resync. The previous long sleep (1s) has proven unnecessary, even very short delay brings performance right. The change also applied to raid1. The problem has not been observed on raid1, however it shares barriers code with raid10 so it might be an issue for some setup too. Suggested-by: NeilBrown Link: http://lkml.kernel.org/r/20160609134555.GA9104@proton.igk.intel.com Signed-off-by: Tomasz Majchrzak Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c7c8cde0ab21..358a08e656f6 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2535,6 +2535,13 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, return sync_blocks; } + /* + * If there is non-resync activity waiting for a turn, then let it + * though before starting on this new sync request. + */ + if (conf->nr_waiting) + schedule_timeout_uninterruptible(1); + /* we are incrementing sector_nr below. To be safe, we check against * sector_nr + two times RESYNC_SECTORS */ -- cgit v1.2.3 From 414e6b9a7032a6c2f5ddf018fdb199190b075170 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Jun 2016 16:19:52 +1000 Subject: md/raid1, raid10: don't recheck "Faulty" flag in read-balance. Re-checking the faulty flag here brings no value. The comment about "risk" refers to the risk that the device could be in the process of being removed by ->hot_remove_disk(). However providing that the ->nr_pending count is incremented inside an rcu_read_locked() region, there is no risk of that happening. This is because the rdev pointer (in the personalities array) is set to NULL before synchronize_rcu(), and ->nr_pending is tested afterwards. If the rcu_read_locked region happens before the synchronize_rcu(), the test will see that nr_pending has been incremented. If it happens afterwards, the rdev pointer will be NULL so there is nothing to increment. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 7 ------- drivers/md/raid10.c | 8 -------- 2 files changed, 15 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 358a08e656f6..f6c3bd4913eb 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -689,13 +689,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect if (!rdev) goto retry; atomic_inc(&rdev->nr_pending); - if (test_bit(Faulty, &rdev->flags)) { - /* cannot risk returning a device that failed - * before we inc'ed nr_pending - */ - rdev_dec_pending(rdev, conf->mddev); - goto retry; - } sectors = best_good_sectors; if (conf->mirrors[best_disk].next_seq_sect != this_sector) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 3578d3aa9ee3..ae4dce1cbc42 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -707,7 +707,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, raid10_find_phys(conf, r10_bio); rcu_read_lock(); -retry: sectors = r10_bio->sectors; best_slot = -1; best_rdev = NULL; @@ -804,13 +803,6 @@ retry: if (slot >= 0) { atomic_inc(&rdev->nr_pending); - if (test_bit(Faulty, &rdev->flags)) { - /* Cannot risk returning a device that failed - * before we inc'ed nr_pending - */ - rdev_dec_pending(rdev, conf->mddev); - goto retry; - } r10_bio->read_slot = slot; } else rdev = NULL; -- cgit v1.2.3 From e5872d58f5ad179fc03267f12257bee4159aace6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Jun 2016 16:19:52 +1000 Subject: md/raid1: small cleanup in raid1_end_read/write_request Both functions use conf->mirrors[mirror].rdev several times, so improve readability by storing this in a local variable. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f6c3bd4913eb..588608dcb780 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -319,14 +319,13 @@ static void raid1_end_read_request(struct bio *bio) { int uptodate = !bio->bi_error; struct r1bio *r1_bio = bio->bi_private; - int mirror; struct r1conf *conf = r1_bio->mddev->private; + struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev; - mirror = r1_bio->read_disk; /* * this branch is our 'one mirror IO has finished' event handler: */ - update_head_pos(mirror, r1_bio); + update_head_pos(r1_bio->read_disk, r1_bio); if (uptodate) set_bit(R1BIO_Uptodate, &r1_bio->state); @@ -339,14 +338,14 @@ static void raid1_end_read_request(struct bio *bio) spin_lock_irqsave(&conf->device_lock, flags); if (r1_bio->mddev->degraded == conf->raid_disks || (r1_bio->mddev->degraded == conf->raid_disks-1 && - test_bit(In_sync, &conf->mirrors[mirror].rdev->flags))) + test_bit(In_sync, &rdev->flags))) uptodate = 1; spin_unlock_irqrestore(&conf->device_lock, flags); } if (uptodate) { raid_end_bio_io(r1_bio); - rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); + rdev_dec_pending(rdev, conf->mddev); } else { /* * oops, read error: @@ -356,7 +355,7 @@ static void raid1_end_read_request(struct bio *bio) KERN_ERR "md/raid1:%s: %s: " "rescheduling sector %llu\n", mdname(conf->mddev), - bdevname(conf->mirrors[mirror].rdev->bdev, + bdevname(rdev->bdev, b), (unsigned long long)r1_bio->sector); set_bit(R1BIO_ReadError, &r1_bio->state); @@ -403,20 +402,18 @@ static void r1_bio_write_done(struct r1bio *r1_bio) static void raid1_end_write_request(struct bio *bio) { struct r1bio *r1_bio = bio->bi_private; - int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); + int behind = test_bit(R1BIO_BehindIO, &r1_bio->state); struct r1conf *conf = r1_bio->mddev->private; struct bio *to_put = NULL; - - mirror = find_bio_disk(r1_bio, bio); + int mirror = find_bio_disk(r1_bio, bio); + struct md_rdev *rdev = conf->mirrors[mirror].rdev; /* * 'one mirror IO has finished' event handler: */ if (bio->bi_error) { - set_bit(WriteErrorSeen, - &conf->mirrors[mirror].rdev->flags); - if (!test_and_set_bit(WantReplacement, - &conf->mirrors[mirror].rdev->flags)) + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & conf->mddev->recovery); @@ -445,13 +442,12 @@ static void raid1_end_write_request(struct bio *bio) * before rdev->recovery_offset, but for simplicity we don't * check this here. */ - if (test_bit(In_sync, &conf->mirrors[mirror].rdev->flags) && - !test_bit(Faulty, &conf->mirrors[mirror].rdev->flags)) + if (test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) set_bit(R1BIO_Uptodate, &r1_bio->state); /* Maybe we can clear some bad blocks. */ - if (is_badblock(conf->mirrors[mirror].rdev, - r1_bio->sector, r1_bio->sectors, + if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, &first_bad, &bad_sectors)) { r1_bio->bios[mirror] = IO_MADE_GOOD; set_bit(R1BIO_MadeGood, &r1_bio->state); @@ -459,7 +455,7 @@ static void raid1_end_write_request(struct bio *bio) } if (behind) { - if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) + if (test_bit(WriteMostly, &rdev->flags)) atomic_dec(&r1_bio->behind_remaining); /* @@ -483,8 +479,7 @@ static void raid1_end_write_request(struct bio *bio) } } if (r1_bio->bios[mirror] == NULL) - rdev_dec_pending(conf->mirrors[mirror].rdev, - conf->mddev); + rdev_dec_pending(rdev, conf->mddev); /* * Let's see if all mirrored write operations have finished -- cgit v1.2.3 From 854abd75841413f7966bc4fed83b36e78a1c285c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Jun 2016 16:19:52 +1000 Subject: md/raid1: small code cleanup in end_sync_write 'mirror' is only used to find 'rdev', several times. So just find 'rdev' once, and use it instead. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 588608dcb780..60c293df03f1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1709,11 +1709,9 @@ static void end_sync_write(struct bio *bio) struct r1bio *r1_bio = bio->bi_private; struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; - int mirror=0; sector_t first_bad; int bad_sectors; - - mirror = find_bio_disk(r1_bio, bio); + struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; if (!uptodate) { sector_t sync_blocks = 0; @@ -1726,16 +1724,12 @@ static void end_sync_write(struct bio *bio) s += sync_blocks; sectors_to_go -= sync_blocks; } while (sectors_to_go > 0); - set_bit(WriteErrorSeen, - &conf->mirrors[mirror].rdev->flags); - if (!test_and_set_bit(WantReplacement, - &conf->mirrors[mirror].rdev->flags)) + set_bit(WriteErrorSeen, &rdev->flags); + if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, & mddev->recovery); set_bit(R1BIO_WriteError, &r1_bio->state); - } else if (is_badblock(conf->mirrors[mirror].rdev, - r1_bio->sector, - r1_bio->sectors, + } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, &first_bad, &bad_sectors) && !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, r1_bio->sector, -- cgit v1.2.3 From 707a6a420ccf31634f2b15d8f06f09536e2de079 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Jun 2016 16:19:52 +1000 Subject: md/raid1: add rcu protection to rdev in fix_read_error Since remove_and_add_spares() was added to hot_remove_disk() it has been possible for an rdev to be hot-removed while fix_read_error() was running, so we need to be more careful, and take a reference to the rdev while performing IO. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 52 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 20 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 60c293df03f1..34f20c03d1f6 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2056,29 +2056,30 @@ static void fix_read_error(struct r1conf *conf, int read_disk, s = PAGE_SIZE >> 9; do { - /* Note: no rcu protection needed here - * as this is synchronous in the raid1d thread - * which is the thread that might remove - * a device. If raid1d ever becomes multi-threaded.... - */ sector_t first_bad; int bad_sectors; - rdev = conf->mirrors[d].rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && (test_bit(In_sync, &rdev->flags) || (!test_bit(Faulty, &rdev->flags) && rdev->recovery_offset >= sect + s)) && is_badblock(rdev, sect, s, - &first_bad, &bad_sectors) == 0 && - sync_page_io(rdev, sect, s<<9, - conf->tmppage, READ, false)) - success = 1; - else { - d++; - if (d == conf->raid_disks * 2) - d = 0; - } + &first_bad, &bad_sectors) == 0) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + if (sync_page_io(rdev, sect, s<<9, + conf->tmppage, READ, false)) + success = 1; + rdev_dec_pending(rdev, mddev); + if (success) + break; + } else + rcu_read_unlock(); + d++; + if (d == conf->raid_disks * 2) + d = 0; } while (!success && d != read_disk); if (!success) { @@ -2094,11 +2095,17 @@ static void fix_read_error(struct r1conf *conf, int read_disk, if (d==0) d = conf->raid_disks * 2; d--; - rdev = conf->mirrors[d].rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && - !test_bit(Faulty, &rdev->flags)) + !test_bit(Faulty, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); r1_sync_page_io(rdev, sect, s, conf->tmppage, WRITE); + rdev_dec_pending(rdev, mddev); + } else + rcu_read_unlock(); } d = start; while (d != read_disk) { @@ -2106,9 +2113,12 @@ static void fix_read_error(struct r1conf *conf, int read_disk, if (d==0) d = conf->raid_disks * 2; d--; - rdev = conf->mirrors[d].rdev; + rcu_read_lock(); + rdev = rcu_dereference(conf->mirrors[d].rdev); if (rdev && !test_bit(Faulty, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); if (r1_sync_page_io(rdev, sect, s, conf->tmppage, READ)) { atomic_add(s, &rdev->corrected_errors); @@ -2117,10 +2127,12 @@ static void fix_read_error(struct r1conf *conf, int read_disk, "(%d sectors at %llu on %s)\n", mdname(mddev), s, (unsigned long long)(sect + - rdev->data_offset), + rdev->data_offset), bdevname(rdev->bdev, b)); } - } + rdev_dec_pending(rdev, mddev); + } else + rcu_read_unlock(); } sectors -= s; sect += s; -- cgit v1.2.3 From d787be4092e27728cb4c012bee9762098ef3c662 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Jun 2016 16:19:53 +1000 Subject: md: reduce the number of synchronize_rcu() calls when multiple devices fail. Every time a device is removed with ->hot_remove_disk() a synchronize_rcu() call is made which can delay several milliseconds in some case. If lots of devices fail at once - as could happen with a large RAID10 where one set of devices are removed all at once - these delays can add up to be very inconcenient. As failure is not reversible we can check for that first, setting a separate flag if it is found, and then all synchronize_rcu() once for all the flagged devices. Then ->hot_remove_disk() function can skip the synchronize_rcu() step if the flag is set. fix build error(Shaohua) Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 29 ++++++++++++++++++++++++++--- drivers/md/md.h | 5 +++++ drivers/md/multipath.c | 14 ++++++++------ drivers/md/raid1.c | 17 ++++++++++------- drivers/md/raid10.c | 19 +++++++++++-------- drivers/md/raid5.c | 15 +++++++++------ 6 files changed, 69 insertions(+), 30 deletions(-) (limited to 'drivers/md/raid1.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 0793754eeffd..2ed547f5c3b6 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8180,15 +8180,34 @@ static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *rdev; int spares = 0; int removed = 0; + bool remove_some = false; - rdev_for_each(rdev, mddev) + rdev_for_each(rdev, mddev) { + if ((this == NULL || rdev == this) && + rdev->raid_disk >= 0 && + !test_bit(Blocked, &rdev->flags) && + test_bit(Faulty, &rdev->flags) && + atomic_read(&rdev->nr_pending)==0) { + /* Faulty non-Blocked devices with nr_pending == 0 + * never get nr_pending incremented, + * never get Faulty cleared, and never get Blocked set. + * So we can synchronize_rcu now rather than once per device + */ + remove_some = true; + set_bit(RemoveSynchronized, &rdev->flags); + } + } + + if (remove_some) + synchronize_rcu(); + rdev_for_each(rdev, mddev) { if ((this == NULL || rdev == this) && rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && - (test_bit(Faulty, &rdev->flags) || + ((test_bit(RemoveSynchronized, &rdev->flags) || (!test_bit(In_sync, &rdev->flags) && !test_bit(Journal, &rdev->flags))) && - atomic_read(&rdev->nr_pending)==0) { + atomic_read(&rdev->nr_pending)==0)) { if (mddev->pers->hot_remove_disk( mddev, rdev) == 0) { sysfs_unlink_rdev(mddev, rdev); @@ -8196,6 +8215,10 @@ static int remove_and_add_spares(struct mddev *mddev, removed++; } } + if (remove_some && test_bit(RemoveSynchronized, &rdev->flags)) + clear_bit(RemoveSynchronized, &rdev->flags); + } + if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); diff --git a/drivers/md/md.h b/drivers/md/md.h index 03b19aad4921..dc65ca65b26e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -163,6 +163,11 @@ enum flag_bits { * than other devices in the array */ ClusterRemove, + RemoveSynchronized, /* synchronize_rcu() was called after + * this device was known to be faulty, + * so it is safe to remove without + * another synchronize_rcu() call. + */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 7eb9972a37e6..c145a5a114eb 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -298,12 +298,14 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } } err = md_integrity_register(mddev); } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 34f20c03d1f6..5027ef4752ac 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1656,13 +1656,16 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } p->rdev = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } else if (conf->mirrors[conf->raid_disks + number].rdev) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } + } + if (conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. We drain all IO before * doing this to avoid confusion. diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 34facda18e72..8ee5d96e6a2d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1766,7 +1766,7 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) err = -EBUSY; goto abort; } - /* Only remove faulty devices if recovery + /* Only remove non-faulty devices if recovery * is not possible. */ if (!test_bit(Faulty, &rdev->flags) && @@ -1778,13 +1778,16 @@ static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - goto abort; - } else if (p->replacement) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + goto abort; + } + } + if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; clear_bit(Replacement, &p->replacement->flags); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f6a191aaaa91..413cc7d847da 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7197,12 +7197,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) goto abort; } *rdevp = NULL; - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - *rdevp = rdev; - } else if (p->replacement) { + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + *rdevp = rdev; + } + } + if (p->replacement) { /* We must have just cleared 'rdev' */ p->rdev = p->replacement; clear_bit(Replacement, &p->replacement->flags); -- cgit v1.2.3