From 4ed8731d8e6bd2a88a30697fbf4f7e6e979a6c46 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:34:00 +1100 Subject: MD: change the parameter of md thread Change the thread parameter, so the thread can carry extra info. Next patch will use it. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 7a2b0793f66e..8e842b326ebd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6641,7 +6641,7 @@ static int md_thread(void * arg) clear_bit(THREAD_WAKEUP, &thread->flags); if (!kthread_should_stop()) - thread->run(thread->mddev); + thread->run(thread); } return 0; @@ -6656,8 +6656,8 @@ void md_wakeup_thread(struct md_thread *thread) } } -struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev, - const char *name) +struct md_thread *md_register_thread(void (*run) (struct md_thread *), + struct mddev *mddev, const char *name) { struct md_thread *thread; @@ -7206,8 +7206,9 @@ EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) -void md_do_sync(struct mddev *mddev) +void md_do_sync(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct mddev *mddev2; unsigned int currspeed = 0, window; -- cgit v1.2.3 From 1ca69c4bc4b1ef889861db39f325901eadbf9497 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:37:33 +1100 Subject: md: avoid taking the mutex on some ioctls. Some ioctls don't need to take the mutex and doing so can cause a delay as it is held during super-block update. So move those ioctls out of the mutex and rely on rcu locking to ensure we don't access stale data. Signed-off-by: NeilBrown --- drivers/md/md.c | 85 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 23 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8e842b326ebd..feab588adb50 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -674,7 +674,18 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) return NULL; } -static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) +static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->desc_nr == nr) + return rdev; + + return NULL; +} + +static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) { struct md_rdev *rdev; @@ -685,6 +696,17 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) return NULL; } +static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) +{ + struct md_rdev *rdev; + + rdev_for_each_rcu(rdev, mddev) + if (rdev->bdev->bd_dev == dev) + return rdev; + + return NULL; +} + static struct md_personality *find_pers(int level, char *clevel) { struct md_personality *pers; @@ -5509,8 +5531,9 @@ static int get_array_info(struct mddev * mddev, void __user * arg) int nr,working,insync,failed,spare; struct md_rdev *rdev; - nr=working=insync=failed=spare=0; - rdev_for_each(rdev, mddev) { + nr = working = insync = failed = spare = 0; + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { nr++; if (test_bit(Faulty, &rdev->flags)) failed++; @@ -5522,6 +5545,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) spare++; } } + rcu_read_unlock(); info.major_version = mddev->major_version; info.minor_version = mddev->minor_version; @@ -5605,7 +5629,8 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; - rdev = find_rdev_nr(mddev, info.number); + rcu_read_lock(); + rdev = find_rdev_nr_rcu(mddev, info.number); if (rdev) { info.major = MAJOR(rdev->bdev->bd_dev); info.minor = MINOR(rdev->bdev->bd_dev); @@ -5624,6 +5649,7 @@ static int get_disk_info(struct mddev * mddev, void __user * arg) info.raid_disk = -1; info.state = (1<pers == NULL) return -ENODEV; - rdev = find_rdev(mddev, dev); + rcu_read_lock(); + rdev = find_rdev_rcu(mddev, dev); if (!rdev) - return -ENODEV; - - md_error(mddev, rdev); - if (!test_bit(Faulty, &rdev->flags)) - return -EBUSY; - return 0; + err = -ENODEV; + else { + md_error(mddev, rdev); + if (!test_bit(Faulty, &rdev->flags)) + err = -EBUSY; + } + rcu_read_unlock(); + return err; } /* @@ -6315,6 +6345,27 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, goto abort; } + /* Some actions do not requires the mutex */ + switch (cmd) { + case GET_ARRAY_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_array_info(mddev, argp); + goto abort; + + case GET_DISK_INFO: + if (!mddev->raid_disks && !mddev->external) + err = -ENODEV; + else + err = get_disk_info(mddev, argp); + goto abort; + + case SET_DISK_FAULTY: + err = set_disk_faulty(mddev, new_decode_dev(arg)); + goto abort; + } + err = mddev_lock(mddev); if (err) { printk(KERN_INFO @@ -6387,18 +6438,10 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, */ switch (cmd) { - case GET_ARRAY_INFO: - err = get_array_info(mddev, argp); - goto done_unlock; - case GET_BITMAP_FILE: err = get_bitmap_file(mddev, argp); goto done_unlock; - case GET_DISK_INFO: - err = get_disk_info(mddev, argp); - goto done_unlock; - case RESTART_ARRAY_RW: err = restart_array(mddev); goto done_unlock; @@ -6480,10 +6523,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = hot_add_disk(mddev, new_decode_dev(arg)); goto done_unlock; - case SET_DISK_FAULTY: - err = set_disk_faulty(mddev, new_decode_dev(arg)); - goto done_unlock; - case RUN_ARRAY: err = do_md_run(mddev); goto done_unlock; -- cgit v1.2.3 From 2863b9eb44787adecba4f977d71d7fd876805b1c Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:38:58 +1100 Subject: MD RAID10: Prep for DM RAID10 device replacement capability MD RAID10: Fix a couple potential kernel panics if RAID10 is used by dm-raid When device-mapper uses the RAID10 personality through dm-raid.c, there is no 'gendisk' structure in mddev and some sysfs information is also not populated. This patch avoids touching those non-existent structures. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/md.c | 10 ++++++++-- drivers/md/raid10.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index feab588adb50..85e6786653ea 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2044,8 +2044,14 @@ EXPORT_SYMBOL(md_integrity_register); /* Disable data integrity if non-capable/non-matching disk is being added */ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { - struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); - struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); + struct blk_integrity *bi_rdev; + struct blk_integrity *bi_mddev; + + if (!mddev->gendisk) + return; + + bi_rdev = bdev_get_integrity(rdev->bdev); + bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ return; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 54860604d097..fb5bd607e15c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1694,7 +1694,7 @@ static int raid10_spare_active(struct mddev *mddev) && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; - sysfs_notify_dirent(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); -- cgit v1.2.3 From 7f7583d420231b9d09897afd57a957011b606a5b Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Thu, 11 Oct 2012 14:17:59 +1100 Subject: Subject: [PATCH] md:change resync_mismatches to atomic64_t to avoid races Now that multiple threads can handle stripes, it is safer to use an atomic64_t for resync_mismatches, to avoid update races. Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++++--- drivers/md/md.h | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 4 ++-- 5 files changed, 9 insertions(+), 8 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 85e6786653ea..7564c44b8045 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4269,7 +4269,8 @@ static ssize_t mismatch_cnt_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", - (unsigned long long) mddev->resync_mismatches); + (unsigned long long) + atomic64_read(&mddev->resync_mismatches)); } static struct md_sysfs_entry md_scan_mode = @@ -5235,7 +5236,7 @@ static void md_clean(struct mddev *mddev) mddev->new_layout = 0; mddev->new_chunk_sectors = 0; mddev->curr_resync = 0; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); mddev->suspend_lo = mddev->suspend_hi = 0; mddev->sync_speed_min = mddev->sync_speed_max = 0; mddev->recovery = 0; @@ -7357,7 +7358,7 @@ void md_do_sync(struct md_thread *thread) * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); /* we don't use the checkpoint if there's a bitmap */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) j = mddev->resync_min; diff --git a/drivers/md/md.h b/drivers/md/md.h index 93ada214b50e..af443ab868db 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -282,7 +282,7 @@ struct mddev { sector_t resync_max_sectors; /* may be set by personality */ - sector_t resync_mismatches; /* count of sectors where + atomic64_t resync_mismatches; /* count of sectors where * parity/replica mismatch found */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e913356b257e..8034fbd6190c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1876,7 +1876,7 @@ static int process_checks(struct r1bio *r1_bio) } else j = 0; if (j >= 0) - mddev->resync_mismatches += r1_bio->sectors; + atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { /* No need to write to this device. */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index fb5bd607e15c..146749b277c6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2011,7 +2011,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) break; if (j == vcnt) continue; - mddev->resync_mismatches += r10_bio->sectors; + atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) /* Don't fix anything. */ continue; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ab613efbbead..4cc6b0e398bc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2973,7 +2973,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, */ set_bit(STRIPE_INSYNC, &sh->state); else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -3125,7 +3125,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, */ } } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); -- cgit v1.2.3 From 48c26ddc9f85808632a63b3ae50b868c86a2435f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 14:19:39 +1100 Subject: md: writing to sync_action should clear the read-auto state. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In some cases array are started in 'read-auto' state where in nothing gets written to any device until the array is written to. The purpose of this is to make accidental auto-assembly of the wrong arrays less of a risk, and to allow arrays to be started to read suspend-to-disk images without actually changing anything (as might happen if the array were dirty and a resync seemed necessary). Explicitly writing the 'sync_action' for a read-auto array currently doesn't clear the read-auto state, so the sync action doesn't happen, which can be confusing. So allow any successful write to sync_action to clear any read-auto state. Reported-by: Alexander Kühn Signed-off-by: NeilBrown --- drivers/md/md.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 7564c44b8045..200d0862335f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4259,6 +4259,13 @@ action_store(struct mddev *mddev, const char *page, size_t len) set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); set_bit(MD_RECOVERY_SYNC, &mddev->recovery); } + if (mddev->ro == 2) { + /* A write to sync_action is enough to justify + * canceling read-auto mode + */ + mddev->ro = 0; + md_wakeup_thread(mddev->sync_thread); + } set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_action); -- cgit v1.2.3 From db07d85ef6395d4768c6cdceeb2ea4878bd2a3f4 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 14:22:17 +1100 Subject: md: make sure manual changes to recovery checkpoint are saved. If you make an array bigger but suppress resync of the new region with mdadm --grow /dev/mdX --size=max --assume-clean then stop the array before anything is written to it, the effect of the "--assume-clean" is lost and the array will resync the new space when restarted. So ensure that we update the metadata in the case. Reported-by: Sebastian Riemer Signed-off-by: NeilBrown --- drivers/md/md.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 200d0862335f..8d93f867adbf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3782,6 +3782,8 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) return -EINVAL; mddev->recovery_cp = n; + if (mddev->pers) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); return len; } static struct md_sysfs_entry md_resync_start = -- cgit v1.2.3 From 72f36d5972a166197036c1281963f6863c429bf2 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 14:25:57 +1100 Subject: md: refine reporting of resync/reshape delays. If 'resync_max' is set to 0 (as is often done when starting a reshape, so the mdadm can remain in control during a sensitive period), and if the reshape request is initially delayed because another array using the same array is resyncing or reshaping etc, when user-space cannot easily tell when the delay changes from being due to a conflicting reshape, to being due to resync_max = 0. So introduce a new state: (curr_resync == 3) to reflect this, make sure it is visible both via /proc/mdstat and via the "sync_completed" sysfs attribute, and ensure that the event transition from one delay state to the other is properly notified. Signed-off-by: NeilBrown --- drivers/md/md.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8d93f867adbf..3c6eaab0b6ce 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4400,6 +4400,10 @@ sync_completed_show(struct mddev *mddev, char *page) if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return sprintf(page, "none\n"); + if (mddev->curr_resync == 1 || + mddev->curr_resync == 2) + return sprintf(page, "delayed\n"); + if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sectors = mddev->resync_max_sectors; @@ -6807,7 +6811,11 @@ static void status_resync(struct seq_file *seq, struct mddev * mddev) int scale; unsigned int per_milli; - resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); + if (mddev->curr_resync <= 3) + resync = 0; + else + resync = mddev->curr_resync + - atomic_read(&mddev->recovery_active); if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) @@ -7033,7 +7041,7 @@ static int md_seq_show(struct seq_file *seq, void *v) if (mddev->curr_resync > 2) { status_resync(seq, mddev); seq_printf(seq, "\n "); - } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) + } else if (mddev->curr_resync >= 1) seq_printf(seq, "\tresync=DELAYED\n "); else if (mddev->recovery_cp < MaxSector) seq_printf(seq, "\tresync=PENDING\n "); @@ -7423,8 +7431,11 @@ void md_do_sync(struct md_thread *thread) "md: resuming %s of %s from checkpoint.\n", desc, mdname(mddev)); mddev->curr_resync = j; - } + } else + mddev->curr_resync = 3; /* no longer delayed */ mddev->curr_resync_completed = j; + sysfs_notify(&mddev->kobj, NULL, "sync_completed"); + md_new_event(mddev); blk_start_plug(&plug); while (j < max_sectors) { @@ -7477,7 +7488,8 @@ void md_do_sync(struct md_thread *thread) break; j += sectors; - if (j>1) mddev->curr_resync = j; + if (j > 2) + mddev->curr_resync = j; mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be @@ -7599,8 +7611,6 @@ static int remove_and_add_spares(struct mddev *mddev) int spares = 0; int removed = 0; - mddev->curr_resync_completed = 0; - rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Blocked, &rdev->flags) && @@ -7791,6 +7801,7 @@ void md_check_recovery(struct mddev *mddev) /* Set RUNNING before clearing NEEDED to avoid * any transients in the value of "sync_action". */ + mddev->curr_resync_completed = 0; set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); /* Clear some bits that don't mean anything, but * might be left set @@ -7804,7 +7815,7 @@ void md_check_recovery(struct mddev *mddev) /* no recovery is running. * remove any failed drives, then * add spares if possible. - * Spare are also removed and re-added, to allow + * Spares are also removed and re-added, to allow * the personality to fail the re-add. */ -- cgit v1.2.3