diff options
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 359 |
1 files changed, 248 insertions, 111 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index c702de18207a..a71b36f0acb0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1735,6 +1735,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) } } + if (mddev_is_clustered(mddev)) + sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED); + if (rdev->badblocks.count == 0) /* Nothing to do for bad blocks*/ ; else if (sb->bblog_offset == 0) @@ -2196,18 +2199,72 @@ static void sync_sbs(struct mddev *mddev, int nospares) } } +static bool does_sb_need_changing(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct mdp_superblock_1 *sb; + int role; + + /* Find a good rdev */ + rdev_for_each(rdev, mddev) + if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) + break; + + /* No good device found. */ + if (!rdev) + return false; + + sb = page_address(rdev->sb_page); + /* Check if a device has become faulty or a spare become active */ + rdev_for_each(rdev, mddev) { + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + /* Device activated? */ + if (role == 0xffff && rdev->raid_disk >=0 && + !test_bit(Faulty, &rdev->flags)) + return true; + /* Device turned faulty? */ + if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) + return true; + } + + /* Check if any mddev parameters have changed */ + if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || + (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || + (mddev->recovery_cp != le64_to_cpu(sb->resync_offset)) || + (mddev->layout != le64_to_cpu(sb->layout)) || + (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || + (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) + return true; + + return false; +} + void md_update_sb(struct mddev *mddev, int force_change) { struct md_rdev *rdev; int sync_req; int nospares = 0; int any_badblocks_changed = 0; + int ret = -1; if (mddev->ro) { if (force_change) set_bit(MD_CHANGE_DEVS, &mddev->flags); return; } + + if (mddev_is_clustered(mddev)) { + if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) + force_change = 1; + ret = md_cluster_ops->metadata_update_start(mddev); + /* Has someone else has updated the sb */ + if (!does_sb_need_changing(mddev)) { + if (ret == 0) + md_cluster_ops->metadata_update_cancel(mddev); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + return; + } + } repeat: /* First make sure individual recovery_offsets are correct */ rdev_for_each(rdev, mddev) { @@ -2356,6 +2413,9 @@ repeat: clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); } + + if (mddev_is_clustered(mddev) && ret == 0) + md_cluster_ops->metadata_update_finish(mddev); } EXPORT_SYMBOL(md_update_sb); @@ -2490,17 +2550,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) err = -EBUSY; else { struct mddev *mddev = rdev->mddev; - if (mddev_is_clustered(mddev)) - md_cluster_ops->remove_disk(mddev, rdev); - md_kick_rdev_from_array(rdev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); - if (mddev->pers) - md_update_sb(mddev, 1); - md_new_event(mddev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); err = 0; + if (mddev_is_clustered(mddev)) + err = md_cluster_ops->remove_disk(mddev, rdev); + + if (err == 0) { + md_kick_rdev_from_array(rdev); + if (mddev->pers) + md_update_sb(mddev, 1); + md_new_event(mddev); + } } } else if (cmd_match(buf, "writemostly")) { set_bit(WriteMostly, &rdev->flags); @@ -2688,15 +2747,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) rdev->saved_raid_disk = -1; clear_bit(In_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags); - err = rdev->mddev->pers-> - hot_add_disk(rdev->mddev, rdev); - if (err) { - rdev->raid_disk = -1; - return err; - } else - sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) - /* failure here is OK */; + remove_and_add_spares(rdev->mddev, rdev); + if (rdev->raid_disk == -1) + return -EBUSY; /* don't wakeup anyone, leave that to userspace. */ } else { if (slot >= rdev->mddev->raid_disks && @@ -3198,14 +3251,6 @@ static void analyze_sbs(struct mddev *mddev) md_kick_rdev_from_array(rdev); continue; } - /* No device should have a Candidate flag - * when reading devices - */ - if (test_bit(Candidate, &rdev->flags)) { - pr_info("md: kicking Cluster Candidate %s from array!\n", - bdevname(rdev->bdev, b)); - md_kick_rdev_from_array(rdev); - } } if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; @@ -4066,12 +4111,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; if (mddev->pers) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); err = update_size(mddev, sectors); md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); } else { if (mddev->dev_sectors == 0 || mddev->dev_sectors > sectors) @@ -5309,8 +5350,6 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); flush_workqueue(md_misc_wq); if (mddev->sync_thread) { @@ -5329,8 +5368,6 @@ static void __md_stop_writes(struct mddev *mddev) mddev->in_sync = 1; md_update_sb(mddev, 1); } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); } void md_stop_writes(struct mddev *mddev) @@ -5910,19 +5947,12 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) * check whether the device shows up in other nodes */ if (mddev_is_clustered(mddev)) { - if (info->state & (1 << MD_DISK_CANDIDATE)) { - /* Through --cluster-confirm */ + if (info->state & (1 << MD_DISK_CANDIDATE)) set_bit(Candidate, &rdev->flags); - err = md_cluster_ops->new_disk_ack(mddev, true); - if (err) { - export_rdev(rdev); - return err; - } - } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { + else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) { /* --add initiated by this node */ - err = md_cluster_ops->add_new_disk_start(mddev, rdev); + err = md_cluster_ops->add_new_disk(mddev, rdev); if (err) { - md_cluster_ops->add_new_disk_finish(mddev); export_rdev(rdev); return err; } @@ -5931,13 +5961,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) rdev->raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); + if (err) export_rdev(rdev); - else + + if (mddev_is_clustered(mddev)) { + if (info->state & (1 << MD_DISK_CANDIDATE)) + md_cluster_ops->new_disk_ack(mddev, (err == 0)); + else { + if (err) + md_cluster_ops->add_new_disk_cancel(mddev); + else + err = add_bound_rdev(rdev); + } + + } else if (!err) err = add_bound_rdev(rdev); - if (mddev_is_clustered(mddev) && - (info->state & (1 << MD_DISK_CLUSTER_ADD))) - md_cluster_ops->add_new_disk_finish(mddev); + return err; } @@ -5993,13 +6033,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) { char b[BDEVNAME_SIZE]; struct md_rdev *rdev; + int ret = -1; rdev = find_rdev(mddev, dev); if (!rdev) return -ENXIO; if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); + ret = md_cluster_ops->metadata_update_start(mddev); + + if (rdev->raid_disk < 0) + goto kick_rdev; clear_bit(Blocked, &rdev->flags); remove_and_add_spares(mddev, rdev); @@ -6007,20 +6051,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) if (rdev->raid_disk >= 0) goto busy; - if (mddev_is_clustered(mddev)) +kick_rdev: + if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->remove_disk(mddev, rdev); md_kick_rdev_from_array(rdev); md_update_sb(mddev, 1); md_new_event(mddev); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); - return 0; busy: - if (mddev_is_clustered(mddev)) + if (mddev_is_clustered(mddev) && ret == 0) md_cluster_ops->metadata_update_cancel(mddev); + printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", bdevname(rdev->bdev,b), mdname(mddev)); return -EBUSY; @@ -6071,14 +6114,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) goto abort_export; } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); clear_bit(In_sync, &rdev->flags); rdev->desc_nr = -1; rdev->saved_raid_disk = -1; err = bind_rdev_to_array(rdev, mddev); if (err) - goto abort_clustered; + goto abort_export; /* * The rest should better be atomic, we can have disk failures @@ -6088,9 +6129,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) rdev->raid_disk = -1; md_update_sb(mddev, 1); - - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); /* * Kick recovery, maybe this spare has to be added to the * array immediately. @@ -6100,9 +6138,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) md_new_event(mddev); return 0; -abort_clustered: - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_cancel(mddev); abort_export: export_rdev(rdev); return err; @@ -6420,8 +6455,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) return rv; } } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) rv = update_size(mddev, (sector_t)info->size * 2); @@ -6479,12 +6512,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) } } md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); return rv; err: - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_cancel(mddev); return rv; } @@ -7597,11 +7626,7 @@ int md_allow_write(struct mddev *mddev) mddev->safemode == 0) mddev->safemode = 1; spin_unlock(&mddev->lock); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); md_update_sb(mddev, 0); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); sysfs_notify_dirent_safe(mddev->sysfs_state); } else spin_unlock(&mddev->lock); @@ -7633,6 +7658,7 @@ void md_do_sync(struct md_thread *thread) struct md_rdev *rdev; char *desc, *action = NULL; struct blk_plug plug; + bool cluster_resync_finished = false; /* just incase thread restarts... */ if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) @@ -7802,9 +7828,6 @@ void md_do_sync(struct md_thread *thread) md_new_event(mddev); update_time = jiffies; - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_start(mddev, j, max_sectors); - blk_start_plug(&plug); while (j < max_sectors) { sector_t sectors; @@ -7868,8 +7891,6 @@ void md_do_sync(struct md_thread *thread) j = max_sectors; if (j > 2) mddev->curr_resync = j; - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_info_update(mddev, j, max_sectors); mddev->curr_mark_cnt = io_sectors; if (last_check == 0) /* this is the earliest that rebuild will be @@ -7940,7 +7961,11 @@ void md_do_sync(struct md_thread *thread) mddev->curr_resync_completed = mddev->curr_resync; sysfs_notify(&mddev->kobj, NULL, "sync_completed"); } - /* tell personality that we are finished */ + /* tell personality and other nodes that we are finished */ + if (mddev_is_clustered(mddev)) { + md_cluster_ops->resync_finish(mddev); + cluster_resync_finished = true; + } mddev->pers->sync_request(mddev, max_sectors, &skipped); if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && @@ -7976,11 +8001,13 @@ void md_do_sync(struct md_thread *thread) } } skip: - if (mddev_is_clustered(mddev)) - md_cluster_ops->resync_finish(mddev); - set_bit(MD_CHANGE_DEVS, &mddev->flags); + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_INTR, &mddev->recovery) && + !cluster_resync_finished) + md_cluster_ops->resync_finish(mddev); + spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ @@ -8023,10 +8050,14 @@ static int remove_and_add_spares(struct mddev *mddev, if (removed && mddev->kobj.sd) sysfs_notify(&mddev->kobj, NULL, "degraded"); - if (this) + if (this && removed) goto no_add; rdev_for_each(rdev, mddev) { + if (this && this != rdev) + continue; + if (test_bit(Candidate, &rdev->flags)) + continue; if (rdev->raid_disk >= 0 && !test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags)) @@ -8060,14 +8091,25 @@ no_add: static void md_start_sync(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); + int ret = 0; + + if (mddev_is_clustered(mddev)) { + ret = md_cluster_ops->resync_start(mddev); + if (ret) { + mddev->sync_thread = NULL; + goto out; + } + } mddev->sync_thread = md_register_thread(md_do_sync, mddev, "resync"); +out: if (!mddev->sync_thread) { - printk(KERN_ERR "%s: could not start resync" - " thread...\n", - mdname(mddev)); + if (!(mddev_is_clustered(mddev) && ret == -EAGAIN)) + printk(KERN_ERR "%s: could not start resync" + " thread...\n", + mdname(mddev)); /* leave the spares where they are, it shouldn't hurt */ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -8186,13 +8228,8 @@ void md_check_recovery(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); } - if (mddev->flags & MD_UPDATE_SB_FLAGS) { - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); + if (mddev->flags & MD_UPDATE_SB_FLAGS) md_update_sb(mddev, 0); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); - } if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { @@ -8290,8 +8327,6 @@ void md_reap_sync_thread(struct mddev *mddev) set_bit(MD_CHANGE_DEVS, &mddev->flags); } } - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_start(mddev); if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && mddev->pers->finish_reshape) mddev->pers->finish_reshape(mddev); @@ -8304,8 +8339,6 @@ void md_reap_sync_thread(struct mddev *mddev) rdev->saved_raid_disk = -1; md_update_sb(mddev, 1); - if (mddev_is_clustered(mddev)) - md_cluster_ops->metadata_update_finish(mddev); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -8928,25 +8961,129 @@ err_wq: return ret; } -void md_reload_sb(struct mddev *mddev) +static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev) { - struct md_rdev *rdev, *tmp; + struct mdp_superblock_1 *sb = page_address(rdev->sb_page); + struct md_rdev *rdev2; + int role, ret; + char b[BDEVNAME_SIZE]; - rdev_for_each_safe(rdev, tmp, mddev) { - rdev->sb_loaded = 0; - ClearPageUptodate(rdev->sb_page); + /* Check for change of roles in the active devices */ + rdev_for_each(rdev2, mddev) { + if (test_bit(Faulty, &rdev2->flags)) + continue; + + /* Check if the roles changed */ + role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]); + + if (test_bit(Candidate, &rdev2->flags)) { + if (role == 0xfffe) { + pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b)); + md_kick_rdev_from_array(rdev2); + continue; + } + else + clear_bit(Candidate, &rdev2->flags); + } + + if (role != rdev2->raid_disk) { + /* got activated */ + if (rdev2->raid_disk == -1 && role != 0xffff) { + rdev2->saved_raid_disk = role; + ret = remove_and_add_spares(mddev, rdev2); + pr_info("Activated spare: %s\n", + bdevname(rdev2->bdev,b)); + continue; + } + /* device faulty + * We just want to do the minimum to mark the disk + * as faulty. The recovery is performed by the + * one who initiated the error. + */ + if ((role == 0xfffe) || (role == 0xfffd)) { + md_error(mddev, rdev2); + clear_bit(Blocked, &rdev2->flags); + } + } } - mddev->raid_disks = 0; - analyze_sbs(mddev); - rdev_for_each_safe(rdev, tmp, mddev) { - struct mdp_superblock_1 *sb = page_address(rdev->sb_page); - /* since we don't write to faulty devices, we figure out if the - * disk is faulty by comparing events - */ - if (mddev->events > sb->events) - set_bit(Faulty, &rdev->flags); + + /* recovery_cp changed */ + if (le64_to_cpu(sb->resync_offset) != mddev->recovery_cp) + mddev->recovery_cp = le64_to_cpu(sb->resync_offset); + + /* Finally set the event to be up to date */ + mddev->events = le64_to_cpu(sb->events); +} + +static int read_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + int err; + struct page *swapout = rdev->sb_page; + struct mdp_superblock_1 *sb; + + /* Store the sb page of the rdev in the swapout temporary + * variable in case we err in the future + */ + rdev->sb_page = NULL; + alloc_disk_sb(rdev); + ClearPageUptodate(rdev->sb_page); + rdev->sb_loaded = 0; + err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version); + + if (err < 0) { + pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n", + __func__, __LINE__, rdev->desc_nr, err); + put_page(rdev->sb_page); + rdev->sb_page = swapout; + rdev->sb_loaded = 1; + return err; + } + + sb = page_address(rdev->sb_page); + /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET + * is not set + */ + + if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)) + rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); + + /* The other node finished recovery, call spare_active to set + * device In_sync and mddev->degraded + */ + if (rdev->recovery_offset == MaxSector && + !test_bit(In_sync, &rdev->flags) && + mddev->pers->spare_active(mddev)) + sysfs_notify(&mddev->kobj, NULL, "degraded"); + + put_page(swapout); + return 0; +} + +void md_reload_sb(struct mddev *mddev, int nr) +{ + struct md_rdev *rdev; + int err; + + /* Find the rdev */ + rdev_for_each_rcu(rdev, mddev) { + if (rdev->desc_nr == nr) + break; + } + + if (!rdev || rdev->desc_nr != nr) { + pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr); + return; } + err = read_rdev(mddev, rdev); + if (err < 0) + return; + + check_sb_changes(mddev, rdev); + + /* Read all rdev's to update recovery_offset */ + rdev_for_each_rcu(rdev, mddev) + read_rdev(mddev, rdev); } EXPORT_SYMBOL(md_reload_sb); |