summaryrefslogtreecommitdiff
path: root/drivers/md/md.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--drivers/md/md.c359
1 files changed, 248 insertions, 111 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c702de18207a..a71b36f0acb0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1735,6 +1735,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
}
}
+ if (mddev_is_clustered(mddev))
+ sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
+
if (rdev->badblocks.count == 0)
/* Nothing to do for bad blocks*/ ;
else if (sb->bblog_offset == 0)
@@ -2196,18 +2199,72 @@ static void sync_sbs(struct mddev *mddev, int nospares)
}
}
+static bool does_sb_need_changing(struct mddev *mddev)
+{
+ struct md_rdev *rdev;
+ struct mdp_superblock_1 *sb;
+ int role;
+
+ /* Find a good rdev */
+ rdev_for_each(rdev, mddev)
+ if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
+ break;
+
+ /* No good device found. */
+ if (!rdev)
+ return false;
+
+ sb = page_address(rdev->sb_page);
+ /* Check if a device has become faulty or a spare become active */
+ rdev_for_each(rdev, mddev) {
+ role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
+ /* Device activated? */
+ if (role == 0xffff && rdev->raid_disk >=0 &&
+ !test_bit(Faulty, &rdev->flags))
+ return true;
+ /* Device turned faulty? */
+ if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
+ return true;
+ }
+
+ /* Check if any mddev parameters have changed */
+ if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
+ (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
+ (mddev->recovery_cp != le64_to_cpu(sb->resync_offset)) ||
+ (mddev->layout != le64_to_cpu(sb->layout)) ||
+ (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
+ (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
+ return true;
+
+ return false;
+}
+
void md_update_sb(struct mddev *mddev, int force_change)
{
struct md_rdev *rdev;
int sync_req;
int nospares = 0;
int any_badblocks_changed = 0;
+ int ret = -1;
if (mddev->ro) {
if (force_change)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
return;
}
+
+ if (mddev_is_clustered(mddev)) {
+ if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
+ force_change = 1;
+ ret = md_cluster_ops->metadata_update_start(mddev);
+ /* Has someone else has updated the sb */
+ if (!does_sb_need_changing(mddev)) {
+ if (ret == 0)
+ md_cluster_ops->metadata_update_cancel(mddev);
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
+ return;
+ }
+ }
repeat:
/* First make sure individual recovery_offsets are correct */
rdev_for_each(rdev, mddev) {
@@ -2356,6 +2413,9 @@ repeat:
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
}
+
+ if (mddev_is_clustered(mddev) && ret == 0)
+ md_cluster_ops->metadata_update_finish(mddev);
}
EXPORT_SYMBOL(md_update_sb);
@@ -2490,17 +2550,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
err = -EBUSY;
else {
struct mddev *mddev = rdev->mddev;
- if (mddev_is_clustered(mddev))
- md_cluster_ops->remove_disk(mddev, rdev);
- md_kick_rdev_from_array(rdev);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
- if (mddev->pers)
- md_update_sb(mddev, 1);
- md_new_event(mddev);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
err = 0;
+ if (mddev_is_clustered(mddev))
+ err = md_cluster_ops->remove_disk(mddev, rdev);
+
+ if (err == 0) {
+ md_kick_rdev_from_array(rdev);
+ if (mddev->pers)
+ md_update_sb(mddev, 1);
+ md_new_event(mddev);
+ }
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
@@ -2688,15 +2747,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
rdev->saved_raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
clear_bit(Bitmap_sync, &rdev->flags);
- err = rdev->mddev->pers->
- hot_add_disk(rdev->mddev, rdev);
- if (err) {
- rdev->raid_disk = -1;
- return err;
- } else
- sysfs_notify_dirent_safe(rdev->sysfs_state);
- if (sysfs_link_rdev(rdev->mddev, rdev))
- /* failure here is OK */;
+ remove_and_add_spares(rdev->mddev, rdev);
+ if (rdev->raid_disk == -1)
+ return -EBUSY;
/* don't wakeup anyone, leave that to userspace. */
} else {
if (slot >= rdev->mddev->raid_disks &&
@@ -3198,14 +3251,6 @@ static void analyze_sbs(struct mddev *mddev)
md_kick_rdev_from_array(rdev);
continue;
}
- /* No device should have a Candidate flag
- * when reading devices
- */
- if (test_bit(Candidate, &rdev->flags)) {
- pr_info("md: kicking Cluster Candidate %s from array!\n",
- bdevname(rdev->bdev, b));
- md_kick_rdev_from_array(rdev);
- }
}
if (mddev->level == LEVEL_MULTIPATH) {
rdev->desc_nr = i++;
@@ -4066,12 +4111,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
if (err)
return err;
if (mddev->pers) {
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
err = update_size(mddev, sectors);
md_update_sb(mddev, 1);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
} else {
if (mddev->dev_sectors == 0 ||
mddev->dev_sectors > sectors)
@@ -5309,8 +5350,6 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
@@ -5329,8 +5368,6 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
}
void md_stop_writes(struct mddev *mddev)
@@ -5910,19 +5947,12 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
* check whether the device shows up in other nodes
*/
if (mddev_is_clustered(mddev)) {
- if (info->state & (1 << MD_DISK_CANDIDATE)) {
- /* Through --cluster-confirm */
+ if (info->state & (1 << MD_DISK_CANDIDATE))
set_bit(Candidate, &rdev->flags);
- err = md_cluster_ops->new_disk_ack(mddev, true);
- if (err) {
- export_rdev(rdev);
- return err;
- }
- } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
+ else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
/* --add initiated by this node */
- err = md_cluster_ops->add_new_disk_start(mddev, rdev);
+ err = md_cluster_ops->add_new_disk(mddev, rdev);
if (err) {
- md_cluster_ops->add_new_disk_finish(mddev);
export_rdev(rdev);
return err;
}
@@ -5931,13 +5961,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
+
if (err)
export_rdev(rdev);
- else
+
+ if (mddev_is_clustered(mddev)) {
+ if (info->state & (1 << MD_DISK_CANDIDATE))
+ md_cluster_ops->new_disk_ack(mddev, (err == 0));
+ else {
+ if (err)
+ md_cluster_ops->add_new_disk_cancel(mddev);
+ else
+ err = add_bound_rdev(rdev);
+ }
+
+ } else if (!err)
err = add_bound_rdev(rdev);
- if (mddev_is_clustered(mddev) &&
- (info->state & (1 << MD_DISK_CLUSTER_ADD)))
- md_cluster_ops->add_new_disk_finish(mddev);
+
return err;
}
@@ -5993,13 +6033,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
{
char b[BDEVNAME_SIZE];
struct md_rdev *rdev;
+ int ret = -1;
rdev = find_rdev(mddev, dev);
if (!rdev)
return -ENXIO;
if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
+ ret = md_cluster_ops->metadata_update_start(mddev);
+
+ if (rdev->raid_disk < 0)
+ goto kick_rdev;
clear_bit(Blocked, &rdev->flags);
remove_and_add_spares(mddev, rdev);
@@ -6007,20 +6051,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
if (rdev->raid_disk >= 0)
goto busy;
- if (mddev_is_clustered(mddev))
+kick_rdev:
+ if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->remove_disk(mddev, rdev);
md_kick_rdev_from_array(rdev);
md_update_sb(mddev, 1);
md_new_event(mddev);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
-
return 0;
busy:
- if (mddev_is_clustered(mddev))
+ if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->metadata_update_cancel(mddev);
+
printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
bdevname(rdev->bdev,b), mdname(mddev));
return -EBUSY;
@@ -6071,14 +6114,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
goto abort_export;
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
clear_bit(In_sync, &rdev->flags);
rdev->desc_nr = -1;
rdev->saved_raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
- goto abort_clustered;
+ goto abort_export;
/*
* The rest should better be atomic, we can have disk failures
@@ -6088,9 +6129,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
rdev->raid_disk = -1;
md_update_sb(mddev, 1);
-
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
/*
* Kick recovery, maybe this spare has to be added to the
* array immediately.
@@ -6100,9 +6138,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
md_new_event(mddev);
return 0;
-abort_clustered:
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_cancel(mddev);
abort_export:
export_rdev(rdev);
return err;
@@ -6420,8 +6455,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
return rv;
}
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
rv = update_size(mddev, (sector_t)info->size * 2);
@@ -6479,12 +6512,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
}
}
md_update_sb(mddev, 1);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
return rv;
err:
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_cancel(mddev);
return rv;
}
@@ -7597,11 +7626,7 @@ int md_allow_write(struct mddev *mddev)
mddev->safemode == 0)
mddev->safemode = 1;
spin_unlock(&mddev->lock);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
md_update_sb(mddev, 0);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
sysfs_notify_dirent_safe(mddev->sysfs_state);
} else
spin_unlock(&mddev->lock);
@@ -7633,6 +7658,7 @@ void md_do_sync(struct md_thread *thread)
struct md_rdev *rdev;
char *desc, *action = NULL;
struct blk_plug plug;
+ bool cluster_resync_finished = false;
/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
@@ -7802,9 +7828,6 @@ void md_do_sync(struct md_thread *thread)
md_new_event(mddev);
update_time = jiffies;
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_start(mddev, j, max_sectors);
-
blk_start_plug(&plug);
while (j < max_sectors) {
sector_t sectors;
@@ -7868,8 +7891,6 @@ void md_do_sync(struct md_thread *thread)
j = max_sectors;
if (j > 2)
mddev->curr_resync = j;
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_info_update(mddev, j, max_sectors);
mddev->curr_mark_cnt = io_sectors;
if (last_check == 0)
/* this is the earliest that rebuild will be
@@ -7940,7 +7961,11 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
- /* tell personality that we are finished */
+ /* tell personality and other nodes that we are finished */
+ if (mddev_is_clustered(mddev)) {
+ md_cluster_ops->resync_finish(mddev);
+ cluster_resync_finished = true;
+ }
mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
@@ -7976,11 +8001,13 @@ void md_do_sync(struct md_thread *thread)
}
}
skip:
- if (mddev_is_clustered(mddev))
- md_cluster_ops->resync_finish(mddev);
-
set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (mddev_is_clustered(mddev) &&
+ test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ !cluster_resync_finished)
+ md_cluster_ops->resync_finish(mddev);
+
spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
@@ -8023,10 +8050,14 @@ static int remove_and_add_spares(struct mddev *mddev,
if (removed && mddev->kobj.sd)
sysfs_notify(&mddev->kobj, NULL, "degraded");
- if (this)
+ if (this && removed)
goto no_add;
rdev_for_each(rdev, mddev) {
+ if (this && this != rdev)
+ continue;
+ if (test_bit(Candidate, &rdev->flags))
+ continue;
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
@@ -8060,14 +8091,25 @@ no_add:
static void md_start_sync(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
+ int ret = 0;
+
+ if (mddev_is_clustered(mddev)) {
+ ret = md_cluster_ops->resync_start(mddev);
+ if (ret) {
+ mddev->sync_thread = NULL;
+ goto out;
+ }
+ }
mddev->sync_thread = md_register_thread(md_do_sync,
mddev,
"resync");
+out:
if (!mddev->sync_thread) {
- printk(KERN_ERR "%s: could not start resync"
- " thread...\n",
- mdname(mddev));
+ if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
+ printk(KERN_ERR "%s: could not start resync"
+ " thread...\n",
+ mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
@@ -8186,13 +8228,8 @@ void md_check_recovery(struct mddev *mddev)
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
- if (mddev->flags & MD_UPDATE_SB_FLAGS) {
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
+ if (mddev->flags & MD_UPDATE_SB_FLAGS)
md_update_sb(mddev, 0);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
- }
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -8290,8 +8327,6 @@ void md_reap_sync_thread(struct mddev *mddev)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
}
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_start(mddev);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
mddev->pers->finish_reshape)
mddev->pers->finish_reshape(mddev);
@@ -8304,8 +8339,6 @@ void md_reap_sync_thread(struct mddev *mddev)
rdev->saved_raid_disk = -1;
md_update_sb(mddev, 1);
- if (mddev_is_clustered(mddev))
- md_cluster_ops->metadata_update_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -8928,25 +8961,129 @@ err_wq:
return ret;
}
-void md_reload_sb(struct mddev *mddev)
+static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
- struct md_rdev *rdev, *tmp;
+ struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
+ struct md_rdev *rdev2;
+ int role, ret;
+ char b[BDEVNAME_SIZE];
- rdev_for_each_safe(rdev, tmp, mddev) {
- rdev->sb_loaded = 0;
- ClearPageUptodate(rdev->sb_page);
+ /* Check for change of roles in the active devices */
+ rdev_for_each(rdev2, mddev) {
+ if (test_bit(Faulty, &rdev2->flags))
+ continue;
+
+ /* Check if the roles changed */
+ role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
+
+ if (test_bit(Candidate, &rdev2->flags)) {
+ if (role == 0xfffe) {
+ pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
+ md_kick_rdev_from_array(rdev2);
+ continue;
+ }
+ else
+ clear_bit(Candidate, &rdev2->flags);
+ }
+
+ if (role != rdev2->raid_disk) {
+ /* got activated */
+ if (rdev2->raid_disk == -1 && role != 0xffff) {
+ rdev2->saved_raid_disk = role;
+ ret = remove_and_add_spares(mddev, rdev2);
+ pr_info("Activated spare: %s\n",
+ bdevname(rdev2->bdev,b));
+ continue;
+ }
+ /* device faulty
+ * We just want to do the minimum to mark the disk
+ * as faulty. The recovery is performed by the
+ * one who initiated the error.
+ */
+ if ((role == 0xfffe) || (role == 0xfffd)) {
+ md_error(mddev, rdev2);
+ clear_bit(Blocked, &rdev2->flags);
+ }
+ }
}
- mddev->raid_disks = 0;
- analyze_sbs(mddev);
- rdev_for_each_safe(rdev, tmp, mddev) {
- struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
- /* since we don't write to faulty devices, we figure out if the
- * disk is faulty by comparing events
- */
- if (mddev->events > sb->events)
- set_bit(Faulty, &rdev->flags);
+
+ /* recovery_cp changed */
+ if (le64_to_cpu(sb->resync_offset) != mddev->recovery_cp)
+ mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
+
+ /* Finally set the event to be up to date */
+ mddev->events = le64_to_cpu(sb->events);
+}
+
+static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
+{
+ int err;
+ struct page *swapout = rdev->sb_page;
+ struct mdp_superblock_1 *sb;
+
+ /* Store the sb page of the rdev in the swapout temporary
+ * variable in case we err in the future
+ */
+ rdev->sb_page = NULL;
+ alloc_disk_sb(rdev);
+ ClearPageUptodate(rdev->sb_page);
+ rdev->sb_loaded = 0;
+ err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version);
+
+ if (err < 0) {
+ pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
+ __func__, __LINE__, rdev->desc_nr, err);
+ put_page(rdev->sb_page);
+ rdev->sb_page = swapout;
+ rdev->sb_loaded = 1;
+ return err;
+ }
+
+ sb = page_address(rdev->sb_page);
+ /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
+ * is not set
+ */
+
+ if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
+ rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
+
+ /* The other node finished recovery, call spare_active to set
+ * device In_sync and mddev->degraded
+ */
+ if (rdev->recovery_offset == MaxSector &&
+ !test_bit(In_sync, &rdev->flags) &&
+ mddev->pers->spare_active(mddev))
+ sysfs_notify(&mddev->kobj, NULL, "degraded");
+
+ put_page(swapout);
+ return 0;
+}
+
+void md_reload_sb(struct mddev *mddev, int nr)
+{
+ struct md_rdev *rdev;
+ int err;
+
+ /* Find the rdev */
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev->desc_nr == nr)
+ break;
+ }
+
+ if (!rdev || rdev->desc_nr != nr) {
+ pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
+ return;
}
+ err = read_rdev(mddev, rdev);
+ if (err < 0)
+ return;
+
+ check_sb_changes(mddev, rdev);
+
+ /* Read all rdev's to update recovery_offset */
+ rdev_for_each_rcu(rdev, mddev)
+ read_rdev(mddev, rdev);
}
EXPORT_SYMBOL(md_reload_sb);