From f67055780caac6a99f43834795c43acf99eba6a6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 27 Mar 2006 01:18:11 -0800 Subject: [PATCH] md: Checkpoint and allow restart of raid5 reshape We allow the superblock to record an 'old' and a 'new' geometry, and a position where any conversion is up to. The geometry allows for changing chunksize, layout and level as well as number of devices. When using verion-0.90 superblock, we convert the version to 0.91 while the conversion is happening so that an old kernel will refuse the assemble the array. For version-1, we use a feature bit for the same effect. When starting an array we check for an incomplete reshape and restart the reshape process if needed. If the reshape stopped at an awkward time (like when updating the first stripe) we refuse to assemble the array, and let user-space worry about it. Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 140 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 120 insertions(+), 20 deletions(-) (limited to 'drivers/md/raid5.c') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index b29135acb1d9..20ae32d67e21 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -1504,6 +1505,7 @@ static void handle_stripe(struct stripe_head *sh) clear_bit(STRIPE_EXPANDING, &sh->state); } else if (expanded) { clear_bit(STRIPE_EXPAND_READY, &sh->state); + atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); md_done_sync(conf->mddev, STRIPE_SECTORS, 1); } @@ -1875,6 +1877,26 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i */ int i; int dd_idx; + + if (sector_nr == 0 && + conf->expand_progress != 0) { + /* restarting in the middle, skip the initial sectors */ + sector_nr = conf->expand_progress; + sector_div(sector_nr, conf->raid_disks-1); + *skipped = 1; + return sector_nr; + } + + /* Cannot proceed until we've updated the superblock... */ + wait_event(conf->wait_for_overlap, + atomic_read(&conf->reshape_stripes)==0); + mddev->reshape_position = conf->expand_progress; + + mddev->sb_dirty = 1; + md_wakeup_thread(mddev->thread); + wait_event(mddev->sb_wait, mddev->sb_dirty == 0 || + kthread_should_stop()); + for (i=0; i < conf->chunk_size/512; i+= STRIPE_SECTORS) { int j; int skipped = 0; @@ -1882,6 +1904,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i sh = get_active_stripe(conf, sector_nr+i, conf->raid_disks, pd_idx, 0); set_bit(STRIPE_EXPANDING, &sh->state); + atomic_inc(&conf->reshape_stripes); /* If any of this stripe is beyond the end of the old * array, then we need to zero those blocks */ @@ -2121,10 +2144,61 @@ static int run(mddev_t *mddev) return -EIO; } + if (mddev->reshape_position != MaxSector) { + /* Check that we can continue the reshape. + * Currently only disks can change, it must + * increase, and we must be past the point where + * a stripe over-writes itself + */ + sector_t here_new, here_old; + int old_disks; + + if (mddev->new_level != mddev->level || + mddev->new_layout != mddev->layout || + mddev->new_chunk != mddev->chunk_size) { + printk(KERN_ERR "raid5: %s: unsupported reshape required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->delta_disks <= 0) { + printk(KERN_ERR "raid5: %s: unsupported reshape (reduce disks) required - aborting.\n", + mdname(mddev)); + return -EINVAL; + } + old_disks = mddev->raid_disks - mddev->delta_disks; + /* reshape_position must be on a new-stripe boundary, and one + * further up in new geometry must map after here in old geometry. + */ + here_new = mddev->reshape_position; + if (sector_div(here_new, (mddev->chunk_size>>9)*(mddev->raid_disks-1))) { + printk(KERN_ERR "raid5: reshape_position not on a stripe boundary\n"); + return -EINVAL; + } + /* here_new is the stripe we will write to */ + here_old = mddev->reshape_position; + sector_div(here_old, (mddev->chunk_size>>9)*(old_disks-1)); + /* here_old is the first stripe that we might need to read from */ + if (here_new >= here_old) { + /* Reading from the same stripe as writing to - bad */ + printk(KERN_ERR "raid5: reshape_position too early for auto-recovery - aborting.\n"); + return -EINVAL; + } + printk(KERN_INFO "raid5: reshape will continue\n"); + /* OK, we should be able to continue; */ + } + + mddev->private = kzalloc(sizeof (raid5_conf_t), GFP_KERNEL); if ((conf = mddev->private) == NULL) goto abort; - conf->disks = kzalloc(mddev->raid_disks * sizeof(struct disk_info), + if (mddev->reshape_position == MaxSector) { + conf->previous_raid_disks = conf->raid_disks = mddev->raid_disks; + } else { + conf->raid_disks = mddev->raid_disks; + conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; + } + + conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), GFP_KERNEL); if (!conf->disks) goto abort; @@ -2148,7 +2222,7 @@ static int run(mddev_t *mddev) ITERATE_RDEV(mddev,rdev,tmp) { raid_disk = rdev->raid_disk; - if (raid_disk >= mddev->raid_disks + if (raid_disk >= conf->raid_disks || raid_disk < 0) continue; disk = conf->disks + raid_disk; @@ -2164,7 +2238,6 @@ static int run(mddev_t *mddev) } } - conf->raid_disks = mddev->raid_disks; /* * 0 for a fully functional array, 1 for a degraded array. */ @@ -2174,7 +2247,7 @@ static int run(mddev_t *mddev) conf->level = mddev->level; conf->algorithm = mddev->layout; conf->max_nr_stripes = NR_STRIPES; - conf->expand_progress = MaxSector; + conf->expand_progress = mddev->reshape_position; /* device size must be a multiple of chunk size */ mddev->size &= ~(mddev->chunk_size/1024 -1); @@ -2247,6 +2320,20 @@ static int run(mddev_t *mddev) print_raid5_conf(conf); + if (conf->expand_progress != MaxSector) { + printk("...ok start reshape thread\n"); + atomic_set(&conf->reshape_stripes, 0); + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); + clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); + set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); + set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + mddev->sync_thread = md_register_thread(md_do_sync, mddev, + "%s_reshape"); + /* FIXME if md_register_thread fails?? */ + md_wakeup_thread(mddev->sync_thread); + + } + /* read-ahead size must cover two whole stripes, which is * 2 * (n-1) * chunksize where 'n' is the number of raid devices */ @@ -2262,8 +2349,8 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid5_unplug_device; mddev->queue->issue_flush_fn = raid5_issue_flush; + mddev->array_size = mddev->size * (conf->previous_raid_disks - 1); - mddev->array_size = mddev->size * (mddev->raid_disks - 1); return 0; abort: if (conf) { @@ -2436,7 +2523,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) /* * find the disk ... */ - for (disk=0; disk < mddev->raid_disks; disk++) + for (disk=0; disk < conf->raid_disks; disk++) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; @@ -2518,9 +2605,10 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) if (err) return err; + atomic_set(&conf->reshape_stripes, 0); spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; - mddev->raid_disks = conf->raid_disks = raid_disks; + conf->raid_disks = raid_disks; conf->expand_progress = 0; spin_unlock_irq(&conf->device_lock); @@ -2542,6 +2630,14 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) } mddev->degraded = (raid_disks - conf->previous_raid_disks) - added_devices; + mddev->new_chunk = mddev->chunk_size; + mddev->new_layout = mddev->layout; + mddev->new_level = mddev->level; + mddev->raid_disks = raid_disks; + mddev->delta_disks = raid_disks - conf->previous_raid_disks; + mddev->reshape_position = 0; + mddev->sb_dirty = 1; + clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); @@ -2552,6 +2648,7 @@ static int raid5_reshape(mddev_t *mddev, int raid_disks) mddev->recovery = 0; spin_lock_irq(&conf->device_lock); mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; + mddev->delta_disks = 0; conf->expand_progress = MaxSector; spin_unlock_irq(&conf->device_lock); return -EAGAIN; @@ -2566,20 +2663,23 @@ static void end_reshape(raid5_conf_t *conf) { struct block_device *bdev; - conf->mddev->array_size = conf->mddev->size * (conf->mddev->raid_disks-1); - set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); - conf->mddev->changed = 1; - - bdev = bdget_disk(conf->mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); + if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + conf->mddev->array_size = conf->mddev->size * (conf->raid_disks-1); + set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); + conf->mddev->changed = 1; + + bdev = bdget_disk(conf->mddev->gendisk, 0); + if (bdev) { + mutex_lock(&bdev->bd_inode->i_mutex); + i_size_write(bdev->bd_inode, conf->mddev->array_size << 10); + mutex_unlock(&bdev->bd_inode->i_mutex); + bdput(bdev); + } + spin_lock_irq(&conf->device_lock); + conf->expand_progress = MaxSector; + spin_unlock_irq(&conf->device_lock); + conf->mddev->reshape_position = MaxSector; } - spin_lock_irq(&conf->device_lock); - conf->expand_progress = MaxSector; - spin_unlock_irq(&conf->device_lock); } static void raid5_quiesce(mddev_t *mddev, int state) -- cgit v1.2.3