diff options
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/Kconfig | 1 | ||||
-rw-r--r-- | drivers/md/bitmap.c | 28 | ||||
-rw-r--r-- | drivers/md/dm-cache-metadata.c | 104 | ||||
-rw-r--r-- | drivers/md/dm-cache-target.c | 89 | ||||
-rw-r--r-- | drivers/md/dm-raid.c | 8 | ||||
-rw-r--r-- | drivers/md/dm-thin.c | 6 | ||||
-rw-r--r-- | drivers/md/dm.c | 9 | ||||
-rw-r--r-- | drivers/md/faulty.c | 8 | ||||
-rw-r--r-- | drivers/md/linear.c | 67 | ||||
-rw-r--r-- | drivers/md/md.c | 816 | ||||
-rw-r--r-- | drivers/md/md.h | 57 | ||||
-rw-r--r-- | drivers/md/multipath.c | 22 | ||||
-rw-r--r-- | drivers/md/raid0.c | 29 | ||||
-rw-r--r-- | drivers/md/raid1.c | 52 | ||||
-rw-r--r-- | drivers/md/raid1.h | 3 | ||||
-rw-r--r-- | drivers/md/raid10.c | 49 | ||||
-rw-r--r-- | drivers/md/raid10.h | 3 | ||||
-rw-r--r-- | drivers/md/raid5.c | 339 | ||||
-rw-r--r-- | drivers/md/raid5.h | 1 |
19 files changed, 1037 insertions, 654 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 5bdedf6df153..c355a226a024 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -5,6 +5,7 @@ menuconfig MD bool "Multiple devices driver support (RAID and LVM)" depends on BLOCK + select SRCU help Support multiple physical spindles through a single logical device. Required for RAID and logical volume management. diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index da3604e73e8a..3a5767968ba0 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -72,6 +72,19 @@ __acquires(bitmap->lock) /* this page has not been allocated yet */ spin_unlock_irq(&bitmap->lock); + /* It is possible that this is being called inside a + * prepare_to_wait/finish_wait loop from raid5c:make_request(). + * In general it is not permitted to sleep in that context as it + * can cause the loop to spin freely. + * That doesn't apply here as we can only reach this point + * once with any loop. + * When this function completes, either bp[page].map or + * bp[page].hijacked. In either case, this function will + * abort before getting to this point again. So there is + * no risk of a free-spin, and so it is safe to assert + * that sleeping here is allowed. + */ + sched_annotate_sleep(); mappage = kzalloc(PAGE_SIZE, GFP_NOIO); spin_lock_irq(&bitmap->lock); @@ -1606,7 +1619,9 @@ void bitmap_destroy(struct mddev *mddev) return; mutex_lock(&mddev->bitmap_info.mutex); + spin_lock(&mddev->lock); mddev->bitmap = NULL; /* disconnect from the md device */ + spin_unlock(&mddev->lock); mutex_unlock(&mddev->bitmap_info.mutex); if (mddev->thread) mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; @@ -2196,11 +2211,13 @@ __ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t can_clear_show(struct mddev *mddev, char *page) { int len; + spin_lock(&mddev->lock); if (mddev->bitmap) len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? "false" : "true")); else len = sprintf(page, "\n"); + spin_unlock(&mddev->lock); return len; } @@ -2225,10 +2242,15 @@ __ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); static ssize_t behind_writes_used_show(struct mddev *mddev, char *page) { + ssize_t ret; + spin_lock(&mddev->lock); if (mddev->bitmap == NULL) - return sprintf(page, "0\n"); - return sprintf(page, "%lu\n", - mddev->bitmap->behind_writes_used); + ret = sprintf(page, "0\n"); + else + ret = sprintf(page, "%lu\n", + mddev->bitmap->behind_writes_used); + spin_unlock(&mddev->lock); + return ret; } static ssize_t diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 9fc616c2755e..c1c010498a21 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -94,6 +94,9 @@ struct cache_disk_superblock { } __packed; struct dm_cache_metadata { + atomic_t ref_count; + struct list_head list; + struct block_device *bdev; struct dm_block_manager *bm; struct dm_space_map *metadata_sm; @@ -669,10 +672,10 @@ static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags) /*----------------------------------------------------------------*/ -struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, - sector_t data_block_size, - bool may_format_device, - size_t policy_hint_size) +static struct dm_cache_metadata *metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size) { int r; struct dm_cache_metadata *cmd; @@ -680,9 +683,10 @@ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); if (!cmd) { DMERR("could not allocate metadata struct"); - return NULL; + return ERR_PTR(-ENOMEM); } + atomic_set(&cmd->ref_count, 1); init_rwsem(&cmd->root_lock); cmd->bdev = bdev; cmd->data_block_size = data_block_size; @@ -705,10 +709,96 @@ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, return cmd; } +/* + * We keep a little list of ref counted metadata objects to prevent two + * different target instances creating separate bufio instances. This is + * an issue if a table is reloaded before the suspend. + */ +static DEFINE_MUTEX(table_lock); +static LIST_HEAD(table); + +static struct dm_cache_metadata *lookup(struct block_device *bdev) +{ + struct dm_cache_metadata *cmd; + + list_for_each_entry(cmd, &table, list) + if (cmd->bdev == bdev) { + atomic_inc(&cmd->ref_count); + return cmd; + } + + return NULL; +} + +static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size) +{ + struct dm_cache_metadata *cmd, *cmd2; + + mutex_lock(&table_lock); + cmd = lookup(bdev); + mutex_unlock(&table_lock); + + if (cmd) + return cmd; + + cmd = metadata_open(bdev, data_block_size, may_format_device, policy_hint_size); + if (!IS_ERR(cmd)) { + mutex_lock(&table_lock); + cmd2 = lookup(bdev); + if (cmd2) { + mutex_unlock(&table_lock); + __destroy_persistent_data_objects(cmd); + kfree(cmd); + return cmd2; + } + list_add(&cmd->list, &table); + mutex_unlock(&table_lock); + } + + return cmd; +} + +static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size) +{ + if (cmd->data_block_size != data_block_size) { + DMERR("data_block_size (%llu) different from that in metadata (%llu)\n", + (unsigned long long) data_block_size, + (unsigned long long) cmd->data_block_size); + return false; + } + + return true; +} + +struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev, + sector_t data_block_size, + bool may_format_device, + size_t policy_hint_size) +{ + struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size, + may_format_device, policy_hint_size); + + if (!IS_ERR(cmd) && !same_params(cmd, data_block_size)) { + dm_cache_metadata_close(cmd); + return ERR_PTR(-EINVAL); + } + + return cmd; +} + void dm_cache_metadata_close(struct dm_cache_metadata *cmd) { - __destroy_persistent_data_objects(cmd); - kfree(cmd); + if (atomic_dec_and_test(&cmd->ref_count)) { + mutex_lock(&table_lock); + list_del(&cmd->list); + mutex_unlock(&table_lock); + + __destroy_persistent_data_objects(cmd); + kfree(cmd); + } } /* diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 1e96d7889f51..e1650539cc2f 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -221,7 +221,13 @@ struct cache { struct list_head need_commit_migrations; sector_t migration_threshold; wait_queue_head_t migration_wait; - atomic_t nr_migrations; + atomic_t nr_allocated_migrations; + + /* + * The number of in flight migrations that are performing + * background io. eg, promotion, writeback. + */ + atomic_t nr_io_migrations; wait_queue_head_t quiescing_wait; atomic_t quiescing; @@ -258,7 +264,6 @@ struct cache { struct dm_deferred_set *all_io_ds; mempool_t *migration_pool; - struct dm_cache_migration *next_migration; struct dm_cache_policy *policy; unsigned policy_nr_args; @@ -350,10 +355,31 @@ static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cel dm_bio_prison_free_cell(cache->prison, cell); } +static struct dm_cache_migration *alloc_migration(struct cache *cache) +{ + struct dm_cache_migration *mg; + + mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); + if (mg) { + mg->cache = cache; + atomic_inc(&mg->cache->nr_allocated_migrations); + } + + return mg; +} + +static void free_migration(struct dm_cache_migration *mg) +{ + if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations)) + wake_up(&mg->cache->migration_wait); + + mempool_free(mg, mg->cache->migration_pool); +} + static int prealloc_data_structs(struct cache *cache, struct prealloc *p) { if (!p->mg) { - p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT); + p->mg = alloc_migration(cache); if (!p->mg) return -ENOMEM; } @@ -382,7 +408,7 @@ static void prealloc_free_structs(struct cache *cache, struct prealloc *p) free_prison_cell(cache, p->cell1); if (p->mg) - mempool_free(p->mg, cache->migration_pool); + free_migration(p->mg); } static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p) @@ -854,24 +880,14 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio, * Migration covers moving data from the origin device to the cache, or * vice versa. *--------------------------------------------------------------*/ -static void free_migration(struct dm_cache_migration *mg) -{ - mempool_free(mg, mg->cache->migration_pool); -} - -static void inc_nr_migrations(struct cache *cache) +static void inc_io_migrations(struct cache *cache) { - atomic_inc(&cache->nr_migrations); + atomic_inc(&cache->nr_io_migrations); } -static void dec_nr_migrations(struct cache *cache) +static void dec_io_migrations(struct cache *cache) { - atomic_dec(&cache->nr_migrations); - - /* - * Wake the worker in case we're suspending the target. - */ - wake_up(&cache->migration_wait); + atomic_dec(&cache->nr_io_migrations); } static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, @@ -894,11 +910,10 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, wake_worker(cache); } -static void cleanup_migration(struct dm_cache_migration *mg) +static void free_io_migration(struct dm_cache_migration *mg) { - struct cache *cache = mg->cache; + dec_io_migrations(mg->cache); free_migration(mg); - dec_nr_migrations(cache); } static void migration_failure(struct dm_cache_migration *mg) @@ -923,7 +938,7 @@ static void migration_failure(struct dm_cache_migration *mg) cell_defer(cache, mg->new_ocell, true); } - cleanup_migration(mg); + free_io_migration(mg); } static void migration_success_pre_commit(struct dm_cache_migration *mg) @@ -934,7 +949,7 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg) if (mg->writeback) { clear_dirty(cache, mg->old_oblock, mg->cblock); cell_defer(cache, mg->old_ocell, false); - cleanup_migration(mg); + free_io_migration(mg); return; } else if (mg->demote) { @@ -944,14 +959,14 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg) mg->old_oblock); if (mg->promote) cell_defer(cache, mg->new_ocell, true); - cleanup_migration(mg); + free_io_migration(mg); return; } } else { if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) { DMWARN_LIMIT("promotion failed; couldn't update on disk metadata"); policy_remove_mapping(cache->policy, mg->new_oblock); - cleanup_migration(mg); + free_io_migration(mg); return; } } @@ -984,7 +999,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) } else { if (mg->invalidate) policy_remove_mapping(cache->policy, mg->old_oblock); - cleanup_migration(mg); + free_io_migration(mg); } } else { @@ -999,7 +1014,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg) bio_endio(mg->new_ocell->holder, 0); cell_defer(cache, mg->new_ocell, false); } - cleanup_migration(mg); + free_io_migration(mg); } } @@ -1251,7 +1266,7 @@ static void promote(struct cache *cache, struct prealloc *structs, mg->new_ocell = cell; mg->start_jiffies = jiffies; - inc_nr_migrations(cache); + inc_io_migrations(cache); quiesce_migration(mg); } @@ -1275,7 +1290,7 @@ static void writeback(struct cache *cache, struct prealloc *structs, mg->new_ocell = NULL; mg->start_jiffies = jiffies; - inc_nr_migrations(cache); + inc_io_migrations(cache); quiesce_migration(mg); } @@ -1302,7 +1317,7 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs, mg->new_ocell = new_ocell; mg->start_jiffies = jiffies; - inc_nr_migrations(cache); + inc_io_migrations(cache); quiesce_migration(mg); } @@ -1330,7 +1345,7 @@ static void invalidate(struct cache *cache, struct prealloc *structs, mg->new_ocell = NULL; mg->start_jiffies = jiffies; - inc_nr_migrations(cache); + inc_io_migrations(cache); quiesce_migration(mg); } @@ -1412,7 +1427,7 @@ static void process_discard_bio(struct cache *cache, struct prealloc *structs, static bool spare_migration_bandwidth(struct cache *cache) { - sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) * + sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) * cache->sectors_per_block; return current_volume < cache->migration_threshold; } @@ -1764,7 +1779,7 @@ static void stop_quiescing(struct cache *cache) static void wait_for_migrations(struct cache *cache) { - wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations)); + wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations)); } static void stop_worker(struct cache *cache) @@ -1876,9 +1891,6 @@ static void destroy(struct cache *cache) { unsigned i; - if (cache->next_migration) - mempool_free(cache->next_migration, cache->migration_pool); - if (cache->migration_pool) mempool_destroy(cache->migration_pool); @@ -2424,7 +2436,8 @@ static int cache_create(struct cache_args *ca, struct cache **result) INIT_LIST_HEAD(&cache->quiesced_migrations); INIT_LIST_HEAD(&cache->completed_migrations); INIT_LIST_HEAD(&cache->need_commit_migrations); - atomic_set(&cache->nr_migrations, 0); + atomic_set(&cache->nr_allocated_migrations, 0); + atomic_set(&cache->nr_io_migrations, 0); init_waitqueue_head(&cache->migration_wait); init_waitqueue_head(&cache->quiescing_wait); @@ -2487,8 +2500,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) goto bad; } - cache->next_migration = NULL; - cache->need_tick_bio = true; cache->sized = false; cache->invalidate = false; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 07c0fa0fa284..777d9ba2acad 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -746,13 +746,7 @@ static int raid_is_congested(struct dm_target_callbacks *cb, int bits) { struct raid_set *rs = container_of(cb, struct raid_set, callbacks); - if (rs->raid_type->level == 1) - return md_raid1_congested(&rs->md, bits); - - if (rs->raid_type->level == 10) - return md_raid10_congested(&rs->md, bits); - - return md_raid5_congested(&rs->md, bits); + return mddev_congested(&rs->md, bits); } /* diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 493478989dbd..07705ee181e3 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -3385,6 +3385,12 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) struct pool_c *pt = ti->private; struct pool *pool = pt->pool; + if (get_pool_mode(pool) >= PM_READ_ONLY) { + DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode", + dm_device_name(pool->pool_md)); + return -EINVAL; + } + if (!strcasecmp(argv[0], "create_thin")) r = process_create_thin_mesg(argc, argv, pool); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b98cd9d84435..2caf5b374649 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -206,6 +206,9 @@ struct mapped_device { /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; + /* the number of internal suspends */ + unsigned internal_suspend_count; + struct dm_stats stats; }; @@ -2928,7 +2931,7 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla { struct dm_table *map = NULL; - if (dm_suspended_internally_md(md)) + if (md->internal_suspend_count++) return; /* nested internal suspend */ if (dm_suspended_md(md)) { @@ -2953,7 +2956,9 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla static void __dm_internal_resume(struct mapped_device *md) { - if (!dm_suspended_internally_md(md)) + BUG_ON(!md->internal_suspend_count); + + if (--md->internal_suspend_count) return; /* resume from nested internal suspend */ if (dm_suspended_md(md)) diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index e8b4574956c7..1277eb26b58a 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -332,13 +332,11 @@ static int run(struct mddev *mddev) return 0; } -static int stop(struct mddev *mddev) +static void faulty_free(struct mddev *mddev, void *priv) { - struct faulty_conf *conf = mddev->private; + struct faulty_conf *conf = priv; kfree(conf); - mddev->private = NULL; - return 0; } static struct md_personality faulty_personality = @@ -348,7 +346,7 @@ static struct md_personality faulty_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = faulty_free, .status = status, .check_reshape = reshape, .size = faulty_size, diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 64713b77df1c..fa7d577f3d12 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -34,7 +34,7 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) lo = 0; hi = mddev->raid_disks - 1; - conf = rcu_dereference(mddev->private); + conf = mddev->private; /* * Binary Search @@ -60,18 +60,16 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) * * Return amount of bytes we can take at this offset */ -static int linear_mergeable_bvec(struct request_queue *q, +static int linear_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct dev_info *dev0; unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int maxbytes = biovec->bv_len; struct request_queue *subq; - rcu_read_lock(); dev0 = which_dev(mddev, sector); maxsectors = dev0->end_sector - sector; subq = bdev_get_queue(dev0->rdev->bdev); @@ -81,7 +79,6 @@ static int linear_mergeable_bvec(struct request_queue *q, maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm, biovec)); } - rcu_read_unlock(); if (maxsectors < bio_sectors) maxsectors = 0; @@ -97,24 +94,18 @@ static int linear_mergeable_bvec(struct request_queue *q, return maxsectors << 9; } -static int linear_congested(void *data, int bits) +static int linear_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct linear_conf *conf; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - - rcu_read_lock(); - conf = rcu_dereference(mddev->private); + conf = mddev->private; for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); } - rcu_read_unlock(); return ret; } @@ -123,12 +114,10 @@ static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disk struct linear_conf *conf; sector_t array_sectors; - rcu_read_lock(); - conf = rcu_dereference(mddev->private); + conf = mddev->private; WARN_ONCE(sectors || raid_disks, "%s does not support generic reshape\n", __func__); array_sectors = conf->array_sectors; - rcu_read_unlock(); return array_sectors; } @@ -217,10 +206,6 @@ static int linear_run (struct mddev *mddev) mddev->private = conf; md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec); - mddev->queue->backing_dev_info.congested_fn = linear_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - ret = md_integrity_register(mddev); if (ret) { kfree(conf); @@ -252,38 +237,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; - oldconf = rcu_dereference_protected(mddev->private, - lockdep_is_held( - &mddev->reconfig_mutex)); + mddev_suspend(mddev); + oldconf = mddev->private; mddev->raid_disks++; - rcu_assign_pointer(mddev->private, newconf); + mddev->private = newconf; md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); + mddev_resume(mddev); revalidate_disk(mddev->gendisk); - kfree_rcu(oldconf, rcu); + kfree(oldconf); return 0; } -static int linear_stop (struct mddev *mddev) +static void linear_free(struct mddev *mddev, void *priv) { - struct linear_conf *conf = - rcu_dereference_protected(mddev->private, - lockdep_is_held( - &mddev->reconfig_mutex)); + struct linear_conf *conf = priv; - /* - * We do not require rcu protection here since - * we hold reconfig_mutex for both linear_add and - * linear_stop, so they cannot race. - * We should make sure any old 'conf's are properly - * freed though. - */ - rcu_barrier(); - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf); - mddev->private = NULL; - - return 0; } static void linear_make_request(struct mddev *mddev, struct bio *bio) @@ -299,16 +269,12 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) } do { - rcu_read_lock(); - tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector); start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; end_sector = tmp_dev->end_sector; data_offset = tmp_dev->rdev->data_offset; bio->bi_bdev = tmp_dev->rdev->bdev; - rcu_read_unlock(); - if (unlikely(bio->bi_iter.bi_sector >= end_sector || bio->bi_iter.bi_sector < start_sector)) goto out_of_bounds; @@ -355,6 +321,10 @@ static void linear_status (struct seq_file *seq, struct mddev *mddev) seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); } +static void linear_quiesce(struct mddev *mddev, int state) +{ +} + static struct md_personality linear_personality = { .name = "linear", @@ -362,10 +332,13 @@ static struct md_personality linear_personality = .owner = THIS_MODULE, .make_request = linear_make_request, .run = linear_run, - .stop = linear_stop, + .free = linear_free, .status = linear_status, .hot_add_disk = linear_add, .size = linear_size, + .quiesce = linear_quiesce, + .congested = linear_congested, + .mergeable_bvec = linear_mergeable_bvec, }; static int __init linear_init (void) diff --git a/drivers/md/md.c b/drivers/md/md.c index 709755fb6d7b..c8d2bac4e28b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -72,6 +72,7 @@ static struct workqueue_struct *md_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); +static void mddev_detach(struct mddev *mddev); /* * Default number of read corrections we'll attempt on an rdev @@ -292,8 +293,8 @@ static void md_make_request(struct request_queue *q, struct bio *bio) /* mddev_suspend makes sure no new requests are submitted * to the device, and that any requests that have been submitted * are completely handled. - * Once ->stop is called and completes, the module will be completely - * unused. + * Once mddev_detach() is called and completes, the module will be + * completely unused. */ void mddev_suspend(struct mddev *mddev) { @@ -321,10 +322,47 @@ EXPORT_SYMBOL_GPL(mddev_resume); int mddev_congested(struct mddev *mddev, int bits) { - return mddev->suspended; + struct md_personality *pers = mddev->pers; + int ret = 0; + + rcu_read_lock(); + if (mddev->suspended) + ret = 1; + else if (pers && pers->congested) + ret = pers->congested(mddev, bits); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(mddev_congested); +static int md_congested(void *data, int bits) +{ + struct mddev *mddev = data; + return mddev_congested(mddev, bits); } -EXPORT_SYMBOL(mddev_congested); +static int md_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) +{ + struct mddev *mddev = q->queuedata; + int ret; + rcu_read_lock(); + if (mddev->suspended) { + /* Must always allow one vec */ + if (bvm->bi_size == 0) + ret = biovec->bv_len; + else + ret = 0; + } else { + struct md_personality *pers = mddev->pers; + if (pers && pers->mergeable_bvec) + ret = pers->mergeable_bvec(mddev, bvm, biovec); + else + ret = biovec->bv_len; + } + rcu_read_unlock(); + return ret; +} /* * Generic flush handling for md */ @@ -397,12 +435,12 @@ static void md_submit_flush_data(struct work_struct *ws) void md_flush_request(struct mddev *mddev, struct bio *bio) { - spin_lock_irq(&mddev->write_lock); + spin_lock_irq(&mddev->lock); wait_event_lock_irq(mddev->sb_wait, !mddev->flush_bio, - mddev->write_lock); + mddev->lock); mddev->flush_bio = bio; - spin_unlock_irq(&mddev->write_lock); + spin_unlock_irq(&mddev->lock); INIT_WORK(&mddev->flush_work, submit_flushes); queue_work(md_wq, &mddev->flush_work); @@ -465,7 +503,7 @@ void mddev_init(struct mddev *mddev) atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->active_io, 0); - spin_lock_init(&mddev->write_lock); + spin_lock_init(&mddev->lock); atomic_set(&mddev->flush_pending, 0); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); @@ -552,32 +590,9 @@ static struct mddev *mddev_find(dev_t unit) goto retry; } -static inline int __must_check mddev_lock(struct mddev *mddev) -{ - return mutex_lock_interruptible(&mddev->reconfig_mutex); -} - -/* Sometimes we need to take the lock in a situation where - * failure due to interrupts is not acceptable. - */ -static inline void mddev_lock_nointr(struct mddev *mddev) -{ - mutex_lock(&mddev->reconfig_mutex); -} - -static inline int mddev_is_locked(struct mddev *mddev) -{ - return mutex_is_locked(&mddev->reconfig_mutex); -} - -static inline int mddev_trylock(struct mddev *mddev) -{ - return mutex_trylock(&mddev->reconfig_mutex); -} - static struct attribute_group md_redundancy_group; -static void mddev_unlock(struct mddev *mddev) +void mddev_unlock(struct mddev *mddev) { if (mddev->to_remove) { /* These cannot be removed under reconfig_mutex as @@ -619,6 +634,7 @@ static void mddev_unlock(struct mddev *mddev) md_wakeup_thread(mddev->thread); spin_unlock(&pers_lock); } +EXPORT_SYMBOL_GPL(mddev_unlock); static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) { @@ -2230,7 +2246,7 @@ repeat: return; } - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); mddev->utime = get_seconds(); @@ -2287,7 +2303,7 @@ repeat: } sync_sbs(mddev, nospares); - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", mdname(mddev), mddev->in_sync); @@ -2326,15 +2342,15 @@ repeat: md_super_wait(mddev); /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync != sync_req || test_bit(MD_CHANGE_DEVS, &mddev->flags)) { /* have to write it out again */ - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); goto repeat; } clear_bit(MD_CHANGE_PENDING, &mddev->flags); - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); wake_up(&mddev->sb_wait); if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) sysfs_notify(&mddev->kobj, NULL, "sync_completed"); @@ -2381,40 +2397,41 @@ state_show(struct md_rdev *rdev, char *page) { char *sep = ""; size_t len = 0; + unsigned long flags = ACCESS_ONCE(rdev->flags); - if (test_bit(Faulty, &rdev->flags) || + if (test_bit(Faulty, &flags) || rdev->badblocks.unacked_exist) { len+= sprintf(page+len, "%sfaulty",sep); sep = ","; } - if (test_bit(In_sync, &rdev->flags)) { + if (test_bit(In_sync, &flags)) { len += sprintf(page+len, "%sin_sync",sep); sep = ","; } - if (test_bit(WriteMostly, &rdev->flags)) { + if (test_bit(WriteMostly, &flags)) { len += sprintf(page+len, "%swrite_mostly",sep); sep = ","; } - if (test_bit(Blocked, &rdev->flags) || + if (test_bit(Blocked, &flags) || (rdev->badblocks.unacked_exist - && !test_bit(Faulty, &rdev->flags))) { + && !test_bit(Faulty, &flags))) { len += sprintf(page+len, "%sblocked", sep); sep = ","; } - if (!test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags)) { + if (!test_bit(Faulty, &flags) && + !test_bit(In_sync, &flags)) { len += sprintf(page+len, "%sspare", sep); sep = ","; } - if (test_bit(WriteErrorSeen, &rdev->flags)) { + if (test_bit(WriteErrorSeen, &flags)) { len += sprintf(page+len, "%swrite_error", sep); sep = ","; } - if (test_bit(WantReplacement, &rdev->flags)) { + if (test_bit(WantReplacement, &flags)) { len += sprintf(page+len, "%swant_replacement", sep); sep = ","; } - if (test_bit(Replacement, &rdev->flags)) { + if (test_bit(Replacement, &flags)) { len += sprintf(page+len, "%sreplacement", sep); sep = ","; } @@ -2927,21 +2944,12 @@ rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); - struct mddev *mddev = rdev->mddev; - ssize_t rv; if (!entry->show) return -EIO; - - rv = mddev ? mddev_lock(mddev) : -EBUSY; - if (!rv) { - if (rdev->mddev == NULL) - rv = -EBUSY; - else - rv = entry->show(rdev, page); - mddev_unlock(mddev); - } - return rv; + if (!rdev->mddev) + return -EBUSY; + return entry->show(rdev, page); } static ssize_t @@ -3212,11 +3220,13 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) mddev->safemode_delay = 0; else { unsigned long old_delay = mddev->safemode_delay; - mddev->safemode_delay = (msec*HZ)/1000; - if (mddev->safemode_delay == 0) - mddev->safemode_delay = 1; - if (mddev->safemode_delay < old_delay || old_delay == 0) - md_safemode_timeout((unsigned long)mddev); + unsigned long new_delay = (msec*HZ)/1000; + + if (new_delay == 0) + new_delay = 1; + mddev->safemode_delay = new_delay; + if (new_delay < old_delay || old_delay == 0) + mod_timer(&mddev->safemode_timer, jiffies+1); } return len; } @@ -3226,41 +3236,52 @@ __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); static ssize_t level_show(struct mddev *mddev, char *page) { - struct md_personality *p = mddev->pers; + struct md_personality *p; + int ret; + spin_lock(&mddev->lock); + p = mddev->pers; if (p) - return sprintf(page, "%s\n", p->name); + ret = sprintf(page, "%s\n", p->name); else if (mddev->clevel[0]) - return sprintf(page, "%s\n", mddev->clevel); + ret = sprintf(page, "%s\n", mddev->clevel); else if (mddev->level != LEVEL_NONE) - return sprintf(page, "%d\n", mddev->level); + ret = sprintf(page, "%d\n", mddev->level); else - return 0; + ret = 0; + spin_unlock(&mddev->lock); + return ret; } static ssize_t level_store(struct mddev *mddev, const char *buf, size_t len) { char clevel[16]; - ssize_t rv = len; - struct md_personality *pers; + ssize_t rv; + size_t slen = len; + struct md_personality *pers, *oldpers; long level; - void *priv; + void *priv, *oldpriv; struct md_rdev *rdev; + if (slen == 0 || slen >= sizeof(clevel)) + return -EINVAL; + + rv = mddev_lock(mddev); + if (rv) + return rv; + if (mddev->pers == NULL) { - if (len == 0) - return 0; - if (len >= sizeof(mddev->clevel)) - return -ENOSPC; - strncpy(mddev->clevel, buf, len); - if (mddev->clevel[len-1] == '\n') - len--; - mddev->clevel[len] = 0; + strncpy(mddev->clevel, buf, slen); + if (mddev->clevel[slen-1] == '\n') + slen--; + mddev->clevel[slen] = 0; mddev->level = LEVEL_NONE; - return rv; + rv = len; + goto out_unlock; } + rv = -EROFS; if (mddev->ro) - return -EROFS; + goto out_unlock; /* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape @@ -3268,25 +3289,25 @@ level_store(struct mddev *mddev, const char *buf, size_t len) * - new personality will access other array. */ + rv = -EBUSY; if (mddev->sync_thread || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->reshape_position != MaxSector || mddev->sysfs_active) - return -EBUSY; + goto out_unlock; + rv = -EINVAL; if (!mddev->pers->quiesce) { printk(KERN_WARNING "md: %s: %s does not support online personality change\n", mdname(mddev), mddev->pers->name); - return -EINVAL; + goto out_unlock; } /* Now find the new personality */ - if (len == 0 || len >= sizeof(clevel)) - return -EINVAL; - strncpy(clevel, buf, len); - if (clevel[len-1] == '\n') - len--; - clevel[len] = 0; + strncpy(clevel, buf, slen); + if (clevel[slen-1] == '\n') + slen--; + clevel[slen] = 0; if (kstrtol(clevel, 10, &level)) level = LEVEL_NONE; @@ -3297,20 +3318,23 @@ level_store(struct mddev *mddev, const char *buf, size_t len) if (!pers || !try_module_get(pers->owner)) { spin_unlock(&pers_lock); printk(KERN_WARNING "md: personality %s not loaded\n", clevel); - return -EINVAL; + rv = -EINVAL; + goto out_unlock; } spin_unlock(&pers_lock); if (pers == mddev->pers) { /* Nothing to do! */ module_put(pers->owner); - return rv; + rv = len; + goto out_unlock; } if (!pers->takeover) { module_put(pers->owner); printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", mdname(mddev), clevel); - return -EINVAL; + rv = -EINVAL; + goto out_unlock; } rdev_for_each(rdev, mddev) @@ -3330,30 +3354,29 @@ level_store(struct mddev *mddev, const char *buf, size_t len) module_put(pers->owner); printk(KERN_WARNING "md: %s: %s would not accept array\n", mdname(mddev), clevel); - return PTR_ERR(priv); + rv = PTR_ERR(priv); + goto out_unlock; } /* Looks like we have a winner */ mddev_suspend(mddev); - mddev->pers->stop(mddev); + mddev_detach(mddev); - if (mddev->pers->sync_request == NULL && - pers->sync_request != NULL) { - /* need to add the md_redundancy_group */ - if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) - printk(KERN_WARNING - "md: cannot register extra attributes for %s\n", - mdname(mddev)); - mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); - } - if (mddev->pers->sync_request != NULL && - pers->sync_request == NULL) { - /* need to remove the md_redundancy_group */ - if (mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - } + spin_lock(&mddev->lock); + oldpers = mddev->pers; + oldpriv = mddev->private; + mddev->pers = pers; + mddev->private = priv; + strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); + mddev->level = mddev->new_level; + mddev->layout = mddev->new_layout; + mddev->chunk_sectors = mddev->new_chunk_sectors; + mddev->delta_disks = 0; + mddev->reshape_backwards = 0; + mddev->degraded = 0; + spin_unlock(&mddev->lock); - if (mddev->pers->sync_request == NULL && + if (oldpers->sync_request == NULL && mddev->external) { /* We are converting from a no-redundancy array * to a redundancy array and metadata is managed @@ -3367,6 +3390,24 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->safemode = 0; } + oldpers->free(mddev, oldpriv); + + if (oldpers->sync_request == NULL && + pers->sync_request != NULL) { + /* need to add the md_redundancy_group */ + if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) + printk(KERN_WARNING + "md: cannot register extra attributes for %s\n", + mdname(mddev)); + mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); + } + if (oldpers->sync_request != NULL && + pers->sync_request == NULL) { + /* need to remove the md_redundancy_group */ + if (mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + } + rdev_for_each(rdev, mddev) { if (rdev->raid_disk < 0) continue; @@ -3392,17 +3433,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) } } - module_put(mddev->pers->owner); - mddev->pers = pers; - mddev->private = priv; - strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); - mddev->level = mddev->new_level; - mddev->layout = mddev->new_layout; - mddev->chunk_sectors = mddev->new_chunk_sectors; - mddev->delta_disks = 0; - mddev->reshape_backwards = 0; - mddev->degraded = 0; - if (mddev->pers->sync_request == NULL) { + if (pers->sync_request == NULL) { /* this is now an array without redundancy, so * it must always be in_sync */ @@ -3417,6 +3448,9 @@ level_store(struct mddev *mddev, const char *buf, size_t len) md_update_sb(mddev, 1); sysfs_notify(&mddev->kobj, NULL, "level"); md_new_event(mddev); + rv = len; +out_unlock: + mddev_unlock(mddev); return rv; } @@ -3439,28 +3473,32 @@ layout_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long n = simple_strtoul(buf, &e, 10); + int err; if (!*buf || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { - int err; if (mddev->pers->check_reshape == NULL) - return -EBUSY; - if (mddev->ro) - return -EROFS; - mddev->new_layout = n; - err = mddev->pers->check_reshape(mddev); - if (err) { - mddev->new_layout = mddev->layout; - return err; + err = -EBUSY; + else if (mddev->ro) + err = -EROFS; + else { + mddev->new_layout = n; + err = mddev->pers->check_reshape(mddev); + if (err) + mddev->new_layout = mddev->layout; } } else { mddev->new_layout = n; if (mddev->reshape_position == MaxSector) mddev->layout = n; } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_layout = __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); @@ -3483,32 +3521,39 @@ static ssize_t raid_disks_store(struct mddev *mddev, const char *buf, size_t len) { char *e; - int rv = 0; + int err; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) - rv = update_raid_disks(mddev, n); + err = update_raid_disks(mddev, n); else if (mddev->reshape_position != MaxSector) { struct md_rdev *rdev; int olddisks = mddev->raid_disks - mddev->delta_disks; + err = -EINVAL; rdev_for_each(rdev, mddev) { if (olddisks < n && rdev->data_offset < rdev->new_data_offset) - return -EINVAL; + goto out_unlock; if (olddisks > n && rdev->data_offset > rdev->new_data_offset) - return -EINVAL; + goto out_unlock; } + err = 0; mddev->delta_disks = n - olddisks; mddev->raid_disks = n; mddev->reshape_backwards = (mddev->delta_disks < 0); } else mddev->raid_disks = n; - return rv ? rv : len; +out_unlock: + mddev_unlock(mddev); + return err ? err : len; } static struct md_sysfs_entry md_raid_disks = __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); @@ -3527,30 +3572,34 @@ chunk_size_show(struct mddev *mddev, char *page) static ssize_t chunk_size_store(struct mddev *mddev, const char *buf, size_t len) { + int err; char *e; unsigned long n = simple_strtoul(buf, &e, 10); if (!*buf || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { - int err; if (mddev->pers->check_reshape == NULL) - return -EBUSY; - if (mddev->ro) - return -EROFS; - mddev->new_chunk_sectors = n >> 9; - err = mddev->pers->check_reshape(mddev); - if (err) { - mddev->new_chunk_sectors = mddev->chunk_sectors; - return err; + err = -EBUSY; + else if (mddev->ro) + err = -EROFS; + else { + mddev->new_chunk_sectors = n >> 9; + err = mddev->pers->check_reshape(mddev); + if (err) + mddev->new_chunk_sectors = mddev->chunk_sectors; } } else { mddev->new_chunk_sectors = n >> 9; if (mddev->reshape_position == MaxSector) mddev->chunk_sectors = n >> 9; } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_chunk_size = __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); @@ -3566,20 +3615,27 @@ resync_start_show(struct mddev *mddev, char *page) static ssize_t resync_start_store(struct mddev *mddev, const char *buf, size_t len) { + int err; char *e; unsigned long long n = simple_strtoull(buf, &e, 10); + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) - return -EBUSY; - if (cmd_match(buf, "none")) + err = -EBUSY; + else if (cmd_match(buf, "none")) n = MaxSector; else if (!*buf || (*e && *e != '\n')) - return -EINVAL; + err = -EINVAL; - mddev->recovery_cp = n; - if (mddev->pers) - set_bit(MD_CHANGE_CLEAN, &mddev->flags); - return len; + if (!err) { + mddev->recovery_cp = n; + if (mddev->pers) + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_resync_start = __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); @@ -3677,8 +3733,39 @@ static int restart_array(struct mddev *mddev); static ssize_t array_state_store(struct mddev *mddev, const char *buf, size_t len) { - int err = -EINVAL; + int err; enum array_state st = match_word(buf, array_states); + + if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) { + /* don't take reconfig_mutex when toggling between + * clean and active + */ + spin_lock(&mddev->lock); + if (st == active) { + restart_array(mddev); + clear_bit(MD_CHANGE_PENDING, &mddev->flags); + wake_up(&mddev->sb_wait); + err = 0; + } else /* st == clean */ { + restart_array(mddev); + if (atomic_read(&mddev->writes_pending) == 0) { + if (mddev->in_sync == 0) { + mddev->in_sync = 1; + if (mddev->safemode == 1) + mddev->safemode = 0; + set_bit(MD_CHANGE_CLEAN, &mddev->flags); + } + err = 0; + } else + err = -EBUSY; + } + spin_unlock(&mddev->lock); + return err; + } + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; switch(st) { case bad_word: break; @@ -3722,7 +3809,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) case clean: if (mddev->pers) { restart_array(mddev); - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (atomic_read(&mddev->writes_pending) == 0) { if (mddev->in_sync == 0) { mddev->in_sync = 1; @@ -3733,7 +3820,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) err = 0; } else err = -EBUSY; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); } else err = -EINVAL; break; @@ -3754,14 +3841,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) /* these cannot be set */ break; } - if (err) - return err; - else { + + if (!err) { if (mddev->hold_active == UNTIL_IOCTL) mddev->hold_active = 0; sysfs_notify_dirent_safe(mddev->sysfs_state); - return len; } + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_array_state = __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); @@ -3822,6 +3909,11 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) minor != MINOR(dev)) return -EOVERFLOW; + flush_workqueue(md_misc_wq); + + err = mddev_lock(mddev); + if (err) + return err; if (mddev->persistent) { rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); @@ -3845,6 +3937,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) out: if (err) export_rdev(rdev); + mddev_unlock(mddev); return err ? err : len; } @@ -3856,7 +3949,11 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) { char *end; unsigned long chunk, end_chunk; + int err; + err = mddev_lock(mddev); + if (err) + return err; if (!mddev->bitmap) goto out; /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ @@ -3874,6 +3971,7 @@ bitmap_store(struct mddev *mddev, const char *buf, size_t len) } bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ out: + mddev_unlock(mddev); return len; } @@ -3901,6 +3999,9 @@ size_store(struct mddev *mddev, const char *buf, size_t len) if (err < 0) return err; + err = mddev_lock(mddev); + if (err) + return err; if (mddev->pers) { err = update_size(mddev, sectors); md_update_sb(mddev, 1); @@ -3911,6 +4012,7 @@ size_store(struct mddev *mddev, const char *buf, size_t len) else err = -ENOSPC; } + mddev_unlock(mddev); return err ? err : len; } @@ -3940,21 +4042,28 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) { int major, minor; char *e; + int err; /* Changing the details of 'external' metadata is * always permitted. Otherwise there must be * no devices attached to the array. */ + + err = mddev_lock(mddev); + if (err) + return err; + err = -EBUSY; if (mddev->external && strncmp(buf, "external:", 9) == 0) ; else if (!list_empty(&mddev->disks)) - return -EBUSY; + goto out_unlock; + err = 0; if (cmd_match(buf, "none")) { mddev->persistent = 0; mddev->external = 0; mddev->major_version = 0; mddev->minor_version = 90; - return len; + goto out_unlock; } if (strncmp(buf, "external:", 9) == 0) { size_t namelen = len-9; @@ -3968,22 +4077,27 @@ metadata_store(struct mddev *mddev, const char *buf, size_t len) mddev->external = 1; mddev->major_version = 0; mddev->minor_version = 90; - return len; + goto out_unlock; } major = simple_strtoul(buf, &e, 10); + err = -EINVAL; if (e==buf || *e != '.') - return -EINVAL; + goto out_unlock; buf = e+1; minor = simple_strtoul(buf, &e, 10); if (e==buf || (*e && *e != '\n') ) - return -EINVAL; + goto out_unlock; + err = -ENOENT; if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) - return -ENOENT; + goto out_unlock; mddev->major_version = major; mddev->minor_version = minor; mddev->persistent = 1; mddev->external = 0; - return len; + err = 0; +out_unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_metadata = @@ -3993,20 +4107,21 @@ static ssize_t action_show(struct mddev *mddev, char *page) { char *type = "idle"; - if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + unsigned long recovery = mddev->recovery; + if (test_bit(MD_RECOVERY_FROZEN, &recovery)) type = "frozen"; - else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_RUNNING, &recovery) || + (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) { + if (test_bit(MD_RECOVERY_RESHAPE, &recovery)) type = "reshape"; - else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { - if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_SYNC, &recovery)) { + if (!test_bit(MD_RECOVERY_REQUESTED, &recovery)) type = "resync"; - else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) + else if (test_bit(MD_RECOVERY_CHECK, &recovery)) type = "check"; else type = "repair"; - } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) + } else if (test_bit(MD_RECOVERY_RECOVER, &recovery)) type = "recover"; } return sprintf(page, "%s\n", type); @@ -4027,7 +4142,10 @@ action_store(struct mddev *mddev, const char *page, size_t len) flush_workqueue(md_misc_wq); if (mddev->sync_thread) { set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); + if (mddev_lock(mddev) == 0) { + md_reap_sync_thread(mddev); + mddev_unlock(mddev); + } } } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) @@ -4041,7 +4159,11 @@ action_store(struct mddev *mddev, const char *page, size_t len) int err; if (mddev->pers->start_reshape == NULL) return -EINVAL; - err = mddev->pers->start_reshape(mddev); + err = mddev_lock(mddev); + if (!err) { + err = mddev->pers->start_reshape(mddev); + mddev_unlock(mddev); + } if (err) return err; sysfs_notify(&mddev->kobj, NULL, "degraded"); @@ -4225,22 +4347,36 @@ static ssize_t min_sync_store(struct mddev *mddev, const char *buf, size_t len) { unsigned long long min; + int err; + int chunk; + if (kstrtoull(buf, 10, &min)) return -EINVAL; + + spin_lock(&mddev->lock); + err = -EINVAL; if (min > mddev->resync_max) - return -EINVAL; + goto out_unlock; + + err = -EBUSY; if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; + goto out_unlock; /* Must be a multiple of chunk_size */ - if (mddev->chunk_sectors) { + chunk = mddev->chunk_sectors; + if (chunk) { sector_t temp = min; - if (sector_div(temp, mddev->chunk_sectors)) - return -EINVAL; + + err = -EINVAL; + if (sector_div(temp, chunk)) + goto out_unlock; } mddev->resync_min = min; + err = 0; - return len; +out_unlock: + spin_unlock(&mddev->lock); + return err ?: len; } static struct md_sysfs_entry md_min_sync = @@ -4258,29 +4394,42 @@ max_sync_show(struct mddev *mddev, char *page) static ssize_t max_sync_store(struct mddev *mddev, const char *buf, size_t len) { + int err; + spin_lock(&mddev->lock); if (strncmp(buf, "max", 3) == 0) mddev->resync_max = MaxSector; else { unsigned long long max; + int chunk; + + err = -EINVAL; if (kstrtoull(buf, 10, &max)) - return -EINVAL; + goto out_unlock; if (max < mddev->resync_min) - return -EINVAL; + goto out_unlock; + + err = -EBUSY; if (max < mddev->resync_max && mddev->ro == 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) - return -EBUSY; + goto out_unlock; /* Must be a multiple of chunk_size */ - if (mddev->chunk_sectors) { + chunk = mddev->chunk_sectors; + if (chunk) { sector_t temp = max; - if (sector_div(temp, mddev->chunk_sectors)) - return -EINVAL; + + err = -EINVAL; + if (sector_div(temp, chunk)) + goto out_unlock; } mddev->resync_max = max; } wake_up(&mddev->recovery_wait); - return len; + err = 0; +out_unlock: + spin_unlock(&mddev->lock); + return err ?: len; } static struct md_sysfs_entry md_max_sync = @@ -4297,14 +4446,20 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old = mddev->suspend_lo; + unsigned long long old; + int err; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) + goto unlock; + old = mddev->suspend_lo; mddev->suspend_lo = new; if (new >= old) /* Shrinking suspended region */ @@ -4314,7 +4469,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_suspend_lo = __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); @@ -4330,14 +4488,20 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { char *e; unsigned long long new = simple_strtoull(buf, &e, 10); - unsigned long long old = mddev->suspend_hi; + unsigned long long old; + int err; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) - return -EINVAL; if (buf == e || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EINVAL; + if (mddev->pers == NULL || + mddev->pers->quiesce == NULL) + goto unlock; + old = mddev->suspend_hi; mddev->suspend_hi = new; if (new <= old) /* Shrinking suspended region */ @@ -4347,7 +4511,10 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); } - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_suspend_hi = __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); @@ -4367,11 +4534,17 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) { struct md_rdev *rdev; char *e; + int err; unsigned long long new = simple_strtoull(buf, &e, 10); - if (mddev->pers) - return -EBUSY; + if (buf == e || (*e && *e != '\n')) return -EINVAL; + err = mddev_lock(mddev); + if (err) + return err; + err = -EBUSY; + if (mddev->pers) + goto unlock; mddev->reshape_position = new; mddev->delta_disks = 0; mddev->reshape_backwards = 0; @@ -4380,7 +4553,10 @@ reshape_position_store(struct mddev *mddev, const char *buf, size_t len) mddev->new_chunk_sectors = mddev->chunk_sectors; rdev_for_each(rdev, mddev) rdev->new_data_offset = rdev->data_offset; - return len; + err = 0; +unlock: + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_reshape_position = @@ -4398,6 +4574,8 @@ static ssize_t reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) { int backwards = 0; + int err; + if (cmd_match(buf, "forwards")) backwards = 0; else if (cmd_match(buf, "backwards")) @@ -4407,16 +4585,19 @@ reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->reshape_backwards == backwards) return len; + err = mddev_lock(mddev); + if (err) + return err; /* check if we are allowed to change */ if (mddev->delta_disks) - return -EBUSY; - - if (mddev->persistent && + err = -EBUSY; + else if (mddev->persistent && mddev->major_version == 0) - return -EINVAL; - - mddev->reshape_backwards = backwards; - return len; + err = -EINVAL; + else + mddev->reshape_backwards = backwards; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_reshape_direction = @@ -4437,6 +4618,11 @@ static ssize_t array_size_store(struct mddev *mddev, const char *buf, size_t len) { sector_t sectors; + int err; + + err = mddev_lock(mddev); + if (err) + return err; if (strncmp(buf, "default", 7) == 0) { if (mddev->pers) @@ -4447,19 +4633,22 @@ array_size_store(struct mddev *mddev, const char *buf, size_t len) mddev->external_size = 0; } else { if (strict_blocks_to_sectors(buf, §ors) < 0) - return -EINVAL; - if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) - return -E2BIG; - - mddev->external_size = 1; + err = -EINVAL; + else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) + err = -E2BIG; + else + mddev->external_size = 1; } - mddev->array_sectors = sectors; - if (mddev->pers) { - set_capacity(mddev->gendisk, mddev->array_sectors); - revalidate_disk(mddev->gendisk); + if (!err) { + mddev->array_sectors = sectors; + if (mddev->pers) { + set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); + } } - return len; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry md_array_size = @@ -4523,11 +4712,7 @@ md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) mddev_get(mddev); spin_unlock(&all_mddevs_lock); - rv = mddev_lock(mddev); - if (!rv) { - rv = entry->show(mddev, page); - mddev_unlock(mddev); - } + rv = entry->show(mddev, page); mddev_put(mddev); return rv; } @@ -4551,13 +4736,7 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, } mddev_get(mddev); spin_unlock(&all_mddevs_lock); - if (entry->store == new_dev_store) - flush_workqueue(md_misc_wq); - rv = mddev_lock(mddev); - if (!rv) { - rv = entry->store(mddev, page, length); - mddev_unlock(mddev); - } + rv = entry->store(mddev, page, length); mddev_put(mddev); return rv; } @@ -4825,7 +5004,6 @@ int md_run(struct mddev *mddev) mddev->clevel); return -EINVAL; } - mddev->pers = pers; spin_unlock(&pers_lock); if (mddev->level != pers->level) { mddev->level = pers->level; @@ -4836,7 +5014,6 @@ int md_run(struct mddev *mddev) if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ - mddev->pers = NULL; module_put(pers->owner); return -EINVAL; } @@ -4880,35 +5057,38 @@ int md_run(struct mddev *mddev) if (start_readonly && mddev->ro == 0) mddev->ro = 2; /* read-only, but switch on first write */ - err = mddev->pers->run(mddev); + err = pers->run(mddev); if (err) printk(KERN_ERR "md: pers->run() failed ...\n"); - else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { + else if (pers->size(mddev, 0, 0) < mddev->array_sectors) { WARN_ONCE(!mddev->external_size, "%s: default size too small," " but 'external_size' not in effect?\n", __func__); printk(KERN_ERR "md: invalid array_size %llu > default size %llu\n", (unsigned long long)mddev->array_sectors / 2, - (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); + (unsigned long long)pers->size(mddev, 0, 0) / 2); err = -EINVAL; - mddev->pers->stop(mddev); } - if (err == 0 && mddev->pers->sync_request && + if (err == 0 && pers->sync_request && (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { err = bitmap_create(mddev); - if (err) { + if (err) printk(KERN_ERR "%s: failed to create bitmap (%d)\n", mdname(mddev), err); - mddev->pers->stop(mddev); - } } if (err) { - module_put(mddev->pers->owner); - mddev->pers = NULL; + mddev_detach(mddev); + pers->free(mddev, mddev->private); + module_put(pers->owner); bitmap_destroy(mddev); return err; } - if (mddev->pers->sync_request) { + if (mddev->queue) { + mddev->queue->backing_dev_info.congested_data = mddev; + mddev->queue->backing_dev_info.congested_fn = md_congested; + blk_queue_merge_bvec(mddev->queue, md_mergeable_bvec); + } + if (pers->sync_request) { if (mddev->kobj.sd && sysfs_create_group(&mddev->kobj, &md_redundancy_group)) printk(KERN_WARNING @@ -4927,7 +5107,10 @@ int md_run(struct mddev *mddev) mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->in_sync = 1; smp_wmb(); + spin_lock(&mddev->lock); + mddev->pers = pers; mddev->ready = 1; + spin_unlock(&mddev->lock); rdev_for_each(rdev, mddev) if (rdev->raid_disk >= 0) if (sysfs_link_rdev(mddev, rdev)) @@ -5070,14 +5253,38 @@ void md_stop_writes(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_stop_writes); +static void mddev_detach(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + /* wait for behind writes to complete */ + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + printk(KERN_INFO "md:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); + /* need to kick something here to make sure I/O goes? */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } + if (mddev->pers && mddev->pers->quiesce) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } + md_unregister_thread(&mddev->thread); + if (mddev->queue) + blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ +} + static void __md_stop(struct mddev *mddev) { + struct md_personality *pers = mddev->pers; + mddev_detach(mddev); + spin_lock(&mddev->lock); mddev->ready = 0; - mddev->pers->stop(mddev); - if (mddev->pers->sync_request && mddev->to_remove == NULL) - mddev->to_remove = &md_redundancy_group; - module_put(mddev->pers->owner); mddev->pers = NULL; + spin_unlock(&mddev->lock); + pers->free(mddev, mddev->private); + if (pers->sync_request && mddev->to_remove == NULL) + mddev->to_remove = &md_redundancy_group; + module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } @@ -5226,8 +5433,11 @@ static int do_md_stop(struct mddev *mddev, int mode, bitmap_destroy(mddev); if (mddev->bitmap_info.file) { - fput(mddev->bitmap_info.file); + struct file *f = mddev->bitmap_info.file; + spin_lock(&mddev->lock); mddev->bitmap_info.file = NULL; + spin_unlock(&mddev->lock); + fput(f); } mddev->bitmap_info.offset = 0; @@ -5436,37 +5646,31 @@ static int get_array_info(struct mddev *mddev, void __user *arg) static int get_bitmap_file(struct mddev *mddev, void __user * arg) { mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ - char *ptr, *buf = NULL; - int err = -ENOMEM; + char *ptr; + int err; file = kmalloc(sizeof(*file), GFP_NOIO); - if (!file) - goto out; + return -ENOMEM; + err = 0; + spin_lock(&mddev->lock); /* bitmap disabled, zero the first byte and copy out */ - if (!mddev->bitmap || !mddev->bitmap->storage.file) { + if (!mddev->bitmap_info.file) file->pathname[0] = '\0'; - goto copy_out; - } - - buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); - if (!buf) - goto out; - - ptr = d_path(&mddev->bitmap->storage.file->f_path, - buf, sizeof(file->pathname)); - if (IS_ERR(ptr)) - goto out; - - strcpy(file->pathname, ptr); + else if ((ptr = d_path(&mddev->bitmap_info.file->f_path, + file->pathname, sizeof(file->pathname))), + IS_ERR(ptr)) + err = PTR_ERR(ptr); + else + memmove(file->pathname, ptr, + sizeof(file->pathname)-(ptr-file->pathname)); + spin_unlock(&mddev->lock); -copy_out: - err = 0; - if (copy_to_user(arg, file, sizeof(*file))) + if (err == 0 && + copy_to_user(arg, file, sizeof(*file))) err = -EFAULT; -out: - kfree(buf); + kfree(file); return err; } @@ -5789,22 +5993,24 @@ static int set_bitmap_file(struct mddev *mddev, int fd) if (fd >= 0) { struct inode *inode; - if (mddev->bitmap) + struct file *f; + + if (mddev->bitmap || mddev->bitmap_info.file) return -EEXIST; /* cannot add when bitmap is present */ - mddev->bitmap_info.file = fget(fd); + f = fget(fd); - if (mddev->bitmap_info.file == NULL) { + if (f == NULL) { printk(KERN_ERR "%s: error: failed to get bitmap file\n", mdname(mddev)); return -EBADF; } - inode = mddev->bitmap_info.file->f_mapping->host; + inode = f->f_mapping->host; if (!S_ISREG(inode->i_mode)) { printk(KERN_ERR "%s: error: bitmap file must be a regular file\n", mdname(mddev)); err = -EBADF; - } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) { + } else if (!(f->f_mode & FMODE_WRITE)) { printk(KERN_ERR "%s: error: bitmap file must open for write\n", mdname(mddev)); err = -EBADF; @@ -5814,10 +6020,10 @@ static int set_bitmap_file(struct mddev *mddev, int fd) err = -EBUSY; } if (err) { - fput(mddev->bitmap_info.file); - mddev->bitmap_info.file = NULL; + fput(f); return err; } + mddev->bitmap_info.file = f; mddev->bitmap_info.offset = 0; /* file overrides offset */ } else if (mddev->bitmap == NULL) return -ENOENT; /* cannot remove what isn't there */ @@ -5836,9 +6042,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd) mddev->pers->quiesce(mddev, 0); } if (fd < 0) { - if (mddev->bitmap_info.file) - fput(mddev->bitmap_info.file); - mddev->bitmap_info.file = NULL; + struct file *f = mddev->bitmap_info.file; + if (f) { + spin_lock(&mddev->lock); + mddev->bitmap_info.file = NULL; + spin_unlock(&mddev->lock); + fput(f); + } } return err; @@ -6251,6 +6461,11 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, case SET_DISK_FAULTY: err = set_disk_faulty(mddev, new_decode_dev(arg)); goto out; + + case GET_BITMAP_FILE: + err = get_bitmap_file(mddev, argp); + goto out; + } if (cmd == ADD_NEW_DISK) @@ -6342,10 +6557,6 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, * Commands even a read-only array can execute: */ switch (cmd) { - case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, argp); - goto unlock; - case RESTART_ARRAY_RW: err = restart_array(mddev); goto unlock; @@ -6873,9 +7084,7 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; } - if (mddev_lock(mddev) < 0) - return -EINTR; - + spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : %sactive", mdname(mddev), mddev->pers ? "" : "in"); @@ -6888,7 +7097,8 @@ static int md_seq_show(struct seq_file *seq, void *v) } sectors = 0; - rdev_for_each(rdev, mddev) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { char b[BDEVNAME_SIZE]; seq_printf(seq, " %s[%d]", bdevname(rdev->bdev,b), rdev->desc_nr); @@ -6904,6 +7114,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "(R)"); sectors += rdev->sectors; } + rcu_read_unlock(); if (!list_empty(&mddev->disks)) { if (mddev->pers) @@ -6946,7 +7157,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } - mddev_unlock(mddev); + spin_unlock(&mddev->lock); return 0; } @@ -7102,7 +7313,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) if (mddev->safemode == 1) mddev->safemode = 0; if (mddev->in_sync) { - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); @@ -7110,7 +7321,7 @@ void md_write_start(struct mddev *mddev, struct bio *bi) md_wakeup_thread(mddev->thread); did_change = 1; } - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); } if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); @@ -7148,7 +7359,7 @@ int md_allow_write(struct mddev *mddev) if (!mddev->pers->sync_request) return 0; - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->in_sync) { mddev->in_sync = 0; set_bit(MD_CHANGE_CLEAN, &mddev->flags); @@ -7156,11 +7367,11 @@ int md_allow_write(struct mddev *mddev) if (mddev->safemode_delay && mddev->safemode == 0) mddev->safemode = 1; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); md_update_sb(mddev, 0); sysfs_notify_dirent_safe(mddev->sysfs_state); } else - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) return -EAGAIN; @@ -7513,6 +7724,7 @@ void md_do_sync(struct md_thread *thread) skip: set_bit(MD_CHANGE_DEVS, &mddev->flags); + spin_lock(&mddev->lock); if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { /* We completed so min/max setting can be forgotten if used. */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) @@ -7521,6 +7733,8 @@ void md_do_sync(struct md_thread *thread) } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) mddev->resync_min = mddev->curr_resync_completed; mddev->curr_resync = 0; + spin_unlock(&mddev->lock); + wake_up(&resync_wait); set_bit(MD_RECOVERY_DONE, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -7688,7 +7902,7 @@ void md_check_recovery(struct mddev *mddev) if (!mddev->external) { int did_change = 0; - spin_lock_irq(&mddev->write_lock); + spin_lock(&mddev->lock); if (mddev->safemode && !atomic_read(&mddev->writes_pending) && !mddev->in_sync && @@ -7699,7 +7913,7 @@ void md_check_recovery(struct mddev *mddev) } if (mddev->safemode == 1) mddev->safemode = 0; - spin_unlock_irq(&mddev->write_lock); + spin_unlock(&mddev->lock); if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); } @@ -7721,7 +7935,9 @@ void md_check_recovery(struct mddev *mddev) * any transients in the value of "sync_action". */ mddev->curr_resync_completed = 0; + spin_lock(&mddev->lock); set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); + spin_unlock(&mddev->lock); /* Clear some bits that don't mean anything, but * might be left set */ diff --git a/drivers/md/md.h b/drivers/md/md.h index 03cec5bdcaae..318ca8fd430f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -386,7 +386,18 @@ struct mddev { struct work_struct del_work; /* used for delayed sysfs removal */ - spinlock_t write_lock; + /* "lock" protects: + * flush_bio transition from NULL to !NULL + * rdev superblocks, events + * clearing MD_CHANGE_* + * in_sync - and related safemode and MD_CHANGE changes + * pers (also protected by reconfig_mutex and pending IO). + * clearing ->bitmap + * clearing ->bitmap_info.file + * changing ->resync_{min,max} + * setting MD_RECOVERY_RUNNING (which interacts with resync_{min,max}) + */ + spinlock_t lock; wait_queue_head_t sb_wait; /* for waiting on superblock updates */ atomic_t pending_writes; /* number of active superblock writes */ @@ -439,13 +450,30 @@ struct mddev { void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); }; -static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) +static inline int __must_check mddev_lock(struct mddev *mddev) { - int faulty = test_bit(Faulty, &rdev->flags); - if (atomic_dec_and_test(&rdev->nr_pending) && faulty) - set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + return mutex_lock_interruptible(&mddev->reconfig_mutex); +} + +/* Sometimes we need to take the lock in a situation where + * failure due to interrupts is not acceptable. + */ +static inline void mddev_lock_nointr(struct mddev *mddev) +{ + mutex_lock(&mddev->reconfig_mutex); +} + +static inline int mddev_is_locked(struct mddev *mddev) +{ + return mutex_is_locked(&mddev->reconfig_mutex); } +static inline int mddev_trylock(struct mddev *mddev) +{ + return mutex_trylock(&mddev->reconfig_mutex); +} +extern void mddev_unlock(struct mddev *mddev); + static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) { atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); @@ -459,7 +487,7 @@ struct md_personality struct module *owner; void (*make_request)(struct mddev *mddev, struct bio *bio); int (*run)(struct mddev *mddev); - int (*stop)(struct mddev *mddev); + void (*free)(struct mddev *mddev, void *priv); void (*status)(struct seq_file *seq, struct mddev *mddev); /* error_handler must set ->faulty and clear ->in_sync * if appropriate, and should abort recovery if needed @@ -490,6 +518,13 @@ struct md_personality * array. */ void *(*takeover) (struct mddev *mddev); + /* congested implements bdi.congested_fn(). + * Will not be called while array is 'suspended' */ + int (*congested)(struct mddev *mddev, int bits); + /* mergeable_bvec is use to implement ->merge_bvec_fn */ + int (*mergeable_bvec)(struct mddev *mddev, + struct bvec_merge_data *bvm, + struct bio_vec *biovec); }; struct md_sysfs_entry { @@ -624,4 +659,14 @@ static inline int mddev_check_plugged(struct mddev *mddev) return !!blk_check_plugged(md_unplug, mddev, sizeof(struct blk_plug_cb)); } + +static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) +{ + int faulty = test_bit(Faulty, &rdev->flags); + if (atomic_dec_and_test(&rdev->nr_pending) && faulty) { + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + } +} + #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 399272f9c042..ac3ede2bd00e 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -153,15 +153,11 @@ static void multipath_status (struct seq_file *seq, struct mddev *mddev) seq_printf (seq, "]"); } -static int multipath_congested(void *data, int bits) +static int multipath_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct mpconf *conf = mddev->private; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - rcu_read_lock(); for (i = 0; i < mddev->raid_disks ; i++) { struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); @@ -403,7 +399,7 @@ static int multipath_run (struct mddev *mddev) /* * copy the already verified devices into our private MULTIPATH * bookkeeping area. [whatever we allocate in multipath_run(), - * should be freed in multipath_stop()] + * should be freed in multipath_free()] */ conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); @@ -489,9 +485,6 @@ static int multipath_run (struct mddev *mddev) */ md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); - mddev->queue->backing_dev_info.congested_fn = multipath_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - if (md_integrity_register(mddev)) goto out_free_conf; @@ -507,17 +500,13 @@ out: return -EIO; } -static int multipath_stop (struct mddev *mddev) +static void multipath_free(struct mddev *mddev, void *priv) { - struct mpconf *conf = mddev->private; + struct mpconf *conf = priv; - md_unregister_thread(&mddev->thread); - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ mempool_destroy(conf->pool); kfree(conf->multipaths); kfree(conf); - mddev->private = NULL; - return 0; } static struct md_personality multipath_personality = @@ -527,12 +516,13 @@ static struct md_personality multipath_personality = .owner = THIS_MODULE, .make_request = multipath_make_request, .run = multipath_run, - .stop = multipath_stop, + .free = multipath_free, .status = multipath_status, .error_handler = multipath_error, .hot_add_disk = multipath_add_disk, .hot_remove_disk= multipath_remove_disk, .size = multipath_size, + .congested = multipath_congested, }; static int __init multipath_init (void) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ba6b85de96d2..a13f738a7b39 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -25,17 +25,13 @@ #include "raid0.h" #include "raid5.h" -static int raid0_congested(void *data, int bits) +static int raid0_congested(struct mddev *mddev, int bits) { - struct mddev *mddev = data; struct r0conf *conf = mddev->private; struct md_rdev **devlist = conf->devlist; int raid_disks = conf->strip_zone[0].nb_dev; int i, ret = 0; - if (mddev_congested(mddev, bits)) - return 1; - for (i = 0; i < raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(devlist[i]->bdev); @@ -263,8 +259,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) mdname(mddev), (unsigned long long)smallest->sectors); } - mddev->queue->backing_dev_info.congested_fn = raid0_congested; - mddev->queue->backing_dev_info.congested_data = mddev; /* * now since we have the hard sector sizes, we can make sure @@ -356,17 +350,16 @@ static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, /** * raid0_mergeable_bvec -- tell bio layer if two requests can be merged - * @q: request queue + * @mddev: the md device * @bvm: properties of new bio * @biovec: the request that could be merged to it. * * Return amount of bytes we can accept at this offset */ -static int raid0_mergeable_bvec(struct request_queue *q, +static int raid0_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct r0conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); sector_t sector_offset = sector; @@ -422,7 +415,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks return array_sectors; } -static int raid0_stop(struct mddev *mddev); +static void raid0_free(struct mddev *mddev, void *priv); static int raid0_run(struct mddev *mddev) { @@ -471,26 +464,22 @@ static int raid0_run(struct mddev *mddev) mddev->queue->backing_dev_info.ra_pages = 2* stripe; } - blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); dump_zones(mddev); ret = md_integrity_register(mddev); if (ret) - raid0_stop(mddev); + raid0_free(mddev, conf); return ret; } -static int raid0_stop(struct mddev *mddev) +static void raid0_free(struct mddev *mddev, void *priv) { - struct r0conf *conf = mddev->private; + struct r0conf *conf = priv; - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ kfree(conf->strip_zone); kfree(conf->devlist); kfree(conf); - mddev->private = NULL; - return 0; } /* @@ -724,11 +713,13 @@ static struct md_personality raid0_personality= .owner = THIS_MODULE, .make_request = raid0_make_request, .run = raid0_run, - .stop = raid0_stop, + .free = raid0_free, .status = raid0_status, .size = raid0_size, .takeover = raid0_takeover, .quiesce = raid0_quiesce, + .congested = raid0_congested, + .mergeable_bvec = raid0_mergeable_bvec, }; static int __init raid0_init (void) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 40b35be34f8d..5dd0c2e59ab9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -701,11 +701,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect return best_disk; } -static int raid1_mergeable_bvec(struct request_queue *q, +static int raid1_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct r1conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max = biovec->bv_len; @@ -734,7 +733,7 @@ static int raid1_mergeable_bvec(struct request_queue *q, } -int md_raid1_congested(struct mddev *mddev, int bits) +static int raid1_congested(struct mddev *mddev, int bits) { struct r1conf *conf = mddev->private; int i, ret = 0; @@ -763,15 +762,6 @@ int md_raid1_congested(struct mddev *mddev, int bits) rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(md_raid1_congested); - -static int raid1_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid1_congested(mddev, bits); -} static void flush_pending_writes(struct r1conf *conf) { @@ -2882,7 +2872,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) return ERR_PTR(err); } -static int stop(struct mddev *mddev); +static void raid1_free(struct mddev *mddev, void *priv); static int run(struct mddev *mddev) { struct r1conf *conf; @@ -2904,7 +2894,7 @@ static int run(struct mddev *mddev) /* * copy the already verified devices into our private RAID1 * bookkeeping area. [whatever we allocate in run(), - * should be freed in stop()] + * should be freed in raid1_free()] */ if (mddev->private == NULL) conf = setup_conf(mddev); @@ -2955,10 +2945,6 @@ static int run(struct mddev *mddev) md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); if (mddev->queue) { - mddev->queue->backing_dev_info.congested_fn = raid1_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); - if (discard_supported) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); @@ -2968,37 +2954,23 @@ static int run(struct mddev *mddev) } ret = md_integrity_register(mddev); - if (ret) - stop(mddev); + if (ret) { + md_unregister_thread(&mddev->thread); + raid1_free(mddev, conf); + } return ret; } -static int stop(struct mddev *mddev) +static void raid1_free(struct mddev *mddev, void *priv) { - struct r1conf *conf = mddev->private; - struct bitmap *bitmap = mddev->bitmap; + struct r1conf *conf = priv; - /* wait for behind writes to complete */ - if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", - mdname(mddev)); - /* need to kick something here to make sure I/O goes? */ - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); - } - - freeze_array(conf, 0); - unfreeze_array(conf); - - md_unregister_thread(&mddev->thread); if (conf->r1bio_pool) mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); kfree(conf); - mddev->private = NULL; - return 0; } static int raid1_resize(struct mddev *mddev, sector_t sectors) @@ -3181,7 +3153,7 @@ static struct md_personality raid1_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid1_free, .status = status, .error_handler = error, .hot_add_disk = raid1_add_disk, @@ -3193,6 +3165,8 @@ static struct md_personality raid1_personality = .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, .takeover = raid1_takeover, + .congested = raid1_congested, + .mergeable_bvec = raid1_mergeable_bvec, }; static int __init raid_init(void) diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 33bda55ef9f7..14ebb288c1ef 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -170,7 +170,4 @@ struct r1bio { */ #define R1BIO_MadeGood 7 #define R1BIO_WriteError 8 - -extern int md_raid1_congested(struct mddev *mddev, int bits); - #endif diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 32e282f4c83c..b8d76b1fba64 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -674,7 +674,7 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) /** * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged - * @q: request queue + * @mddev: the md device * @bvm: properties of new bio * @biovec: the request that could be merged to it. * @@ -682,11 +682,10 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) * This requires checking for end-of-chunk if near_copies != raid_disks, * and for subordinate merge_bvec_fns if merge_check_needed. */ -static int raid10_mergeable_bvec(struct request_queue *q, +static int raid10_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; struct r10conf *conf = mddev->private; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; @@ -910,7 +909,7 @@ retry: return rdev; } -int md_raid10_congested(struct mddev *mddev, int bits) +static int raid10_congested(struct mddev *mddev, int bits) { struct r10conf *conf = mddev->private; int i, ret = 0; @@ -934,15 +933,6 @@ int md_raid10_congested(struct mddev *mddev, int bits) rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(md_raid10_congested); - -static int raid10_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid10_congested(mddev, bits); -} static void flush_pending_writes(struct r10conf *conf) { @@ -3757,8 +3747,6 @@ static int run(struct mddev *mddev) if (mddev->queue) { int stripe = conf->geo.raid_disks * ((mddev->chunk_sectors << 9) / PAGE_SIZE); - mddev->queue->backing_dev_info.congested_fn = raid10_congested; - mddev->queue->backing_dev_info.congested_data = mddev; /* Calculate max read-ahead size. * We need to readahead at least twice a whole stripe.... @@ -3767,7 +3755,6 @@ static int run(struct mddev *mddev) stripe /= conf->geo.near_copies; if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); } if (md_integrity_register(mddev)) @@ -3811,17 +3798,9 @@ out: return -EIO; } -static int stop(struct mddev *mddev) +static void raid10_free(struct mddev *mddev, void *priv) { - struct r10conf *conf = mddev->private; - - raise_barrier(conf, 0); - lower_barrier(conf); - - md_unregister_thread(&mddev->thread); - if (mddev->queue) - /* the unplug fn references 'conf'*/ - blk_sync_queue(mddev->queue); + struct r10conf *conf = priv; if (conf->r10bio_pool) mempool_destroy(conf->r10bio_pool); @@ -3830,8 +3809,6 @@ static int stop(struct mddev *mddev) kfree(conf->mirrors_old); kfree(conf->mirrors_new); kfree(conf); - mddev->private = NULL; - return 0; } static void raid10_quiesce(struct mddev *mddev, int state) @@ -3895,7 +3872,7 @@ static int raid10_resize(struct mddev *mddev, sector_t sectors) return 0; } -static void *raid10_takeover_raid0(struct mddev *mddev) +static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs) { struct md_rdev *rdev; struct r10conf *conf; @@ -3905,6 +3882,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev) mdname(mddev)); return ERR_PTR(-EINVAL); } + sector_div(size, devs); /* Set new parameters */ mddev->new_level = 10; @@ -3915,12 +3893,15 @@ static void *raid10_takeover_raid0(struct mddev *mddev) mddev->raid_disks *= 2; /* make sure it will be not marked as dirty */ mddev->recovery_cp = MaxSector; + mddev->dev_sectors = size; conf = setup_conf(mddev); if (!IS_ERR(conf)) { rdev_for_each(rdev, mddev) - if (rdev->raid_disk >= 0) + if (rdev->raid_disk >= 0) { rdev->new_raid_disk = rdev->raid_disk * 2; + rdev->sectors = size; + } conf->barrier = 1; } @@ -3943,7 +3924,9 @@ static void *raid10_takeover(struct mddev *mddev) mdname(mddev)); return ERR_PTR(-EINVAL); } - return raid10_takeover_raid0(mddev); + return raid10_takeover_raid0(mddev, + raid0_conf->strip_zone->zone_end, + raid0_conf->strip_zone->nb_dev); } return ERR_PTR(-EINVAL); } @@ -4713,7 +4696,7 @@ static struct md_personality raid10_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid10_free, .status = status, .error_handler = error, .hot_add_disk = raid10_add_disk, @@ -4727,6 +4710,8 @@ static struct md_personality raid10_personality = .check_reshape = raid10_check_reshape, .start_reshape = raid10_start_reshape, .finish_reshape = raid10_finish_reshape, + .congested = raid10_congested, + .mergeable_bvec = raid10_mergeable_bvec, }; static int __init raid_init(void) diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 157d69e83ff4..5ee6473ddc2c 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -150,7 +150,4 @@ enum r10bio_state { */ R10BIO_Previous, }; - -extern int md_raid10_congested(struct mddev *mddev, int bits); - #endif diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c1b0d52bfcb0..aa76865b804b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -296,12 +296,9 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state) && - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) list_add_tail(&sh->lru, &conf->delayed_list); - if (atomic_read(&conf->preread_active_stripes) - < IO_THRESHOLD) - md_wakeup_thread(conf->mddev->thread); - } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { @@ -2898,31 +2895,102 @@ static int want_replace(struct stripe_head *sh, int disk_idx) * Returns 1 when no more member devices need to be checked, otherwise returns * 0 to tell the loop in handle_stripe_fill to continue */ -static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, - int disk_idx, int disks) + +static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *fdev[2] = { &sh->dev[s->failed_num[0]], &sh->dev[s->failed_num[1]] }; + int i; + + + if (test_bit(R5_LOCKED, &dev->flags) || + test_bit(R5_UPTODATE, &dev->flags)) + /* No point reading this as we already have it or have + * decided to get it. + */ + return 0; + + if (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags))) + /* We need this block to directly satisfy a request */ + return 1; + + if (s->syncing || s->expanding || + (s->replacing && want_replace(sh, disk_idx))) + /* When syncing, or expanding we read everything. + * When replacing, we need the replaced block. + */ + return 1; + + if ((s->failed >= 1 && fdev[0]->toread) || + (s->failed >= 2 && fdev[1]->toread)) + /* If we want to read from a failed device, then + * we need to actually read every other device. + */ + return 1; + + /* Sometimes neither read-modify-write nor reconstruct-write + * cycles can work. In those cases we read every block we + * can. Then the parity-update is certain to have enough to + * work with. + * This can only be a problem when we need to write something, + * and some device has failed. If either of those tests + * fail we need look no further. + */ + if (!s->failed || !s->to_write) + return 0; + + if (test_bit(R5_Insync, &dev->flags) && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + /* Pre-reads at not permitted until after short delay + * to gather multiple requests. However if this + * device is no Insync, the block could only be be computed + * and there is no need to delay that. + */ + return 0; + + for (i = 0; i < s->failed; i++) { + if (fdev[i]->towrite && + !test_bit(R5_UPTODATE, &fdev[i]->flags) && + !test_bit(R5_OVERWRITE, &fdev[i]->flags)) + /* If we have a partial write to a failed + * device, then we will need to reconstruct + * the content of that device, so all other + * devices must be read. + */ + return 1; + } + + /* If we are forced to do a reconstruct-write, either because + * the current RAID6 implementation only supports that, or + * or because parity cannot be trusted and we are currently + * recovering it, there is extra need to be careful. + * If one of the devices that we would need to read, because + * it is not being overwritten (and maybe not written at all) + * is missing/faulty, then we need to read everything we can. + */ + if (sh->raid_conf->level != 6 && + sh->sector < sh->raid_conf->mddev->recovery_cp) + /* reconstruct-write isn't being forced */ + return 0; + for (i = 0; i < s->failed; i++) { + if (!test_bit(R5_UPTODATE, &fdev[i]->flags) && + !test_bit(R5_OVERWRITE, &fdev[i]->flags)) + return 1; + } + + return 0; +} + +static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) +{ + struct r5dev *dev = &sh->dev[disk_idx]; /* is the data in this block needed, and can we get it? */ - if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || - (s->replacing && want_replace(sh, disk_idx)) || - (s->failed >= 1 && fdev[0]->toread) || - (s->failed >= 2 && fdev[1]->toread) || - (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && - (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && - !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || - ((sh->raid_conf->level == 6 || - sh->sector >= sh->raid_conf->mddev->recovery_cp) - && s->failed && s->to_write && - (s->to_write - s->non_overwrite < - sh->raid_conf->raid_disks - sh->raid_conf->max_degraded) && - (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { + if (need_this_block(sh, s, disk_idx, disks)) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ @@ -3195,6 +3263,11 @@ static void handle_stripe_dirtying(struct r5conf *conf, (unsigned long long)sh->sector, rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); } + + if (rcw > disks && rmw > disks && + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + set_bit(STRIPE_DELAYED, &sh->state); + /* now if nothing is locked, and if we have enough data, * we can start a write request */ @@ -4076,7 +4149,7 @@ static void activate_bit_delay(struct r5conf *conf, } } -int md_raid5_congested(struct mddev *mddev, int bits) +static int raid5_congested(struct mddev *mddev, int bits) { struct r5conf *conf = mddev->private; @@ -4093,24 +4166,14 @@ int md_raid5_congested(struct mddev *mddev, int bits) return 0; } -EXPORT_SYMBOL_GPL(md_raid5_congested); - -static int raid5_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid5_congested(mddev, bits); -} /* We want read requests to align with chunks where possible, * but write requests don't need to. */ -static int raid5_mergeable_bvec(struct request_queue *q, +static int raid5_mergeable_bvec(struct mddev *mddev, struct bvec_merge_data *bvm, struct bio_vec *biovec) { - struct mddev *mddev = q->queuedata; sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_sectors; @@ -5291,11 +5354,14 @@ static void raid5d(struct md_thread *thread) static ssize_t raid5_show_stripe_cache_size(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->max_nr_stripes); - else - return 0; + ret = sprintf(page, "%d\n", conf->max_nr_stripes); + spin_unlock(&mddev->lock); + return ret; } int @@ -5334,21 +5400,25 @@ EXPORT_SYMBOL(raid5_set_cache_size); static ssize_t raid5_store_stripe_cache_size(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; int err; if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - err = raid5_set_cache_size(mddev, new); + err = mddev_lock(mddev); if (err) return err; - return len; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else + err = raid5_set_cache_size(mddev, new); + mddev_unlock(mddev); + + return err ?: len; } static struct md_sysfs_entry @@ -5359,29 +5429,40 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, static ssize_t raid5_show_preread_threshold(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->bypass_threshold); - else - return 0; + ret = sprintf(page, "%d\n", conf->bypass_threshold); + spin_unlock(&mddev->lock); + return ret; } static ssize_t raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - if (new > conf->max_nr_stripes) - return -EINVAL; - conf->bypass_threshold = new; - return len; + + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new > conf->max_nr_stripes) + err = -EINVAL; + else + conf->bypass_threshold = new; + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry @@ -5393,39 +5474,48 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, static ssize_t raid5_show_skip_copy(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->skip_copy); - else - return 0; + ret = sprintf(page, "%d\n", conf->skip_copy); + spin_unlock(&mddev->lock); + return ret; } static ssize_t raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; new = !!new; - if (new == conf->skip_copy) - return len; - mddev_suspend(mddev); - conf->skip_copy = new; - if (new) - mddev->queue->backing_dev_info.capabilities |= - BDI_CAP_STABLE_WRITES; - else - mddev->queue->backing_dev_info.capabilities &= - ~BDI_CAP_STABLE_WRITES; - mddev_resume(mddev); - return len; + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new != conf->skip_copy) { + mddev_suspend(mddev); + conf->skip_copy = new; + if (new) + mddev->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + mddev->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; + mddev_resume(mddev); + } + mddev_unlock(mddev); + return err ?: len; } static struct md_sysfs_entry @@ -5449,11 +5539,14 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active); static ssize_t raid5_show_group_thread_cnt(struct mddev *mddev, char *page) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; + int ret = 0; + spin_lock(&mddev->lock); + conf = mddev->private; if (conf) - return sprintf(page, "%d\n", conf->worker_cnt_per_group); - else - return 0; + ret = sprintf(page, "%d\n", conf->worker_cnt_per_group); + spin_unlock(&mddev->lock); + return ret; } static int alloc_thread_groups(struct r5conf *conf, int cnt, @@ -5463,7 +5556,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt, static ssize_t raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) { - struct r5conf *conf = mddev->private; + struct r5conf *conf; unsigned long new; int err; struct r5worker_group *new_groups, *old_groups; @@ -5471,41 +5564,41 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) if (len >= PAGE_SIZE) return -EINVAL; - if (!conf) - return -ENODEV; - if (kstrtoul(page, 10, &new)) return -EINVAL; - if (new == conf->worker_cnt_per_group) - return len; - - mddev_suspend(mddev); + err = mddev_lock(mddev); + if (err) + return err; + conf = mddev->private; + if (!conf) + err = -ENODEV; + else if (new != conf->worker_cnt_per_group) { + mddev_suspend(mddev); - old_groups = conf->worker_groups; - if (old_groups) - flush_workqueue(raid5_wq); + old_groups = conf->worker_groups; + if (old_groups) + flush_workqueue(raid5_wq); - err = alloc_thread_groups(conf, new, - &group_cnt, &worker_cnt_per_group, - &new_groups); - if (!err) { - spin_lock_irq(&conf->device_lock); - conf->group_cnt = group_cnt; - conf->worker_cnt_per_group = worker_cnt_per_group; - conf->worker_groups = new_groups; - spin_unlock_irq(&conf->device_lock); + err = alloc_thread_groups(conf, new, + &group_cnt, &worker_cnt_per_group, + &new_groups); + if (!err) { + spin_lock_irq(&conf->device_lock); + conf->group_cnt = group_cnt; + conf->worker_cnt_per_group = worker_cnt_per_group; + conf->worker_groups = new_groups; + spin_unlock_irq(&conf->device_lock); - if (old_groups) - kfree(old_groups[0].workers); - kfree(old_groups); + if (old_groups) + kfree(old_groups[0].workers); + kfree(old_groups); + } + mddev_resume(mddev); } + mddev_unlock(mddev); - mddev_resume(mddev); - - if (err) - return err; - return len; + return err ?: len; } static struct md_sysfs_entry @@ -6173,11 +6266,6 @@ static int run(struct mddev *mddev) if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) mddev->queue->backing_dev_info.ra_pages = 2 * stripe; - blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); - - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = raid5_congested; - chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); blk_queue_io_opt(mddev->queue, chunk_size * @@ -6255,17 +6343,12 @@ abort: return -EIO; } -static int stop(struct mddev *mddev) +static void raid5_free(struct mddev *mddev, void *priv) { - struct r5conf *conf = mddev->private; + struct r5conf *conf = priv; - md_unregister_thread(&mddev->thread); - if (mddev->queue) - mddev->queue->backing_dev_info.congested_fn = NULL; free_conf(conf); - mddev->private = NULL; mddev->to_remove = &raid5_attrs_group; - return 0; } static void status(struct seq_file *seq, struct mddev *mddev) @@ -7039,7 +7122,7 @@ static struct md_personality raid6_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7053,6 +7136,8 @@ static struct md_personality raid6_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid6_takeover, + .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid5_personality = { @@ -7061,7 +7146,7 @@ static struct md_personality raid5_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7075,6 +7160,8 @@ static struct md_personality raid5_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid5_takeover, + .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static struct md_personality raid4_personality = @@ -7084,7 +7171,7 @@ static struct md_personality raid4_personality = .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid5_free, .status = status, .error_handler = error, .hot_add_disk = raid5_add_disk, @@ -7098,6 +7185,8 @@ static struct md_personality raid4_personality = .finish_reshape = raid5_finish_reshape, .quiesce = raid5_quiesce, .takeover = raid4_takeover, + .congested = raid5_congested, + .mergeable_bvec = raid5_mergeable_bvec, }; static int __init raid5_init(void) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index d59f5ca743cd..983e18a83db1 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -558,7 +558,6 @@ static inline int algorithm_is_DDF(int layout) return layout >= 8 && layout <= 10; } -extern int md_raid5_congested(struct mddev *mddev, int bits); extern void md_raid5_kick_device(struct r5conf *conf); extern int raid5_set_cache_size(struct mddev *mddev, int size); #endif |