diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-11-23 12:11:13 -1000 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-11-23 12:11:13 -1000 |
commit | 1d838d70fb867b155ee380b37d556893a7313ed2 (patch) | |
tree | bf7d99d096864151791ef70066448d0cd51ecd23 /drivers/md | |
parent | a8946afe5a3519bf4fd84d87f94c92d49f2f13d8 (diff) | |
parent | 884162df2aadd7414bef4935e1a54976fd4e3988 (diff) | |
download | lwn-1d838d70fb867b155ee380b37d556893a7313ed2.tar.gz lwn-1d838d70fb867b155ee380b37d556893a7313ed2.zip |
Merge tag 'md-3.7-fixes' of git://neil.brown.name/md
Pull md fixes from NeilBrown:
"Several bug fixes for md in 3.7:
- raid5 discard has problems
- raid10 replacement devices have problems
- bad block lock seqlock usage has problems
- dm-raid doesn't free everything"
* tag 'md-3.7-fixes' of git://neil.brown.name/md:
md/raid10: decrement correct pending counter when writing to replacement.
md/raid10: close race that lose writes lost when replacement completes.
md/raid5: Make sure we clear R5_Discard when discard is finished.
md/raid5: move resolving of reconstruct_state earlier in stripe_handle.
md/raid5: round discard alignment up to power of 2.
md: make sure everything is freed when dm-raid stops an array.
md: Avoid write invalid address if read_seqretry returned true.
md: Reassigned the parameters if read_seqretry returned true in func md_is_badblock.
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/md.c | 27 | ||||
-rw-r--r-- | drivers/md/raid10.c | 131 | ||||
-rw-r--r-- | drivers/md/raid5.c | 79 |
3 files changed, 132 insertions, 105 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 9ab768acfb62..61200717687b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1817,10 +1817,10 @@ retry: memset(bbp, 0xff, PAGE_SIZE); for (i = 0 ; i < bb->count ; i++) { - u64 internal_bb = *p++; + u64 internal_bb = p[i]; u64 store_bb = ((BB_OFFSET(internal_bb) << 10) | BB_LEN(internal_bb)); - *bbp++ = cpu_to_le64(store_bb); + bbp[i] = cpu_to_le64(store_bb); } bb->changed = 0; if (read_seqretry(&bb->lock, seq)) @@ -5294,7 +5294,7 @@ void md_stop_writes(struct mddev *mddev) } EXPORT_SYMBOL_GPL(md_stop_writes); -void md_stop(struct mddev *mddev) +static void __md_stop(struct mddev *mddev) { mddev->ready = 0; mddev->pers->stop(mddev); @@ -5304,6 +5304,18 @@ void md_stop(struct mddev *mddev) mddev->pers = NULL; clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } + +void md_stop(struct mddev *mddev) +{ + /* stop the array and free an attached data structures. + * This is called from dm-raid + */ + __md_stop(mddev); + bitmap_destroy(mddev); + if (mddev->bio_set) + bioset_free(mddev->bio_set); +} + EXPORT_SYMBOL_GPL(md_stop); static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) @@ -5364,7 +5376,7 @@ static int do_md_stop(struct mddev * mddev, int mode, set_disk_ro(disk, 0); __md_stop_writes(mddev); - md_stop(mddev); + __md_stop(mddev); mddev->queue->merge_bvec_fn = NULL; mddev->queue->backing_dev_info.congested_fn = NULL; @@ -7936,9 +7948,9 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, sector_t *first_bad, int *bad_sectors) { int hi; - int lo = 0; + int lo; u64 *p = bb->page; - int rv = 0; + int rv; sector_t target = s + sectors; unsigned seq; @@ -7953,7 +7965,8 @@ int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, retry: seq = read_seqbegin(&bb->lock); - + lo = 0; + rv = 0; hi = bb->count; /* Binary search between lo and hi for 'target' diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index d1295aff4173..0d5d0ff2c0f7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -499,7 +499,7 @@ static void raid10_end_write_request(struct bio *bio, int error) */ one_write_done(r10_bio); if (dec_rdev) - rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev); + rdev_dec_pending(rdev, conf->mddev); } /* @@ -1334,18 +1334,21 @@ retry_write: blocked_rdev = rrdev; break; } + if (rdev && (test_bit(Faulty, &rdev->flags) + || test_bit(Unmerged, &rdev->flags))) + rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags) || test_bit(Unmerged, &rrdev->flags))) rrdev = NULL; r10_bio->devs[i].bio = NULL; r10_bio->devs[i].repl_bio = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags) || - test_bit(Unmerged, &rdev->flags)) { + + if (!rdev && !rrdev) { set_bit(R10BIO_Degraded, &r10_bio->state); continue; } - if (test_bit(WriteErrorSeen, &rdev->flags)) { + if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; int bad_sectors; @@ -1387,8 +1390,10 @@ retry_write: max_sectors = good_sectors; } } - r10_bio->devs[i].bio = bio; - atomic_inc(&rdev->nr_pending); + if (rdev) { + r10_bio->devs[i].bio = bio; + atomic_inc(&rdev->nr_pending); + } if (rrdev) { r10_bio->devs[i].repl_bio = bio; atomic_inc(&rrdev->nr_pending); @@ -1444,69 +1449,71 @@ retry_write: for (i = 0; i < conf->copies; i++) { struct bio *mbio; int d = r10_bio->devs[i].devnum; - if (!r10_bio->devs[i].bio) - continue; + if (r10_bio->devs[i].bio) { + struct md_rdev *rdev = conf->mirrors[d].rdev; + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].bio = mbio; + + mbio->bi_sector = (r10_bio->devs[i].addr+ + choose_data_offset(r10_bio, + rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_private = r10_bio; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[i].bio = mbio; + atomic_inc(&r10_bio->remaining); - mbio->bi_sector = (r10_bio->devs[i].addr+ - choose_data_offset(r10_bio, - conf->mirrors[d].rdev)); - mbio->bi_bdev = conf->mirrors[d].rdev->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; - mbio->bi_private = r10_bio; + cb = blk_check_plugged(raid10_unplug, mddev, + sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, + cb); + else + plug = NULL; + spin_lock_irqsave(&conf->device_lock, flags); + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!plug) + md_wakeup_thread(mddev->thread); + } - atomic_inc(&r10_bio->remaining); + if (r10_bio->devs[i].repl_bio) { + struct md_rdev *rdev = conf->mirrors[d].replacement; + if (rdev == NULL) { + /* Replacement just got moved to main 'rdev' */ + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, + max_sectors); + r10_bio->devs[i].repl_bio = mbio; + + mbio->bi_sector = (r10_bio->devs[i].addr + + choose_data_offset( + r10_bio, rdev)); + mbio->bi_bdev = rdev->bdev; + mbio->bi_end_io = raid10_end_write_request; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; + mbio->bi_private = r10_bio; - cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); - if (cb) - plug = container_of(cb, struct raid10_plug_cb, cb); - else - plug = NULL; - spin_lock_irqsave(&conf->device_lock, flags); - if (plug) { - bio_list_add(&plug->pending, mbio); - plug->pending_cnt++; - } else { + atomic_inc(&r10_bio->remaining); + spin_lock_irqsave(&conf->device_lock, flags); bio_list_add(&conf->pending_bio_list, mbio); conf->pending_count++; + spin_unlock_irqrestore(&conf->device_lock, flags); + if (!mddev_check_plugged(mddev)) + md_wakeup_thread(mddev->thread); } - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!plug) - md_wakeup_thread(mddev->thread); - - if (!r10_bio->devs[i].repl_bio) - continue; - - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, - max_sectors); - r10_bio->devs[i].repl_bio = mbio; - - /* We are actively writing to the original device - * so it cannot disappear, so the replacement cannot - * become NULL here - */ - mbio->bi_sector = (r10_bio->devs[i].addr + - choose_data_offset( - r10_bio, - conf->mirrors[d].replacement)); - mbio->bi_bdev = conf->mirrors[d].replacement->bdev; - mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; - mbio->bi_private = r10_bio; - - atomic_inc(&r10_bio->remaining); - spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; - spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) - md_wakeup_thread(mddev->thread); } /* Don't remove the bias on 'remaining' (one_write_done) until diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c5439dce0295..a4502686e7a8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2774,10 +2774,12 @@ static void handle_stripe_clean_event(struct r5conf *conf, dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && (test_bit(R5_UPTODATE, &dev->flags) || - test_and_clear_bit(R5_Discard, &dev->flags))) { + test_bit(R5_Discard, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; pr_debug("Return write for disc %d\n", i); + if (test_and_clear_bit(R5_Discard, &dev->flags)) + clear_bit(R5_UPTODATE, &dev->flags); wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_sector < @@ -2795,7 +2797,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, !test_bit(STRIPE_DEGRADED, &sh->state), 0); } - } + } else if (test_bit(R5_Discard, &sh->dev[i].flags)) + clear_bit(R5_Discard, &sh->dev[i].flags); if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) if (atomic_dec_and_test(&conf->pending_full_writes)) @@ -3490,40 +3493,6 @@ static void handle_stripe(struct stripe_head *sh) handle_failed_sync(conf, sh, &s); } - /* - * might be able to return some write requests if the parity blocks - * are safe, or on a failed drive - */ - pdev = &sh->dev[sh->pd_idx]; - s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) - || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); - qdev = &sh->dev[sh->qd_idx]; - s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) - || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) - || conf->level < 6; - - if (s.written && - (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) - && !test_bit(R5_LOCKED, &pdev->flags) - && (test_bit(R5_UPTODATE, &pdev->flags) || - test_bit(R5_Discard, &pdev->flags))))) && - (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) - && !test_bit(R5_LOCKED, &qdev->flags) - && (test_bit(R5_UPTODATE, &qdev->flags) || - test_bit(R5_Discard, &qdev->flags)))))) - handle_stripe_clean_event(conf, sh, disks, &s.return_bi); - - /* Now we might consider reading some blocks, either to check/generate - * parity, or to satisfy requests - * or to load a block that is being partially written. - */ - if (s.to_read || s.non_overwrite - || (conf->level == 6 && s.to_write && s.failed) - || (s.syncing && (s.uptodate + s.compute < disks)) - || s.replacing - || s.expanding) - handle_stripe_fill(sh, &s, disks); - /* Now we check to see if any write operations have recently * completed */ @@ -3561,6 +3530,40 @@ static void handle_stripe(struct stripe_head *sh) s.dec_preread_active = 1; } + /* + * might be able to return some write requests if the parity blocks + * are safe, or on a failed drive + */ + pdev = &sh->dev[sh->pd_idx]; + s.p_failed = (s.failed >= 1 && s.failed_num[0] == sh->pd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->pd_idx); + qdev = &sh->dev[sh->qd_idx]; + s.q_failed = (s.failed >= 1 && s.failed_num[0] == sh->qd_idx) + || (s.failed >= 2 && s.failed_num[1] == sh->qd_idx) + || conf->level < 6; + + if (s.written && + (s.p_failed || ((test_bit(R5_Insync, &pdev->flags) + && !test_bit(R5_LOCKED, &pdev->flags) + && (test_bit(R5_UPTODATE, &pdev->flags) || + test_bit(R5_Discard, &pdev->flags))))) && + (s.q_failed || ((test_bit(R5_Insync, &qdev->flags) + && !test_bit(R5_LOCKED, &qdev->flags) + && (test_bit(R5_UPTODATE, &qdev->flags) || + test_bit(R5_Discard, &qdev->flags)))))) + handle_stripe_clean_event(conf, sh, disks, &s.return_bi); + + /* Now we might consider reading some blocks, either to check/generate + * parity, or to satisfy requests + * or to load a block that is being partially written. + */ + if (s.to_read || s.non_overwrite + || (conf->level == 6 && s.to_write && s.failed) + || (s.syncing && (s.uptodate + s.compute < disks)) + || s.replacing + || s.expanding) + handle_stripe_fill(sh, &s, disks); + /* Now to consider new write requests and what else, if anything * should be read. We do not handle new writes when: * 1/ A 'write' operation (copy+xor) is already in flight. @@ -5529,6 +5532,10 @@ static int run(struct mddev *mddev) * discard data disk but write parity disk */ stripe = stripe * PAGE_SIZE; + /* Round up to power of 2, as discard handling + * currently assumes that */ + while ((stripe-1) & stripe) + stripe = (stripe | (stripe-1)) + 1; mddev->queue->limits.discard_alignment = stripe; mddev->queue->limits.discard_granularity = stripe; /* |