From 5ef56c8fecedf403a346d02140e52a072d693d6b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 25 Aug 2011 14:42:51 +1000
Subject: md: report failure if a 'set faulty' request doesn't.

Sometimes a device will refuse to be set faulty.  e.g. RAID1 will
never let the last working device become faulty.

So check if "md_error()" did manage to set the faulty flag and fail
with EBUSY if it didn't.

Resolves-Debian-Bug: http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=601198
Reported-by: Mike Hommey <mh+reportbug@glandium.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8e221a20f5d9..1cd9bfb45e9a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2561,7 +2561,10 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 	int err = -EINVAL;
 	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
 		md_error(rdev->mddev, rdev);
-		err = 0;
+		if (test_bit(Faulty, &rdev->flags))
+			err = 0;
+		else
+			err = -EBUSY;
 	} else if (cmd_match(buf, "remove")) {
 		if (rdev->raid_disk >= 0)
 			err = -EBUSY;
@@ -5983,6 +5986,8 @@ static int set_disk_faulty(mddev_t *mddev, dev_t dev)
 		return -ENODEV;
 
 	md_error(mddev, rdev);
+	if (!test_bit(Faulty, &rdev->flags))
+		return -EBUSY;
 	return 0;
 }
 
-- 
cgit v1.2.3


From aeb9b211849621f592288ed5ad694de9eeaae87a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 25 Aug 2011 14:43:08 +1000
Subject: md: ensure changes to 'write-mostly' are reflected in metadata.

The 'write-mostly' flag can be changed through sysfs.
With 0.90 metadata, those changes are reflected in the metadata.
For 1.x metadata, they aren't.

So fix super_1_sync to record 'write-mostly' status.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1cd9bfb45e9a..9a880239219d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1738,6 +1738,11 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 	sb->level = cpu_to_le32(mddev->level);
 	sb->layout = cpu_to_le32(mddev->layout);
 
+	if (test_bit(WriteMostly, &rdev->flags))
+		sb->devflags |= WriteMostly1;
+	else
+		sb->devflags &= ~WriteMostly1;
+
 	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
 		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
 		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
-- 
cgit v1.2.3


From a5bf4df0c88b88d34b6f0e3bc8a402dac7d14611 Mon Sep 17 00:00:00 2001
From: Namhyung Kim <namhyung@gmail.com>
Date: Thu, 25 Aug 2011 14:43:34 +1000
Subject: md: use REQ_NOIDLE flag in md_super_write()

Queue idling is used for the anticipation of immediate
sequencial I/O's but md_super_write() is a kind of one-
shot operation, coupled with md_super_wait(), so the
idling in this case will be just a waste of time.

Specifying REQ_NOIDLE prevents it. Instead of adding
the flag to submit_bio() directly, use pre-defined
macro WRITE_FLUSH_FUA.

Signed-off-by: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9a880239219d..aca611711264 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -848,7 +848,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	bio->bi_end_io = super_written;
 
 	atomic_inc(&mddev->pending_writes);
-	submit_bio(REQ_WRITE | REQ_SYNC | REQ_FLUSH | REQ_FUA, bio);
+	submit_bio(WRITE_FLUSH_FUA, bio);
 }
 
 void md_super_wait(mddev_t *mddev)
-- 
cgit v1.2.3


From 1b6afa17581027218088a18a9ceda600e0ddba7a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 25 Aug 2011 14:43:53 +1000
Subject: md/linear: avoid corrupting structure while waiting for rcu_free to
 complete.

I don't know what I was thinking putting 'rcu' after a dynamically
sized array!  The array could still be in use when we call rcu_free()
(That is the point) so we mustn't corrupt it.

Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index 0ce29b61605a..2f2da05b2ce9 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -10,9 +10,9 @@ typedef struct dev_info dev_info_t;
 
 struct linear_private_data
 {
+	struct rcu_head		rcu;
 	sector_t		array_sectors;
 	dev_info_t		disks[0];
-	struct rcu_head		rcu;
 };
 
 
-- 
cgit v1.2.3


From 7da64a0abc3b2c6cbd3521672e9bb74dd560bb89 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 30 Aug 2011 16:20:17 +1000
Subject: md: fix clearing of 'blocked' flag in the presence of bad blocks.

When the 'blocked' flag on a device is cleared while there are
unacknowledged bad blocks we must fail the device.  This is needed for
backwards compatability of the interface.

The code currently uses the wrong test for "unacknowledged bad blocks
exist".  Change it to the right test.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index aca611711264..3742ce8b0acf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2592,7 +2592,7 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
 		err = 0;
 	} else if (cmd_match(buf, "-blocked")) {
 		if (!test_bit(Faulty, &rdev->flags) &&
-		    test_bit(BlockedBadBlocks, &rdev->flags)) {
+		    rdev->badblocks.unacked_exist) {
 			/* metadata handler doesn't understand badblocks,
 			 * so we need to fail the device
 			 */
-- 
cgit v1.2.3


From 43220aa0f22cd3ce5b30246d50ccd696d119edea Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 31 Aug 2011 12:49:14 +1000
Subject: md/raid5: fix a hang on device failure.

Waiting for a 'blocked' rdev to become unblocked in the raid5d thread
cannot work with internal metadata as it is the raid5d thread which
will clear the blocked flag.
This wasn't a problem in 3.0 and earlier as we only set the blocked
flag when external metadata was used then.
However we now set it always, so we need to be more careful.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dbae459fb02d..43709fa6b6df 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3336,7 +3336,7 @@ static void handle_stripe(struct stripe_head *sh)
 
 finish:
 	/* wait for this device to become unblocked */
-	if (unlikely(s.blocked_rdev))
+	if (conf->mddev->external && unlikely(s.blocked_rdev))
 		md_wait_for_blocked_rdev(s.blocked_rdev, conf->mddev);
 
 	if (s.handle_bad_blocks)
-- 
cgit v1.2.3


From 19d5f834d6aff7efb1c9353523865c5bce869470 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Sat, 10 Sep 2011 17:21:17 +1000
Subject: md/raid10: unify handling of write completion.

A write can complete at two different places:
1/ when the last member-device write completes, through
   raid10_end_write_request
2/ in make_request() when we remove the initial bias from ->remaining.

These two should do exactly the same thing and the comment says they
do, but they don't.

So factor the correct code out into a function and call it in both
places.  This makes the code much more similar to RAID1.

The difference is only significant if there is an error, and they
usually take a while, so it is unlikely that there will be an error
already when make_request is completing, so this is unlikely to cause
real problems.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8b29cd4f01c8..f6873fc8e5ee 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -337,6 +337,21 @@ static void close_write(r10bio_t *r10_bio)
 	md_write_end(r10_bio->mddev);
 }
 
+static void one_write_done(r10bio_t *r10_bio)
+{
+	if (atomic_dec_and_test(&r10_bio->remaining)) {
+		if (test_bit(R10BIO_WriteError, &r10_bio->state))
+			reschedule_retry(r10_bio);
+		else {
+			close_write(r10_bio);
+			if (test_bit(R10BIO_MadeGood, &r10_bio->state))
+				reschedule_retry(r10_bio);
+			else
+				raid_end_bio_io(r10_bio);
+		}
+	}
+}
+
 static void raid10_end_write_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -387,17 +402,7 @@ static void raid10_end_write_request(struct bio *bio, int error)
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		if (test_bit(R10BIO_WriteError, &r10_bio->state))
-			reschedule_retry(r10_bio);
-		else {
-			close_write(r10_bio);
-			if (test_bit(R10BIO_MadeGood, &r10_bio->state))
-				reschedule_retry(r10_bio);
-			else
-				raid_end_bio_io(r10_bio);
-		}
-	}
+	one_write_done(r10_bio);
 	if (dec_rdev)
 		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
@@ -1127,15 +1132,8 @@ retry_write:
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
-	if (atomic_dec_and_test(&r10_bio->remaining)) {
-		/* This matches the end of raid10_end_write_request() */
-		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
-				r10_bio->sectors,
-				!test_bit(R10BIO_Degraded, &r10_bio->state),
-				0);
-		md_write_end(mddev);
-		raid_end_bio_io(r10_bio);
-	}
+	/* Remove the bias on 'remaining' */
+	one_write_done(r10_bio);
 
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
-- 
cgit v1.2.3


From 079fa166a2874985ae58b2e21e26e1cbc91127d4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Sat, 10 Sep 2011 17:21:23 +1000
Subject: md/raid1,10: Remove use-after-free bug in make_request.

A single request to RAID1 or RAID10 might result in multiple
requests if there are known bad blocks that need to be avoided.

To detect if we need to submit another write request we test:
 	if (sectors_handled < (bio->bi_size >> 9)) {

However this is after we call **_write_done() so the 'bio' no longer
belongs to us - the writes could have completed and the bio freed.

So move the **_write_done call until after the test against
bio->bi_size.

This addresses https://bugzilla.kernel.org/show_bug.cgi?id=41862

Reported-by: Bruno Wolff III <bruno@wolff.to>
Tested-by: Bruno Wolff III <bruno@wolff.to>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid1.c  | 14 +++++++++-----
 drivers/md/raid10.c | 13 ++++++++-----
 2 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 32323f0afd89..f4622dd8fc59 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1099,12 +1099,11 @@ read_again:
 		bio_list_add(&conf->pending_bio_list, mbio);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
-	r1_bio_write_done(r1_bio);
-
-	/* In case raid1d snuck in to freeze_array */
-	wake_up(&conf->wait_barrier);
-
+	/* Mustn't call r1_bio_write_done before this next test,
+	 * as it could result in the bio being freed.
+	 */
 	if (sectors_handled < (bio->bi_size >> 9)) {
+		r1_bio_write_done(r1_bio);
 		/* We need another r1_bio.  It has already been counted
 		 * in bio->bi_phys_segments
 		 */
@@ -1117,6 +1116,11 @@ read_again:
 		goto retry_write;
 	}
 
+	r1_bio_write_done(r1_bio);
+
+	/* In case raid1d snuck in to freeze_array */
+	wake_up(&conf->wait_barrier);
+
 	if (do_sync || !bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f6873fc8e5ee..d7a8468ddeab 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1132,13 +1132,12 @@ retry_write:
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 
-	/* Remove the bias on 'remaining' */
-	one_write_done(r10_bio);
-
-	/* In case raid10d snuck in to freeze_array */
-	wake_up(&conf->wait_barrier);
+	/* Don't remove the bias on 'remaining' (one_write_done) until
+	 * after checking if we need to go around again.
+	 */
 
 	if (sectors_handled < (bio->bi_size >> 9)) {
+		one_write_done(r10_bio);
 		/* We need another r10_bio.  It has already been counted
 		 * in bio->bi_phys_segments.
 		 */
@@ -1152,6 +1151,10 @@ retry_write:
 		r10_bio->state = 0;
 		goto retry_write;
 	}
+	one_write_done(r10_bio);
+
+	/* In case raid10d snuck in to freeze_array */
+	wake_up(&conf->wait_barrier);
 
 	if (do_sync || !mddev->bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
-- 
cgit v1.2.3


From 27a7b260f71439c40546b43588448faac01adb93 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Sat, 10 Sep 2011 17:21:28 +1000
Subject: md: Fix handling for devices from 2TB to 4TB in 0.90 metadata.

0.90 metadata uses an unsigned 32bit number to count the number of
kilobytes used from each device.
This should allow up to 4TB per device.
However we multiply this by 2 (to get sectors) before casting to a
larger type, so sizes above 2TB get truncated.

Also we allow rdev->sectors to be larger than 4TB, so it is possible
for the array to be resized larger than the metadata can handle.
So make sure rdev->sectors never exceeds 4TB when 0.90 metadata is in
used.

Also the sanity check at the end of super_90_load should include level
1 as it used ->size too. (RAID0 and Linear don't use ->size at all).

Reported-by: Pim Zandbergen <P.Zandbergen@macroscoop.nl>
Cc: stable@kernel.org
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3742ce8b0acf..5404b2295820 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1138,8 +1138,11 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 			ret = 0;
 	}
 	rdev->sectors = rdev->sb_start;
+	/* Limit to 4TB as metadata cannot record more than that */
+	if (rdev->sectors >= (2ULL << 32))
+		rdev->sectors = (2ULL << 32) - 2;
 
-	if (rdev->sectors < sb->size * 2 && sb->level > 1)
+	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
 		/* "this cannot possibly happen" ... */
 		ret = -EINVAL;
 
@@ -1173,7 +1176,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->clevel[0] = 0;
 		mddev->layout = sb->layout;
 		mddev->raid_disks = sb->raid_disks;
-		mddev->dev_sectors = sb->size * 2;
+		mddev->dev_sectors = ((sector_t)sb->size) * 2;
 		mddev->events = ev1;
 		mddev->bitmap_info.offset = 0;
 		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
@@ -1415,6 +1418,11 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
 	rdev->sb_start = calc_dev_sboffset(rdev);
 	if (!num_sectors || num_sectors > rdev->sb_start)
 		num_sectors = rdev->sb_start;
+	/* Limit to 4TB as metadata cannot record more than that.
+	 * 4TB == 2^32 KB, or 2*2^32 sectors.
+	 */
+	if (num_sectors >= (2ULL << 32))
+		num_sectors = (2ULL << 32) - 2;
 	md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
 		       rdev->sb_page);
 	md_super_wait(rdev->mddev);
-- 
cgit v1.2.3


From 01f96c0a9922cd9919baf9d16febdf7016177a12 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 21 Sep 2011 15:30:20 +1000
Subject: md: Avoid waking up a thread after it has been freed.

Two related problems:

1/ some error paths call "md_unregister_thread(mddev->thread)"
   without subsequently clearing ->thread.  A subsequent call
   to mddev_unlock will try to wake the thread, and crash.

2/ Most calls to md_wakeup_thread are protected against the thread
   disappeared either by:
      - holding the ->mutex
      - having an active request, so something else must be keeping
        the array active.
   However mddev_unlock calls md_wakeup_thread after dropping the
   mutex and without any certainty of an active request, so the
   ->thread could theoretically disappear.
   So we need a spinlock to provide some protections.

So change md_unregister_thread to take a pointer to the thread
pointer, and ensure that it always does the required locking, and
clears the pointer properly.

Reported-by: "Moshe Melnikov" <moshe@zadarastorage.com>
Signed-off-by: NeilBrown <neilb@suse.de>
cc: stable@kernel.org
---
 drivers/md/md.c        | 22 +++++++++++++++++++---
 drivers/md/md.h        |  2 +-
 drivers/md/multipath.c |  3 +--
 drivers/md/raid1.c     |  3 +--
 drivers/md/raid10.c    |  5 ++---
 drivers/md/raid5.c     |  6 ++----
 6 files changed, 26 insertions(+), 15 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5404b2295820..5c95ccb59500 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -61,6 +61,11 @@
 static void autostart_arrays(int part);
 #endif
 
+/* pers_list is a list of registered personalities protected
+ * by pers_lock.
+ * pers_lock does extra service to protect accesses to
+ * mddev->thread when the mutex cannot be held.
+ */
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
@@ -739,7 +744,12 @@ static void mddev_unlock(mddev_t * mddev)
 	} else
 		mutex_unlock(&mddev->reconfig_mutex);
 
+	/* was we've dropped the mutex we need a spinlock to
+	 * make sur the thread doesn't disappear
+	 */
+	spin_lock(&pers_lock);
 	md_wakeup_thread(mddev->thread);
+	spin_unlock(&pers_lock);
 }
 
 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
@@ -6429,11 +6439,18 @@ mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
 	return thread;
 }
 
-void md_unregister_thread(mdk_thread_t *thread)
+void md_unregister_thread(mdk_thread_t **threadp)
 {
+	mdk_thread_t *thread = *threadp;
 	if (!thread)
 		return;
 	dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
+	/* Locking ensures that mddev_unlock does not wake_up a
+	 * non-existent thread
+	 */
+	spin_lock(&pers_lock);
+	*threadp = NULL;
+	spin_unlock(&pers_lock);
 
 	kthread_stop(thread->tsk);
 	kfree(thread);
@@ -7340,8 +7357,7 @@ static void reap_sync_thread(mddev_t *mddev)
 	mdk_rdev_t *rdev;
 
 	/* resync has finished, collect result */
-	md_unregister_thread(mddev->sync_thread);
-	mddev->sync_thread = NULL;
+	md_unregister_thread(&mddev->sync_thread);
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
 	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
 		/* success...*/
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1e586bb4452e..0a309dc29b45 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -560,7 +560,7 @@ extern int register_md_personality(struct mdk_personality *p);
 extern int unregister_md_personality(struct mdk_personality *p);
 extern mdk_thread_t * md_register_thread(void (*run) (mddev_t *mddev),
 				mddev_t *mddev, const char *name);
-extern void md_unregister_thread(mdk_thread_t *thread);
+extern void md_unregister_thread(mdk_thread_t **threadp);
 extern void md_wakeup_thread(mdk_thread_t *thread);
 extern void md_check_recovery(mddev_t *mddev);
 extern void md_write_start(mddev_t *mddev, struct bio *bi);
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 3535c23af288..d5b5fb300171 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -514,8 +514,7 @@ static int multipath_stop (mddev_t *mddev)
 {
 	multipath_conf_t *conf = mddev->private;
 
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	mempool_destroy(conf->pool);
 	kfree(conf->multipaths);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f4622dd8fc59..d9587dffe533 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2562,8 +2562,7 @@ static int stop(mddev_t *mddev)
 	raise_barrier(conf);
 	lower_barrier(conf);
 
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	if (conf->r1bio_pool)
 		mempool_destroy(conf->r1bio_pool);
 	kfree(conf->mirrors);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d7a8468ddeab..0cd9672cf9cb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2955,7 +2955,7 @@ static int run(mddev_t *mddev)
 	return 0;
 
 out_free_conf:
-	md_unregister_thread(mddev->thread);
+	md_unregister_thread(&mddev->thread);
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
 	safe_put_page(conf->tmppage);
@@ -2973,8 +2973,7 @@ static int stop(mddev_t *mddev)
 	raise_barrier(conf, 0);
 	lower_barrier(conf);
 
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 43709fa6b6df..ac5e8b57e50f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4941,8 +4941,7 @@ static int run(mddev_t *mddev)
 
 	return 0;
 abort:
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	if (conf) {
 		print_raid5_conf(conf);
 		free_conf(conf);
@@ -4956,8 +4955,7 @@ static int stop(mddev_t *mddev)
 {
 	raid5_conf_t *conf = mddev->private;
 
-	md_unregister_thread(mddev->thread);
-	mddev->thread = NULL;
+	md_unregister_thread(&mddev->thread);
 	if (mddev->queue)
 		mddev->queue->backing_dev_info.congested_fn = NULL;
 	free_conf(conf);
-- 
cgit v1.2.3


From 68e58a294fb26f692697179e3f3ecf88dd8cb97c Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Sun, 25 Sep 2011 23:26:15 +0100
Subject: dm: flakey fix corrupt_bio_byte error path

If no arguments were provided to the corrupt_bio_byte feature an error
should be returned immediately.

Reported-by: Zdenek Kabelac <zkabelac@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-flakey.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 89f73ca22cfa..f84c08029b21 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -81,8 +81,10 @@ static int parse_features(struct dm_arg_set *as, struct flakey_c *fc,
 		 * corrupt_bio_byte <Nth_byte> <direction> <value> <bio_flags>
 		 */
 		if (!strcasecmp(arg_name, "corrupt_bio_byte")) {
-			if (!argc)
+			if (!argc) {
 				ti->error = "Feature corrupt_bio_byte requires parameters";
+				return -EINVAL;
+			}
 
 			r = dm_read_arg(_args + 1, as, &fc->corrupt_bio_byte, &ti->error);
 			if (r)
-- 
cgit v1.2.3


From 876fbba1db4a377f050a2bb49b474c7527b2995d Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Sun, 25 Sep 2011 23:26:17 +0100
Subject: dm table: avoid crash if integrity profile changes

Commit a63a5cf (dm: improve block integrity support) introduced a
two-phase initialization of a DM device's integrity profile.  This
patch avoids dereferencing a NULL 'template_disk' pointer in
blk_integrity_register() if there is an integrity profile mismatch in
dm_table_set_integrity().

This can occur if the integrity profiles for stacked devices in a DM
table are changed between the call to dm_table_prealloc_integrity() and
dm_table_set_integrity().

Reported-by: Zdenek Kabelac <zkabelac@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Cc: stable@kernel.org # 2.6.39
---
 drivers/md/dm-table.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 986b8754bb08..322669807077 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1238,14 +1238,15 @@ static void dm_table_set_integrity(struct dm_table *t)
 		return;
 
 	template_disk = dm_table_get_integrity_disk(t, true);
-	if (!template_disk &&
-	    blk_integrity_is_initialized(dm_disk(t->md))) {
+	if (template_disk)
+		blk_integrity_register(dm_disk(t->md),
+				       blk_get_integrity(template_disk));
+	else if (blk_integrity_is_initialized(dm_disk(t->md)))
 		DMWARN("%s: device no longer has a valid integrity profile",
 		       dm_device_name(t->md));
-		return;
-	}
-	blk_integrity_register(dm_disk(t->md),
-			       blk_get_integrity(template_disk));
+	else
+		DMWARN("%s: unable to establish an integrity profile",
+		       dm_device_name(t->md));
 }
 
 static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
-- 
cgit v1.2.3


From 8232480944d173378082ebb2cac8a3207c08cf31 Mon Sep 17 00:00:00 2001
From: Jonthan Brassow <jbrassow@redhat.com>
Date: Sun, 25 Sep 2011 23:26:19 +0100
Subject: dm: raid fix write_mostly arg validation

Fix off-by-one error in validation of write_mostly.

The user-supplied value given for the 'write_mostly' argument must be an
index starting at 0.  The validation of the supplied argument failed to
check for 'N' ('>' vs '>='), which would have caused an access beyond the
end of the array.

Reported-by: Doug Ledford <dledford@redhat.com>
Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-raid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index a002dd85db1e..86df8b2cf927 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -449,7 +449,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "write_mostly option is only valid for RAID1";
 				return -EINVAL;
 			}
-			if (value > rs->md.raid_disks) {
+			if (value >= rs->md.raid_disks) {
 				rs->ti->error = "Invalid write_mostly drive index given";
 				return -EINVAL;
 			}
-- 
cgit v1.2.3


From 983c7db347db8ce2d8453fd1d89b7a4bb6920d56 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Sun, 25 Sep 2011 23:26:21 +0100
Subject: dm crypt: always disable discard_zeroes_data

If optional discard support in dm-crypt is enabled, discards requests
bypass the crypt queue and blocks of the underlying device are discarded.
For the read path, discarded blocks are handled the same as normal
ciphertext blocks, thus decrypted.

So if the underlying device announces discarded regions return zeroes,
dm-crypt must disable this flag because after decryption there is just
random noise instead of zeroes.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c         |  2 ++
 drivers/md/dm-table.c         | 19 +++++++++++++++++++
 include/linux/device-mapper.h |  5 +++++
 3 files changed, 26 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 49da55c1528a..8c2a000cf3f5 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1698,6 +1698,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ti->num_flush_requests = 1;
+	ti->discard_zeroes_data_unsupported = 1;
+
 	return 0;
 
 bad:
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 322669807077..bc04518e9d8b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1283,6 +1283,22 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
 	return 0;
 }
 
+static bool dm_table_discard_zeroes_data(struct dm_table *t)
+{
+	struct dm_target *ti;
+	unsigned i = 0;
+
+	/* Ensure that all targets supports discard_zeroes_data. */
+	while (i < dm_table_get_num_targets(t)) {
+		ti = dm_table_get_target(t, i++);
+
+		if (ti->discard_zeroes_data_unsupported)
+			return 0;
+	}
+
+	return 1;
+}
+
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 			       struct queue_limits *limits)
 {
@@ -1305,6 +1321,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 	}
 	blk_queue_flush(q, flush);
 
+	if (!dm_table_discard_zeroes_data(t))
+		q->limits.discard_zeroes_data = 0;
+
 	dm_table_set_integrity(t);
 
 	/*
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 3fa1f3d90ce0..99e3e50b5c57 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -197,6 +197,11 @@ struct dm_target {
 	 * whether or not its underlying devices have support.
 	 */
 	unsigned discards_supported:1;
+
+	/*
+	 * Set if this target does not return zeroes on discarded blocks.
+	 */
+	unsigned discard_zeroes_data_unsupported:1;
 };
 
 /* Each target can link one of these into the table */
-- 
cgit v1.2.3