From e502fb8f8801c9561c57397e7fd917187762324e Mon Sep 17 00:00:00 2001
From: Tahsin Erdogan <tahsin@google.com>
Date: Thu, 14 Jan 2016 14:41:32 -0800
Subject: deadline: remove unused struct member

commit 63de428b139d3d31d86ebe25ae97b33f6540fb7e ("deadline-iosched:
allow non-sequential batching") removed last use of last_sector.

Signed-off-by: Tahsin Erdogan <tahsin@google.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/deadline-iosched.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'block')

diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index a753df2b3fc2..d0dd7882d8c7 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -39,7 +39,6 @@ struct deadline_data {
 	 */
 	struct request *next_rq[2];
 	unsigned int batching;		/* number of sequential requests made */
-	sector_t last_sector;		/* head position */
 	unsigned int starved;		/* times reads have starved writes */
 
 	/*
@@ -210,8 +209,6 @@ deadline_move_request(struct deadline_data *dd, struct request *rq)
 	dd->next_rq[WRITE] = NULL;
 	dd->next_rq[data_dir] = deadline_latter_request(rq);
 
-	dd->last_sector = rq_end_sector(rq);
-
 	/*
 	 * take it off the sort and fifo list, move
 	 * to dispatch queue
-- 
cgit v1.2.3


From d57d611505d911c6f9f81cd9bd6dbd293d66dd9f Mon Sep 17 00:00:00 2001
From: Stephane Gasparini <stephane.gasparini@linux.intel.com>
Date: Tue, 9 Feb 2016 17:07:38 +0100
Subject: kernel/fs: fix I/O wait not accounted for RW O_DSYNC

 When a process is doing Random Write with O_DSYNC flag
 the I/O wait are not accounted in the kernel (get_cpu_iowait_time_us).
 This is preventing the governor or the cpufreq driver to account for
 I/O wait and thus use the right pstate

Signed-off-by: Stephane Gasparini <stephane.gasparini@linux.intel.com>
Signed-off-by: Philippe Longepe <philippe.longepe@linux.intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index dbabd48b1934..f53a691b6533 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -874,7 +874,7 @@ int submit_bio_wait(int rw, struct bio *bio)
 	bio->bi_private = &ret;
 	bio->bi_end_io = submit_bio_wait_endio;
 	submit_bio(rw, bio);
-	wait_for_completion(&ret.event);
+	wait_for_completion_io(&ret.event);
 
 	return ret.error;
 }
-- 
cgit v1.2.3


From 39a169b62b415390398291080dafe63aec751e0a Mon Sep 17 00:00:00 2001
From: Roman Pen <roman.penyaev@profitbricks.com>
Date: Tue, 9 Feb 2016 12:33:35 -0700
Subject: block: fix module reference leak on put_disk() call for cgroups
 throttle

get_disk(),get_gendisk() calls have non explicit side effect: they
increase the reference on the disk owner module.

The following is the correct sequence how to get a disk reference and
to put it:

    disk = get_gendisk(...);

    /* use disk */

    owner = disk->fops->owner;
    put_disk(disk);
    module_put(owner);

fs/block_dev.c is aware of this required module_put() call, but f.e.
blkg_conf_finish(), which is located in block/blk-cgroup.c, does not put
a module reference.  To see a leakage in action cgroups throttle config
can be used.  In the following script I'm removing throttle for /dev/ram0
(actually this is NOP, because throttle was never set for this device):

    # lsmod | grep brd
    brd                     5175  0
    # i=100; while [ $i -gt 0 ]; do echo "1:0 0" > \
        /sys/fs/cgroup/blkio/blkio.throttle.read_bps_device; i=$(($i - 1)); \
    done
    # lsmod | grep brd
    brd                     5175  100

Now brd module has 100 references.

The issue is fixed by calling module_put() just right away put_disk().

Signed-off-by: Roman Pen <roman.penyaev@profitbricks.com>
Cc: Gi-Oh Kim <gi-oh.kim@profitbricks.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-cgroup.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 5a37188b559f..66e6f1aae02e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -788,6 +788,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 {
 	struct gendisk *disk;
 	struct blkcg_gq *blkg;
+	struct module *owner;
 	unsigned int major, minor;
 	int key_len, part, ret;
 	char *body;
@@ -804,7 +805,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	if (!disk)
 		return -ENODEV;
 	if (part) {
+		owner = disk->fops->owner;
 		put_disk(disk);
+		module_put(owner);
 		return -ENODEV;
 	}
 
@@ -820,7 +823,9 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		ret = PTR_ERR(blkg);
 		rcu_read_unlock();
 		spin_unlock_irq(disk->queue->queue_lock);
+		owner = disk->fops->owner;
 		put_disk(disk);
+		module_put(owner);
 		/*
 		 * If queue was bypassing, we should retry.  Do so after a
 		 * short msleep().  It isn't strictly necessary but queue
@@ -851,9 +856,13 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 	__releases(ctx->disk->queue->queue_lock) __releases(rcu)
 {
+	struct module *owner;
+
 	spin_unlock_irq(ctx->disk->queue->queue_lock);
 	rcu_read_unlock();
+	owner = ctx->disk->fops->owner;
 	put_disk(ctx->disk);
+	module_put(owner);
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
-- 
cgit v1.2.3


From 5f009d3f8e6685fe8c6215082c1696a08b411220 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Wed, 10 Feb 2016 16:52:47 -0700
Subject: block: Initialize max_dev_sectors to 0

The new queue limit is not used by the majority of block drivers, and
should be initialized to 0 for the driver's requested settings to be used.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Acked-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-settings.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index dd4973583978..c7bb666aafd1 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -91,8 +91,8 @@ void blk_set_default_limits(struct queue_limits *lim)
 	lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
 	lim->virt_boundary_mask = 0;
 	lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
-	lim->max_sectors = lim->max_dev_sectors = lim->max_hw_sectors =
-		BLK_SAFE_MAX_SECTORS;
+	lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
+	lim->max_dev_sectors = 0;
 	lim->chunk_sectors = 0;
 	lim->max_write_same_sectors = 0;
 	lim->max_discard_sectors = 0;
-- 
cgit v1.2.3


From a59e0f5795fe52dad42a99c00287e3766153b312 Mon Sep 17 00:00:00 2001
From: Keith Busch <keith.busch@intel.com>
Date: Thu, 11 Feb 2016 13:05:38 -0700
Subject: blk-mq: End unstarted requests on dying queue

Go directly to ending a request if it wasn't started. Previously, completing a
request may invoke a driver callback for a request it didn't initialize.

Signed-off-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Sagi Grimberg <sagig@mellanox.com>
Reviewed-by: Johannes Thumshirn <jthumshirn at suse.de>
Acked-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4c0622fae413..56c0a726b619 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -599,8 +599,10 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		 * If a request wasn't started before the queue was
 		 * marked dying, kill it here or it'll go unnoticed.
 		 */
-		if (unlikely(blk_queue_dying(rq->q)))
-			blk_mq_complete_request(rq, -EIO);
+		if (unlikely(blk_queue_dying(rq->q))) {
+			rq->errors = -EIO;
+			blk_mq_end_request(rq, rq->errors);
+		}
 		return;
 	}
 
-- 
cgit v1.2.3


From 2d99b55d378c996b9692a0c93dd25f4ed5d58934 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Fri, 12 Feb 2016 09:39:15 +0100
Subject: bio: return EINTR if copying to user space got interrupted

Commit 35dc248383bbab0a7203fca4d722875bc81ef091 introduced a check for
current->mm to see if we have a user space context and only copies data
if we do. Now if an IO gets interrupted by a signal data isn't copied
into user space any more (as we don't have a user space context) but
user space isn't notified about it.

This patch modifies the behaviour to return -EINTR from bio_uncopy_user()
to notify userland that a signal has interrupted the syscall, otherwise
it could lead to a situation where the caller may get a buffer with
no data returned.

This can be reproduced by issuing SG_IO ioctl()s in one thread while
constantly sending signals to it.

Fixes: 35dc248 [SCSI] sg: Fix user memory corruption when SG_IO is interrupted by a signal
Signed-off-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Hannes Reinecke <hare@suse.de>
Cc: stable@vger.kernel.org # v.3.11+
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/bio.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index f53a691b6533..cf7591551b17 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1090,9 +1090,12 @@ int bio_uncopy_user(struct bio *bio)
 	if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
 		/*
 		 * if we're in a workqueue, the request is orphaned, so
-		 * don't copy into a random user address space, just free.
+		 * don't copy into a random user address space, just free
+		 * and return -EINTR so user space doesn't expect any data.
 		 */
-		if (current->mm && bio_data_dir(bio) == READ)
+		if (!current->mm)
+			ret = -EINTR;
+		else if (bio_data_dir(bio) == READ)
 			ret = bio_copy_to_iter(bio, bmd->iter);
 		if (bmd->is_our_pages)
 			bio_free_pages(bio);
-- 
cgit v1.2.3


From 18f922d037211a15543af935861bf92161e697e9 Mon Sep 17 00:00:00 2001
From: Alan <gnomes@lxorguk.ukuu.org.uk>
Date: Wed, 17 Feb 2016 14:15:30 +0000
Subject: blk: fix overflow in queue_discard_max_hw_show

We get this right for queue_discard_max_show but not max_hw_show. Follow the
same pattern as queue_discard_max_show instead so that we don't truncate.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-sysfs.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e140cc487ce1..dd93763057ce 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -147,10 +147,9 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
 
 static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page)
 {
-	unsigned long long val;
 
-	val = q->limits.max_hw_discard_sectors << 9;
-	return sprintf(page, "%llu\n", val);
+	return sprintf(page, "%llu\n",
+		(unsigned long long)q->limits.max_hw_discard_sectors << 9);
 }
 
 static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
-- 
cgit v1.2.3


From 03cdadb04077b9311bbc67d98cc5401aff76482d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 26 Feb 2016 15:19:43 -0800
Subject: block: disable block device DAX by default

The recent *sync enabling discovered that we are inserting into the
block_device pagecache counter to the expectations of the dirty data
tracking for dax mappings.  This can lead to data corruption.

We want to support DAX for block devices eventually, but it requires
wider changes to properly manage the pagecache.

   dump_stack+0x85/0xc2
   dax_writeback_mapping_range+0x60/0xe0
   blkdev_writepages+0x3f/0x50
   do_writepages+0x21/0x30
   __filemap_fdatawrite_range+0xc6/0x100
   filemap_write_and_wait+0x4a/0xa0
   set_blocksize+0x70/0xd0
   sb_set_blocksize+0x1d/0x50
   ext4_fill_super+0x75b/0x3360
   mount_bdev+0x180/0x1b0
   ext4_mount+0x15/0x20
   mount_fs+0x38/0x170

Mark the support broken so its disabled by default, but otherwise still
available for testing.

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reported-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Suggested-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@fb.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/Kconfig  | 13 +++++++++++++
 fs/block_dev.c |  6 +++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/Kconfig b/block/Kconfig
index 161491d0a879..0363cd731320 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -88,6 +88,19 @@ config BLK_DEV_INTEGRITY
 	T10/SCSI Data Integrity Field or the T13/ATA External Path
 	Protection.  If in doubt, say N.
 
+config BLK_DEV_DAX
+	bool "Block device DAX support"
+	depends on FS_DAX
+	depends on BROKEN
+	help
+	  When DAX support is available (CONFIG_FS_DAX) raw block
+	  devices can also support direct userspace access to the
+	  storage capacity via MMAP(2) similar to a file on a
+	  DAX-enabled filesystem.  However, the DAX I/O-path disables
+	  some standard I/O-statistics, and the MMAP(2) path has some
+	  operational differences due to bypassing the page
+	  cache.  If in doubt, say N.
+
 config BLK_DEV_THROTTLING
 	bool "Block layer bio throttling support"
 	depends on BLK_CGROUP=y
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 39b3a174a425..31c6d1090f11 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1201,7 +1201,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_disk = disk;
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
-		bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
+		if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
+			bdev->bd_inode->i_flags = S_DAX;
+		else
+			bdev->bd_inode->i_flags = 0;
+
 		if (!partno) {
 			ret = -ENXIO;
 			bdev->bd_part = disk_get_part(disk, partno);
-- 
cgit v1.2.3