From 06f4e94898918bcad00cdd4d349313a439d6911e Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Tue, 9 Aug 2016 11:25:01 +0800 Subject: cpuset: make sure new tasks conform to the current config of the cpuset A new task inherits cpus_allowed and mems_allowed masks from its parent, but if someone changes cpuset's config by writing to cpuset.cpus/cpuset.mems before this new task is inserted into the cgroup's task list, the new task won't be updated accordingly. Signed-off-by: Zefan Li Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cpuset.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c7fd2778ed50..c27e53326bef 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2069,6 +2069,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +/* + * Make sure the new task conforms to the current state of its parent, + * which could have been changed by cpuset just after it inherits the + * state from the parent and before it sits on the cgroup's task list. + */ +void cpuset_fork(struct task_struct *task) +{ + if (task_css_is_root(task, cpuset_cgrp_id)) + return; + + set_cpus_allowed_ptr(task, &current->cpus_allowed); + task->mems_allowed = current->mems_allowed; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, @@ -2079,6 +2093,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .fork = cpuset_fork, .legacy_cftypes = files, .early_init = true, }; -- cgit v1.2.3 From 7afafc8a44bf0ab841b17d450b02aedb3a138985 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 16 Aug 2016 10:59:35 +0300 Subject: block: Fix secure erase Commit 288dab8a35a0 ("block: add a separate operation type for secure erase") split REQ_OP_SECURE_ERASE from REQ_OP_DISCARD without considering all the places REQ_OP_DISCARD was being used to mean either. Fix those.
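The failure mode at each site below is the same: a predicate that used REQ_OP_DISCARD as shorthand for "this op carries no data payload" silently excluded the new secure-erase op. A minimal userspace sketch of that predicate bug (the enum mirrors the kernel op names; the helper functions are invented for illustration and are not kernel APIs):

#include <stdbool.h>
#include <stdio.h>

enum req_op { REQ_OP_READ, REQ_OP_WRITE, REQ_OP_DISCARD,
	      REQ_OP_SECURE_ERASE, REQ_OP_WRITE_SAME };

/* Pre-split logic: only DISCARD was known to carry no data payload. */
static bool op_has_payload_old(enum req_op op)
{
	return op != REQ_OP_DISCARD;
}

/* Post-fix logic: a secure erase describes a sector range, not data. */
static bool op_has_payload_fixed(enum req_op op)
{
	return op != REQ_OP_DISCARD && op != REQ_OP_SECURE_ERASE;
}

int main(void)
{
	enum req_op op = REQ_OP_SECURE_ERASE;

	/* old: 1 (wrongly treated as a data op), fixed: 0 */
	printf("old=%d fixed=%d\n",
	       op_has_payload_old(op), op_has_payload_fixed(op));
	return 0;
}

Auditing for this class of regression is mechanical: every comparison against REQ_OP_DISCARD is a candidate, which is exactly the sweep the hunks below perform.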
Signed-off-by: Adrian Hunter Fixes: 288dab8a35a0 ("block: add a separate operation type for secure erase") Signed-off-by: Jens Axboe --- block/bio.c | 21 +++++++++++---------- block/blk-merge.c | 33 +++++++++++++++++++-------------- block/elevator.c | 2 +- drivers/mmc/card/block.c | 1 + drivers/mmc/card/queue.c | 3 ++- drivers/mmc/card/queue.h | 4 +++- include/linux/bio.h | 10 ++++++++-- include/linux/blkdev.h | 6 ++++-- kernel/trace/blktrace.c | 2 +- 9 files changed, 50 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/block/bio.c b/block/bio.c index f39477538fef..aa7354088008 100644 --- a/block/bio.c +++ b/block/bio.c @@ -667,18 +667,19 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; - if (bio_op(bio) == REQ_OP_DISCARD) - goto integrity_clone; - - if (bio_op(bio) == REQ_OP_WRITE_SAME) { + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + break; + case REQ_OP_WRITE_SAME: bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; - goto integrity_clone; + break; + default: + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + break; } - bio_for_each_segment(bv, bio_src, iter) - bio->bi_io_vec[bio->bi_vcnt++] = bv; - -integrity_clone: if (bio_integrity(bio_src)) { int ret; @@ -1788,7 +1789,7 @@ struct bio *bio_split(struct bio *bio, int sectors, * Discards need a mutable bio_vec to accommodate the payload * required by the DSM TRIM and UNMAP commands. */ - if (bio_op(bio) == REQ_OP_DISCARD) + if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) split = bio_clone_bioset(bio, gfp, bs); else split = bio_clone_fast(bio, gfp, bs); diff --git a/block/blk-merge.c b/block/blk-merge.c index 3eec75a9e91d..72627e3cf91e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -172,12 +172,18 @@ void blk_queue_split(struct request_queue *q, struct bio **bio, struct bio *split, *res; unsigned nsegs; - if (bio_op(*bio) == REQ_OP_DISCARD) + switch (bio_op(*bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: split = blk_bio_discard_split(q, *bio, bs, &nsegs); - else if (bio_op(*bio) == REQ_OP_WRITE_SAME) + break; + case REQ_OP_WRITE_SAME: split = blk_bio_write_same_split(q, *bio, bs, &nsegs); - else + break; + default: split = blk_bio_segment_split(q, *bio, q->bio_split, &nsegs); + break; + } /* physical segments can be figured out during splitting */ res = split ? split : *bio; @@ -213,7 +219,7 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, * This should probably be returning 0, but blk_add_request_payload() * (Christoph!!!!) */ - if (bio_op(bio) == REQ_OP_DISCARD) + if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE) return 1; if (bio_op(bio) == REQ_OP_WRITE_SAME) @@ -385,7 +391,9 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, nsegs = 0; cluster = blk_queue_cluster(q); - if (bio_op(bio) == REQ_OP_DISCARD) { + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: /* * This is a hack - drivers should be neither modifying the * biovec, nor relying on bi_vcnt - but because of @@ -393,19 +401,16 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, * a payload we need to set up here (thank you Christoph) and * bi_vcnt is really the only way of telling if we need to. 
*/ - - if (bio->bi_vcnt) - goto single_segment; - - return 0; - } - - if (bio_op(bio) == REQ_OP_WRITE_SAME) { -single_segment: + if (!bio->bi_vcnt) + return 0; + /* Fall through */ + case REQ_OP_WRITE_SAME: *sg = sglist; bvec = bio_iovec(bio); sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); return 1; + default: + break; } for_each_bio(bio) diff --git a/block/elevator.c b/block/elevator.c index 7096c22041e7..f7d973a56fd7 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -366,7 +366,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); - if ((req_op(rq) == REQ_OP_DISCARD) != (req_op(pos) == REQ_OP_DISCARD)) + if (req_op(rq) != req_op(pos)) break; if (rq_data_dir(rq) != rq_data_dir(pos)) break; diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 48a5dd740f3b..82503e6f04b3 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -1726,6 +1726,7 @@ static u8 mmc_blk_prep_packed_list(struct mmc_queue *mq, struct request *req) break; if (req_op(next) == REQ_OP_DISCARD || + req_op(next) == REQ_OP_SECURE_ERASE || req_op(next) == REQ_OP_FLUSH) break; diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index bf14642a576a..29578e98603d 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -33,7 +33,8 @@ static int mmc_prep_request(struct request_queue *q, struct request *req) /* * We only like normal block requests and discards. */ - if (req->cmd_type != REQ_TYPE_FS && req_op(req) != REQ_OP_DISCARD) { + if (req->cmd_type != REQ_TYPE_FS && req_op(req) != REQ_OP_DISCARD && + req_op(req) != REQ_OP_SECURE_ERASE) { blk_dump_rq_flags(req, "MMC bad request"); return BLKPREP_KILL; } diff --git a/drivers/mmc/card/queue.h b/drivers/mmc/card/queue.h index d62531124d54..fee5e1271465 100644 --- a/drivers/mmc/card/queue.h +++ b/drivers/mmc/card/queue.h @@ -4,7 +4,9 @@ static inline bool mmc_req_is_special(struct request *req) { return req && - (req_op(req) == REQ_OP_FLUSH || req_op(req) == REQ_OP_DISCARD); + (req_op(req) == REQ_OP_FLUSH || + req_op(req) == REQ_OP_DISCARD || + req_op(req) == REQ_OP_SECURE_ERASE); } struct request; diff --git a/include/linux/bio.h b/include/linux/bio.h index 59ffaa68b11b..23ddf4b46a9b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -71,7 +71,8 @@ static inline bool bio_has_data(struct bio *bio) { if (bio && bio->bi_iter.bi_size && - bio_op(bio) != REQ_OP_DISCARD) + bio_op(bio) != REQ_OP_DISCARD && + bio_op(bio) != REQ_OP_SECURE_ERASE) return true; return false; @@ -79,7 +80,9 @@ static inline bool bio_has_data(struct bio *bio) static inline bool bio_no_advance_iter(struct bio *bio) { - return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_WRITE_SAME; + return bio_op(bio) == REQ_OP_DISCARD || + bio_op(bio) == REQ_OP_SECURE_ERASE || + bio_op(bio) == REQ_OP_WRITE_SAME; } static inline bool bio_is_rw(struct bio *bio) @@ -199,6 +202,9 @@ static inline unsigned bio_segments(struct bio *bio) if (bio_op(bio) == REQ_OP_DISCARD) return 1; + if (bio_op(bio) == REQ_OP_SECURE_ERASE) + return 1; + if (bio_op(bio) == REQ_OP_WRITE_SAME) return 1; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2c210b6a7bcf..e79055c8b577 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -882,7 +882,7 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, int op) { - if 
(unlikely(op == REQ_OP_DISCARD)) + if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) return min(q->limits.max_discard_sectors, UINT_MAX >> 9); if (unlikely(op == REQ_OP_WRITE_SAME)) @@ -913,7 +913,9 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, if (unlikely(rq->cmd_type != REQ_TYPE_FS)) return q->limits.max_hw_sectors; - if (!q->limits.chunk_sectors || (req_op(rq) == REQ_OP_DISCARD)) + if (!q->limits.chunk_sectors || + req_op(rq) == REQ_OP_DISCARD || + req_op(rq) == REQ_OP_SECURE_ERASE) return blk_queue_get_max_sectors(q, req_op(rq)); return min(blk_max_size_offset(q, offset), diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7598e6ca817a..dbafc5df03f3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -223,7 +223,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(op_flags, META); what |= MASK_TC_BIT(op_flags, PREFLUSH); what |= MASK_TC_BIT(op_flags, FUA); - if (op == REQ_OP_DISCARD) + if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE) what |= BLK_TC_ACT(BLK_TC_DISCARD); if (op == REQ_OP_FLUSH) what |= BLK_TC_ACT(BLK_TC_FLUSH); -- cgit v1.2.3 From 1e12c4a9393b75a744aada2c8115434572698bc3 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 11 Aug 2016 14:19:42 +0100 Subject: genirq: Correctly configure the trigger on chained interrupts Commit 1e2a7d78499e ("irqdomain: Don't set type when mapping an IRQ") moved the trigger configuration call from the irqdomain mapping to the interrupt being actually requested. This patch failed to handle the case where we configure a chained interrupt, which doesn't get requested through the usual path. In order to solve this, let's call __irq_set_trigger just before starting the cascade interrupt. Special care must be taken to make the flow handler stick, as the .irq_set_type method could have reset it (it doesn't know we're dealing with a chained interrupt). Based on an initial patch by Jon Hunter. Fixes: 1e2a7d78499e ("irqdomain: Don't set type when mapping an IRQ") Reported-by: John Stultz Reported-by: Linus Walleij Tested-by: John Stultz Acked-by: Jon Hunter Signed-off-by: Marc Zyngier --- kernel/irq/chip.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b4c1bc7c9ca2..637389088b3f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -820,6 +820,17 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle, desc->name = name; if (handle != handle_bad_irq && is_chained) { + /* + * We're about to start this interrupt immediately, + * hence the need to set the trigger configuration. + * But the .set_type callback may have overridden the + * flow handler, ignoring that we're dealing with a + * chained interrupt. Reset it immediately because we + * do know better. + */ + __irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data)); + desc->handle_irq = handle; + irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); -- cgit v1.2.3 From 568ac888215c7fb2fabe8ea739b00ec3c1f5d440 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 10 Aug 2016 15:43:06 -0400 Subject: cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork cgroup_threadgroup_rwsem is acquired in read mode during process exit and fork. It is also grabbed in write mode during __cgroups_proc_write(). 
I've recently run into a scenario with lots of memory pressure and OOM and I am beginning to see systemd stuck in traces like this: __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 percpu_down_write+0x114/0x170 __cgroup_procs_write.isra.12+0xb8/0x3c0 cgroup_file_write+0x74/0x1a0 kernfs_fop_write+0x188/0x200 __vfs_write+0x6c/0xe0 vfs_write+0xc0/0x230 SyS_write+0x6c/0x110 system_call+0x38/0xb4 This thread is waiting on the reader of cgroup_threadgroup_rwsem to exit. The reader itself is under memory pressure and has gone into reclaim after fork. There are times the reader ends up waiting on oom_lock as well. __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 jbd2_log_wait_commit+0xd4/0x180 ext4_evict_inode+0x88/0x5c0 evict+0xf8/0x2a0 dispose_list+0x50/0x80 prune_icache_sb+0x6c/0x90 super_cache_scan+0x190/0x210 shrink_slab.part.15+0x22c/0x4c0 shrink_zone+0x288/0x3c0 do_try_to_free_pages+0x1dc/0x590 try_to_free_pages+0xdc/0x260 __alloc_pages_nodemask+0x72c/0xc90 alloc_pages_current+0xb4/0x1a0 page_table_alloc+0xc0/0x170 __pte_alloc+0x58/0x1f0 copy_page_range+0x4ec/0x950 copy_process.isra.5+0x15a0/0x1870 _do_fork+0xa8/0x4b0 ppc_clone+0x8/0xc Meanwhile, all processes exiting/forking are blocked, almost stalling the system. This patch moves threadgroup_change_begin() from before cgroup_fork() to just before cgroup_canfork(). There is no need to worry about threadgroup changes till the task is actually added to the threadgroup. This avoids having to call reclaim with cgroup_threadgroup_rwsem held. tj: Subject and description edits. Signed-off-by: Balbir Singh Acked-by: Zefan Li Cc: Oleg Nesterov Cc: Andrew Morton Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Tejun Heo --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 52e725d4a866..aaf782327bf3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1404,7 +1404,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1556,6 +1555,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; + threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed @@ -1656,6 +1656,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cancel_cgroup: cgroup_cancel_fork(p); bad_fork_free_pid: + threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: @@ -1688,7 +1689,6 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); -- cgit v1.2.3 From 4396f46c8c628329bd35ee4b84140b8b001a11eb Mon Sep 17 00:00:00 2001 From: Shawn Lin Date: Mon, 22 Aug 2016 16:21:52 +0800 Subject: genirq: Fix potential memleak when failing to get irq pm Obviously we should free action here if irq_chip_pm_get failed.
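The shape of the bug generalizes: in any "allocate, then acquire a second resource" sequence, an early return between the allocation and its hand-off must unwind the allocation. A hedged userspace model of the error path the patch repairs (all names are invented stand-ins, not the kernel API):

#include <errno.h>
#include <stdlib.h>

struct irqaction { void *dev_id; };

/* Stand-in for irq_chip_pm_get(); pretend runtime PM resume failed. */
static int chip_pm_get_model(void) { return -EAGAIN; }

static int request_irq_model(void *dev_id)
{
	struct irqaction *action;
	int retval;

	action = malloc(sizeof(*action));
	if (!action)
		return -ENOMEM;
	action->dev_id = dev_id;

	retval = chip_pm_get_model();
	if (retval < 0) {
		free(action);	/* the unwind the fix adds; omitting it leaks */
		return retval;
	}

	/* ... __setup_irq() would consume the action here; the model just
	 * releases it again so nothing leaks in this sketch ... */
	free(action);
	return 0;
}

int main(void)
{
	int dev;

	return request_irq_model(&dev) < 0 ? 0 : 1;
}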
Fixes: be45beb2df69: "genirq: Add runtime power management support for IRQ chips" Signed-off-by: Shawn Lin Cc: Jon Hunter Cc: Marc Zyngier Link: http://lkml.kernel.org/r/1471854112-13006-1-git-send-email-shawn.lin@rock-chips.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 73a2b786b5e9..9530fcd27704 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1681,8 +1681,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); - if (retval < 0) + if (retval < 0) { + kfree(action); return retval; + } chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); @@ -1985,8 +1987,10 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler, action->percpu_dev_id = dev_id; retval = irq_chip_pm_get(&desc->irq_data); - if (retval < 0) + if (retval < 0) { + kfree(action); return retval; + } chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); -- cgit v1.2.3 From 3ee0ce2a54dff07d09440723594df89bc1a12e79 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Aug 2016 07:06:45 -0700 Subject: genirq/affinity: Use get/put_online_cpus around cpumask operations Without locking out CPU mask operations we might end up with an inconsistent view of the cpumask in the function. Fixes: 5e385a6ef31f: "genirq: Add a helper to spread an affinity mask for MSI/MSI-X vectors" Signed-off-by: Christoph Hellwig Link: http://lkml.kernel.org/r/1470924405-25728-1-git-send-email-hch@lst.de Signed-off-by: Thomas Gleixner --- kernel/irq/affinity.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f68959341c0f..32f6cfcff212 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,6 +39,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) return NULL; } + get_online_cpus(); if (max_vecs >= num_online_cpus()) { cpumask_copy(affinity_mask, cpu_online_mask); *nr_vecs = num_online_cpus(); @@ -56,6 +57,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs) } *nr_vecs = vecs; } + put_online_cpus(); return affinity_mask; } -- cgit v1.2.3 From 27727df240c7cc84f2ba6047c6f18d5addfd25ef Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 23 Aug 2016 16:08:21 -0700 Subject: timekeeping: Avoid taking lock in NMI path with CONFIG_DEBUG_TIMEKEEPING When I added some extra sanity checking in timekeeping_get_ns() under CONFIG_DEBUG_TIMEKEEPING, I missed that the NMI-safe __ktime_get_fast_ns() method was using timekeeping_get_ns(). Thus the locking added to the debug checks broke the NMI-safety of __ktime_get_fast_ns(). This patch open-codes the timekeeping_get_ns() logic for __ktime_get_fast_ns(), so it can avoid any deadlocks in NMI.
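The constraint being restored here is that everything on the __ktime_get_fast_ns() path must be wait-free: a seqcount-latch reader retries instead of blocking, so it can never deadlock against an interrupted writer. A simplified userspace sketch of such a latch read loop, assuming C11 atomics (the clocksource arithmetic and the kernel's exact memory ordering are condensed; this is not the kernel implementation):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Two banks of timekeeping data, selected by the low bit of seq. */
struct tk_fast_model {
	atomic_uint seq;
	uint64_t base[2];
};

/* Lock-free reader: retry if the writer bumped seq mid-read. Nothing
 * here blocks, so the read path stays safe to enter from NMI context. */
static uint64_t fast_read(struct tk_fast_model *tkf)
{
	unsigned int seq;
	uint64_t now;

	do {
		seq = atomic_load_explicit(&tkf->seq, memory_order_acquire);
		now = tkf->base[seq & 1];
	} while (seq != atomic_load_explicit(&tkf->seq, memory_order_acquire));

	return now;
}

int main(void)
{
	struct tk_fast_model tkf;

	atomic_init(&tkf.seq, 0);
	tkf.base[0] = 100;	/* bank 0 holds the "current" time */
	tkf.base[1] = 0;
	printf("%llu\n", (unsigned long long)fast_read(&tkf));
	return 0;
}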
Fixes: 4ca22c2648f9 "timekeeping: Add warnings when overflows or underflows are observed" Reported-by: Steven Rostedt Reported-by: Peter Zijlstra Signed-off-by: John Stultz Cc: stable Link: http://lkml.kernel.org/r/1471993702-29148-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3b65746c7f15..e07fb093f819 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -401,7 +401,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); + now = ktime_to_ns(tkr->base); + + now += clocksource_delta(tkr->read(tkr->clock), + tkr->cycle_last, tkr->mask); } while (read_seqcount_retry(&tkf->seq, seq)); return now; -- cgit v1.2.3 From a4f8f6667f099036c88f231dcad4cf233652c824 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 23 Aug 2016 16:08:22 -0700 Subject: timekeeping: Cap array access in timekeeping_debug It was reported that hibernation could fail on the 2nd attempt, where the system hangs at hibernate() -> syscore_resume() -> i8237A_resume() -> claim_dma_lock(), because the lock has already been taken. However, there is actually no other process that would want to grab this lock on that problematic platform. Further investigation showed that the problem is triggered by setting /sys/power/pm_trace to 1 before the 1st hibernation. Once pm_trace is enabled, the RTC becomes meaningless after suspend, and meanwhile some BIOSes adjust an 'invalid' RTC (e.g., smaller than 1970) to the release date of that motherboard during the POST stage. Thus, after resume, the system may appear to have had a significantly long sleep time, which is a completely meaningless value. Then in timekeeping_resume -> tk_debug_account_sleep_time, if bit 31 of the sleep time happens to be set, fls() returns 32 and we add 1 to sleep_time_bin[32], which causes an out-of-bounds array access and therefore overwrites memory. As depicted by System.map: 0xffffffff81c9d080 b sleep_time_bin 0xffffffff81c9d100 B dma_spin_lock the dma_spin_lock.val is set to 1, which caused this problem. This patch adds a sanity check in tk_debug_account_sleep_time() to ensure we don't index past the sleep_time_bin array. [jstultz: Problem diagnosed and original patch by Chen Yu, I've solved the issue slightly differently, but borrowed his excellent explanation of the issue here.] Fixes: 5c83545f24ab "power: Add option to log time spent in suspend" Reported-by: Janek Kozicki Reported-by: Chen Yu Signed-off-by: John Stultz Cc: linux-pm@vger.kernel.org Cc: Peter Zijlstra Cc: Xunlei Pang Cc: "Rafael J.
Wysocki" Cc: stable Cc: Zhang Rui Link: http://lkml.kernel.org/r/1471993702-29148-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping_debug.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c index f6bd65236712..107310a6f36f 100644 --- a/kernel/time/timekeeping_debug.c +++ b/kernel/time/timekeeping_debug.c @@ -23,7 +23,9 @@ #include "timekeeping_internal.h" -static unsigned int sleep_time_bin[32] = {0}; +#define NUM_BINS 32 + +static unsigned int sleep_time_bin[NUM_BINS] = {0}; static int tk_debug_show_sleep_time(struct seq_file *s, void *data) { @@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init); void tk_debug_account_sleep_time(struct timespec64 *t) { - sleep_time_bin[fls(t->tv_sec)]++; + /* Cap bin index so we don't overflow the array */ + int bin = min(fls(t->tv_sec), NUM_BINS-1); + + sleep_time_bin[bin]++; } -- cgit v1.2.3 From 8b6a3fe8fab97716990a3abde1a01fb5a34552a3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 24 Aug 2016 10:07:14 +0100 Subject: perf/core: Use this_cpu_ptr() when stopping AUX events When tearing down an AUX buf for an event via perf_mmap_close(), __perf_event_output_stop() is called on the event's CPU to ensure that trace generation is halted before the process of unmapping and freeing the buffer pages begins. The callback is performed via cpu_function_call(), which ensures that it runs with interrupts disabled and is therefore not preemptible. Unfortunately, the current code grabs the per-cpu context pointer using get_cpu_ptr(), which unnecessarily disables preemption and doesn't pair the call with put_cpu_ptr(), leading to a preempt_count() imbalance and a BUG when freeing the AUX buffer later on: WARNING: CPU: 1 PID: 2249 at kernel/events/ring_buffer.c:539 __rb_free_aux+0x10c/0x120 Modules linked in: [...] Call Trace: [] dump_stack+0x4f/0x72 [] __warn+0xc6/0xe0 [] warn_slowpath_null+0x18/0x20 [] __rb_free_aux+0x10c/0x120 [] rb_free_aux+0x13/0x20 [] perf_mmap_close+0x29e/0x2f0 [] ? perf_iterate_ctx+0xe0/0xe0 [] remove_vma+0x25/0x60 [] exit_mmap+0x106/0x140 [] mmput+0x1c/0xd0 [] do_exit+0x253/0xbf0 [] do_group_exit+0x3e/0xb0 [] get_signal+0x249/0x640 [] do_signal+0x23/0x640 [] ? _raw_write_unlock_irq+0x12/0x30 [] ? _raw_spin_unlock_irq+0x9/0x10 [] ? __schedule+0x2c6/0x710 [] exit_to_usermode_loop+0x74/0x90 [] prepare_exit_to_usermode+0x26/0x30 [] retint_user+0x8/0x10 This patch uses this_cpu_ptr() instead of get_cpu_ptr(), since preemption is already disabled by the caller. 
Signed-off-by: Will Deacon Reviewed-by: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Fixes: 95ff4ca26c49 ("perf/core: Free AUX pages in unmap path") Link: http://lkml.kernel.org/r/20160824091905.GA16944@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5650f5317e0c..3cfabdf7b942 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6166,7 +6166,7 @@ static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; struct pmu *pmu = event->pmu; - struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); struct remote_output ro = { .rb = event->rb, }; -- cgit v1.2.3 From e7d316a02f683864a12389f8808570e37fb90aa3 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 25 Aug 2016 15:16:51 -0700 Subject: sysctl: handle error writing UINT_MAX to u32 fields We have scripts which write to certain fields on 3.18 kernels but this seems to be failing on 4.4 kernels. An entry which we write to here is xfrm_aevent_rseqth which is u32. echo 4294967295 > /proc/sys/net/core/xfrm_aevent_rseqth Commit 230633d109e3 ("kernel/sysctl.c: detect overflows when converting to int") prevented writing to sysctl entries when integer overflow occurs. However, this does not apply to unsigned integers. Heinrich suggested that we introduce a new option to handle 64 bit limits and set min as 0 and max as UINT_MAX. This might not work as it leads to issues similar to __do_proc_doulongvec_minmax. Alternatively, we would need to change the datatype of the entry to 64 bit. static int __do_proc_doulongvec_minmax(void *data, struct ctl_table { i = (unsigned long *) data; //This cast is causing to read beyond the size of data (u32) vleft = table->maxlen / sizeof(unsigned long); //vleft is 0 because maxlen is sizeof(u32) which is lesser than sizeof(unsigned long) on x86_64. Introduce a new proc handler proc_douintvec. Individual proc entries will need to be updated to use the new handler. [akpm@linux-foundation.org: coding-style fixes] Fixes: 230633d109e3 ("kernel/sysctl.c:detect overflows when converting to int") Link: http://lkml.kernel.org/r/1471479806-5252-1-git-send-email-subashab@codeaurora.org Signed-off-by: Subash Abhinov Kasiviswanathan Cc: Heinrich Schuchardt Cc: Kees Cook Cc: "David S. 
Miller" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 2 ++ kernel/sysctl.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 697e160c78d0..a4f7203a9017 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -42,6 +42,8 @@ extern int proc_dostring(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec(struct ctl_table *, int, void __user *, size_t *, loff_t *); +extern int proc_douintvec(struct ctl_table *, int, + void __user *, size_t *, loff_t *); extern int proc_dointvec_minmax(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_dointvec_jiffies(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b43d0b27c1fe..a13bbdaab47d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2140,6 +2140,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, return 0; } +static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*negp) + return -EINVAL; + *valp = *lvalp; + } else { + unsigned int val = *valp; + *lvalp = (unsigned long)val; + } + return 0; +} + static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, @@ -2259,8 +2274,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write, int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,buffer,lenp,ppos, - NULL,NULL); + return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); +} + +/** + * proc_douintvec - read a vector of unsigned integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_conv, NULL); } /* @@ -2858,6 +2892,12 @@ int proc_dointvec(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -2903,6 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, * exception granted :-) */ EXPORT_SYMBOL(proc_dointvec); +EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); -- cgit v1.2.3 From ae6c33ba6e37eea3012fe2640b22400ef3f2d0f3 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Thu, 25 Aug 2016 15:17:00 -0700 Subject: printk: fix parsing of "brl=" option Commit bbeddf52adc1 ("printk: move braille console support into separate braille.[ch] files") moved the parsing of braille-related options into _braille_console_setup(), changing the type of variable str from char* to char**. 
In this commit, memcmp(str, "brl,", 4) was correctly updated to memcmp(*str, "brl,", 4) but not memcmp(str, "brl=", 4). Update the code to make "brl=" option work again and replace memcmp() with strncmp() to make the compiler able to detect such an issue. Fixes: bbeddf52adc1 ("printk: move braille console support into separate braille.[ch] files") Link: http://lkml.kernel.org/r/20160823165700.28952-1-nicolas.iooss_linux@m4x.org Signed-off-by: Nicolas Iooss Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/braille.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index 276762f3a460..d5760c42f042 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -9,10 +9,10 @@ char *_braille_console_setup(char **str, char **brl_options) { - if (!memcmp(*str, "brl,", 4)) { + if (!strncmp(*str, "brl,", 4)) { *brl_options = ""; *str += 4; - } else if (!memcmp(str, "brl=", 4)) { + } else if (!strncmp(*str, "brl=", 4)) { *brl_options = *str + 4; *str = strchr(*brl_options, ','); if (!*str) -- cgit v1.2.3 From 485a252a5559b45d7df04c819ec91177c62c270b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Aug 2016 16:28:09 -0700 Subject: seccomp: Fix tracer exit notifications during fatal signals This fixes a ptrace vs fatal pending signals bug as manifested in seccomp now that seccomp was reordered to happen after ptrace. The short version is that seccomp should not attempt to call do_exit() while fatal signals are pending under a tracer. The existing code was trying to be as defensively paranoid as possible, but it now ends up confusing ptrace. Instead, the syscall can just be skipped (which solves the original concern that the do_exit() was addressing) and normal signal handling, tracer notification, and process death can happen. Paraphrasing from the original bug report: If a tracee task is in a PTRACE_EVENT_SECCOMP trap, or has been resumed after such a trap but not yet been scheduled, and another task in the thread-group calls exit_group(), then the tracee task exits without the ptracer receiving a PTRACE_EVENT_EXIT notification. Test case here: https://gist.github.com/khuey/3c43ac247c72cef8c956ca73281c9be7 The bug happens because when __seccomp_filter() detects fatal_signal_pending(), it calls do_exit() without dequeuing the fatal signal. When do_exit() sends the PTRACE_EVENT_EXIT notification and that task is descheduled, __schedule() notices that there is a fatal signal pending and changes its state from TASK_TRACED to TASK_RUNNING. That prevents the ptracer's waitpid() from returning the ptrace event. A more detailed analysis is here: https://github.com/mozilla/rr/issues/1762#issuecomment-237396255. Reported-by: Robert O'Callahan Reported-by: Kyle Huey Tested-by: Kyle Huey Fixes: 93e35efb8de4 ("x86/ptrace: run seccomp after ptrace") Signed-off-by: Kees Cook Acked-by: Oleg Nesterov Acked-by: James Morris --- kernel/seccomp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ef6c6c3f9d8a..0db7c8a2afe2 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -605,12 +605,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, ptrace_event(PTRACE_EVENT_SECCOMP, data); /* * The delivery of a fatal signal during event - * notification may silently skip tracer notification. 
- * Terminating the task now avoids executing a system - * call that may not be intended. + * notification may silently skip tracer notification, + * which could leave us with a potentially unmodified + * syscall that the tracer would have liked to have + * changed. Since the process is about to die, we just + * force the syscall to be skipped and let the signal + * kill the process and correctly handle any tracer exit + * notifications. */ if (fatal_signal_pending(current)) - do_exit(SIGSYS); + goto skip; /* Check if the tracer forced the syscall to be skipped. */ this_syscall = syscall_get_nr(current, task_pt_regs(current)); if (this_syscall < 0) -- cgit v1.2.3 From cd81a9170e69e018bbaba547c1fd85a585f5697a Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 23 Aug 2016 16:20:38 +0200 Subject: mm: introduce get_task_exe_file For more convenient access if one has a pointer to the task. As a minor nit take advantage of the fact that only task lock + rcu are needed to safely grab ->exe_file. This saves mm refcount dance. Use the helper in proc_exe_link. Signed-off-by: Mateusz Guzik Acked-by: Konstantin Khlebnikov Acked-by: Richard Guy Briggs Cc: # 4.3.x Signed-off-by: Paul Moore --- fs/proc/base.c | 7 +------ include/linux/mm.h | 1 + kernel/fork.c | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0d163a84082d..da8b1943ba04 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1552,18 +1552,13 @@ static const struct file_operations proc_pid_set_comm_operations = { static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; - struct mm_struct *mm; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; - mm = get_task_mm(task); + exe_file = get_task_exe_file(task); put_task_struct(task); - if (!mm) - return -ENOENT; - exe_file = get_mm_exe_file(mm); - mmput(mm); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f468e0d2534..004c73a988b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1987,6 +1987,7 @@ extern void mm_drop_all_locks(struct mm_struct *mm); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); +extern struct file *get_task_exe_file(struct task_struct *task); extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); diff --git a/kernel/fork.c b/kernel/fork.c index d277e83ed3e0..42451aeb245f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -773,6 +773,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm) } EXPORT_SYMBOL(get_mm_exe_file); +/** + * get_task_exe_file - acquire a reference to the task's executable file + * + * Returns %NULL if task's mm (if any) has no associated executable file or + * this is a kernel thread with borrowed mm (see the comment above get_task_mm). + * User must release file via fput(). 
+ */ +struct file *get_task_exe_file(struct task_struct *task) +{ + struct file *exe_file = NULL; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (!(task->flags & PF_KTHREAD)) + exe_file = get_mm_exe_file(mm); + } + task_unlock(task); + return exe_file; +} +EXPORT_SYMBOL(get_task_exe_file); + /** * get_task_mm - acquire a reference to the task's mm * -- cgit v1.2.3 From 5efc244346f9f338765da3d592f7947b0afdc4b5 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 23 Aug 2016 16:20:39 +0200 Subject: audit: fix exe_file access in audit_exe_compare Prior to the change the function would blindly dereference mm, exe_file and exe_file->f_inode, each of which could have been NULL or freed. Use get_task_exe_file to safely obtain stable exe_file. Signed-off-by: Mateusz Guzik Acked-by: Konstantin Khlebnikov Acked-by: Richard Guy Briggs Cc: # 4.3.x Signed-off-by: Paul Moore --- kernel/audit_watch.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 3cf1c5978d39..4846691957da 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -19,6 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/file.h> #include <linux/kernel.h> #include <linux/audit.h> #include <linux/kthread.h> @@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) unsigned long ino; dev_t dev; - rcu_read_lock(); - exe_file = rcu_dereference(tsk->mm->exe_file); + exe_file = get_task_exe_file(tsk); + if (!exe_file) + return 0; ino = exe_file->f_inode->i_ino; dev = exe_file->f_inode->i_sb->s_dev; - rcu_read_unlock(); + fput(exe_file); return audit_mark_compare(mark, ino, dev); } -- cgit v1.2.3 From 070c43eea5043e950daa423707ae3c77e2f48edb Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 1 Sep 2016 16:14:44 -0700 Subject: kexec: fix double-free when failing to relocate the purgatory If kexec_apply_relocations fails, kexec_load_purgatory frees pi->sechdrs and pi->purgatory_buf. This is redundant, because in case of error kimage_file_prepare_segments calls kimage_file_post_load_cleanup, which will also free those buffers. This causes two warnings like the following, one for pi->sechdrs and the other for pi->purgatory_buf: kexec-bzImage64: Loading purgatory failed ------------[ cut here ]------------ WARNING: CPU: 1 PID: 2119 at mm/vmalloc.c:1490 __vunmap+0xc1/0xd0 Trying to vfree() nonexistent vm area (ffffc90000e91000) Modules linked in: CPU: 1 PID: 2119 Comm: kexec Not tainted 4.8.0-rc3+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x4d/0x65 __warn+0xcb/0xf0 warn_slowpath_fmt+0x4f/0x60 ? find_vmap_area+0x19/0x70 ? kimage_file_post_load_cleanup+0x47/0xb0 __vunmap+0xc1/0xd0 vfree+0x2e/0x70 kimage_file_post_load_cleanup+0x5e/0xb0 SyS_kexec_file_load+0x448/0x680 ? putname+0x54/0x60 ? do_sys_open+0x190/0x1f0 entry_SYSCALL_64_fastpath+0x13/0x8f ---[ end trace 158bb74f5950ca2b ]--- Fix by setting pi->sechdrs and pi->purgatory_buf to NULL, since vfree won't try to free a NULL pointer. Link: http://lkml.kernel.org/r/1472083546-23683-1-git-send-email-bauerman@linux.vnet.ibm.com Signed-off-by: Thiago Jung Bauermann Acked-by: Baoquan He Cc: "Eric W.
Biederman" Cc: Vivek Goyal Cc: Dave Young Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 503bc2d348e5..037c321c5618 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -887,7 +887,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, return 0; out: vfree(pi->sechdrs); + pi->sechdrs = NULL; + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; return ret; } -- cgit v1.2.3 From 236dec051078a8691950f56949612b4b74107e48 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 1 Sep 2016 16:14:47 -0700 Subject: kconfig: tinyconfig: provide whole choice blocks to avoid warnings Using "make tinyconfig" produces a couple of annoying warnings that show up for build test machines all the time: .config:966:warning: override: NOHIGHMEM changes choice state .config:965:warning: override: SLOB changes choice state .config:963:warning: override: KERNEL_XZ changes choice state .config:962:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state .config:933:warning: override: SLOB changes choice state .config:930:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state .config:870:warning: override: SLOB changes choice state .config:868:warning: override: KERNEL_XZ changes choice state .config:867:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state I've made a previous attempt at fixing them and we discussed a number of alternatives. I tried changing the Makefile to use "merge_config.sh -n $(fragment-list)" but couldn't get that to work properly. This is yet another approach, based on the observation that we do want to see a warning for conflicting 'choice' options, and that we can simply make them non-conflicting by listing all other options as disabled. This is a trivial patch that we can apply independent of plans for other changes. 
Link: http://lkml.kernel.org/r/20160829214952.1334674-2-arnd@arndb.de Link: https://storage.kernelci.org/mainline/v4.7-rc6/x86-tinyconfig/build.log https://patchwork.kernel.org/patch/9212749/ Signed-off-by: Arnd Bergmann Reviewed-by: Josh Triplett Reviewed-by: Masahiro Yamada Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/configs/tiny.config | 2 ++ kernel/configs/tiny.config | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 4e2ecfa23c15..4b429df40d7a 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1 +1,3 @@ CONFIG_NOHIGHMEM=y +# CONFIG_HIGHMEM4G is not set +# CONFIG_HIGHMEM64G is not set diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2de56ab0fce..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -1,4 +1,12 @@ +# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set CONFIG_OPTIMIZE_INLINING=y +# CONFIG_SLAB is not set +# CONFIG_SLUB is not set CONFIG_SLOB=y -- cgit v1.2.3 From 19feeff18bbfde659baa58c2346f15a24d7c405e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 1 Sep 2016 16:15:04 -0700 Subject: printk/nmi: avoid direct printk()-s from __printk_nmi_flush() __printk_nmi_flush() can be called from nmi_panic(), therefore it has to test whether it's executed in NMI context and thus must route the messages through deferred printk() or via direct printk(). This is to avoid potential deadlocks, as described in commit cf9b1106c81c ("printk/nmi: flush NMI messages on the system panic"). However there remain two places where __printk_nmi_flush() does unconditional direct printk() calls: - pr_err("printk_nmi_flush: internal error ...") - pr_cont("\n") Factor out print_nmi_seq_line() parts into a new printk_nmi_flush_line() function, which takes care of in_nmi(), and use it in __printk_nmi_flush() for printing and error-reporting. Link: http://lkml.kernel.org/r/20160830161354.581-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Cc: Petr Mladek Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/nmi.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index b69eb8a2876f..16bab471c7e2 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -99,26 +99,32 @@ again: return add; } -/* - * printk one line from the temporary buffer from @start index until - * and including the @end index. - */ -static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) +static void printk_nmi_flush_line(const char *text, int len) { - const char *buf = s->buffer + start; - /* * The buffers are flushed in NMI only on panic. The messages must * go only into the ring buffer at this stage. Consoles will get * explicitly called later when a crashdump is not generated. */ if (in_nmi()) - printk_deferred("%.*s", (end - start) + 1, buf); + printk_deferred("%.*s", len, text); else - printk("%.*s", (end - start) + 1, buf); + printk("%.*s", len, text); } +/* + * printk one line from the temporary buffer from @start index until + * and including the @end index. 
+ */ +static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, + int start, int end) +{ + const char *buf = s->buffer + start; + + printk_nmi_flush_line(buf, (end - start) + 1); +} + /* * Flush data from the associated per_CPU buffer. The function * can be called either via IRQ work or independently. @@ -150,9 +156,11 @@ more: * the buffer an unexpected way. If we printed something then * @len must only increase. */ - if (i && i >= len) - pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", - i, len); + if (i && i >= len) { + const char *msg = "printk_nmi_flush: internal error\n"; + + printk_nmi_flush_line(msg, strlen(msg)); + } if (!len) goto out; /* Someone else has already flushed the buffer. */ @@ -166,14 +174,14 @@ more: /* Print line by line. */ for (; i < size; i++) { if (s->buffer[i] == '\n') { - print_nmi_seq_line(s, last_i, i); + printk_nmi_flush_seq_line(s, last_i, i); last_i = i + 1; } } /* Check if there was a partial line. */ if (last_i < size) { - print_nmi_seq_line(s, last_i, size - 1); - pr_cont("\n"); + printk_nmi_flush_seq_line(s, last_i, size - 1); + printk_nmi_flush_line("\n", strlen("\n")); } /* -- cgit v1.2.3 From c11600e4fed67ae4cd6a8096936afd445410e8ed Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 1 Sep 2016 16:15:07 -0700 Subject: mm, mempolicy: task->mempolicy must be NULL before dropping final reference KASAN allocates memory from the page allocator as part of kmem_cache_free(), and that can reference current->mempolicy through any number of allocation functions. It needs to be NULL'd out before the final reference is dropped to prevent a use-after-free bug: BUG: KASAN: use-after-free in alloc_pages_current+0x363/0x370 at addr ffff88010b48102c CPU: 0 PID: 15425 Comm: trinity-c2 Not tainted 4.8.0-rc2+ #140 ... Call Trace: dump_stack kasan_object_err kasan_report_error __asan_report_load2_noabort alloc_pages_current <-- use after free depot_save_stack save_stack kasan_slab_free kmem_cache_free __mpol_put <-- free do_exit This patch sets current->mempolicy to NULL before dropping the final reference. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1608301442180.63329@chino.kir.corp.google.com Fixes: cd11016e5f52 ("mm, kasan: stackdepot implementation. 
Enable stackdepot for SLAB") Signed-off-by: David Rientjes Reported-by: Vegard Nossum Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: [4.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 4 ++++ kernel/exit.c | 7 +------ mm/mempolicy.c | 17 +++++++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 4429d255c8ab..5e5b2969d931 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -195,6 +195,7 @@ static inline bool vma_migratable(struct vm_area_struct *vma) } extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); +extern void mpol_put_task_policy(struct task_struct *); #else @@ -297,5 +298,8 @@ static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, return -1; /* no node preference */ } +static inline void mpol_put_task_policy(struct task_struct *task) +{ +} #endif /* CONFIG_NUMA */ #endif diff --git a/kernel/exit.c b/kernel/exit.c index 2f974ae042a6..091a78be3b09 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -848,12 +848,7 @@ void do_exit(long code) TASKS_RCU(preempt_enable()); exit_notify(tsk, group_dead); proc_exit_connector(tsk); -#ifdef CONFIG_NUMA - task_lock(tsk); - mpol_put(tsk->mempolicy); - tsk->mempolicy = NULL; - task_unlock(tsk); -#endif + mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d8c4e38fb5f4..2da72a5b6ecc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2336,6 +2336,23 @@ out: return ret; } +/* + * Drop the (possibly final) reference to task->mempolicy. It needs to be + * dropped after task->mempolicy is set to NULL so that any allocation done as + * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed + * policy. + */ +void mpol_put_task_policy(struct task_struct *task) +{ + struct mempolicy *pol; + + task_lock(task); + pol = task->mempolicy; + task->mempolicy = NULL; + task_unlock(task); + mpol_put(pol); +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); -- cgit v1.2.3 From 735f2770a770156100f534646158cb58cb8b2939 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 1 Sep 2016 16:15:13 -0700 Subject: kernel/fork: fix CLONE_CHILD_CLEARTID regression in nscd Commit fec1d0115240 ("[PATCH] Disable CLONE_CHILD_CLEARTID for abnormal exit") has caused a subtle regression in nscd which uses CLONE_CHILD_CLEARTID to clear the nscd_certainly_running flag in the shared databases, so that the clients are notified when nscd is restarted. Now, when nscd uses a non-persistent database, clients that have it mapped keep thinking the database is being updated by nscd, when in fact nscd has created a new (anonymous) one (for non-persistent databases it uses an unlinked file as backend). The original proposal for the CLONE_CHILD_CLEARTID change claimed (https://lkml.org/lkml/2006/10/25/233): : The NPTL library uses the CLONE_CHILD_CLEARTID flag on clone() syscalls : on behalf of pthread_create() library calls. This feature is used to : request that the kernel clear the thread-id in user space (at an address : provided in the syscall) when the thread disassociates itself from the : address space, which is done in mm_release(). 
: : Unfortunately, when a multi-threaded process incurs a core dump (such as : from a SIGSEGV), the core-dumping thread sends SIGKILL signals to all of : the other threads, which then proceed to clear their user-space tids : before synchronizing in exit_mm() with the start of core dumping. This : misrepresents the state of process's address space at the time of the : SIGSEGV and makes it more difficult for someone to debug NPTL and glibc : problems (misleading him/her to conclude that the threads had gone away : before the fault). : : The fix below is to simply avoid the CLONE_CHILD_CLEARTID action if a : core dump has been initiated. The resulting patch from Roland (https://lkml.org/lkml/2006/10/26/269) seems to have a larger scope than the original patch asked for. It seems that limiting the scope of the check to core dumping should work for the SIGSEGV issue described above. [Changelog partly based on Andreas' description] Fixes: fec1d0115240 ("[PATCH] Disable CLONE_CHILD_CLEARTID for abnormal exit") Link: http://lkml.kernel.org/r/1471968749-26173-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: William Preston Acked-by: Oleg Nesterov Cc: Roland McGrath Cc: Andreas Schwab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index aaf782327bf3..93bdba13d7d9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -913,14 +913,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) deactivate_mm(tsk, mm); /* - * If we're exiting normally, clear a user-space tid field if - * requested. We leave this alone when dying by signal, to leave - * the value intact in a core dump, and to save the unnecessary - * trouble, say, a killed vfork parent shouldn't touch this mm. - * Userland only wants this done for a sys_exit. + * Signal userspace if we're not exiting with a core dump + * because we want to leave the value intact for debugging + * purposes. */ if (tsk->clear_child_tid) { - if (!(tsk->flags & PF_SIGNALED) && + if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) && atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has -- cgit v1.2.3 From 08d072599234c959b0b82b63fa252c129225a899 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 2 Sep 2016 14:38:23 +0800 Subject: tick/nohz: Fix softlockup on scheduler stalls in kvm guest tick_nohz_start_idle() is prevented from being called if the idle tick can't be stopped since commit 1f3b0f8243cb934 ("tick/nohz: Optimize nohz idle enter"). As a result, after suspending/resuming the host machine, a full dynticks kvm guest will softlockup: NMI watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [swapper/0:0] Call Trace: default_idle+0x31/0x1a0 arch_cpu_idle+0xf/0x20 default_idle_call+0x2a/0x50 cpu_startup_entry+0x39b/0x4d0 rest_init+0x138/0x140 ? rest_init+0x5/0x140 start_kernel+0x4c1/0x4ce ? set_init_arg+0x55/0x55 ? early_idt_handler_array+0x120/0x120 x86_64_start_reservations+0x24/0x26 x86_64_start_kernel+0x142/0x14f In addition, cat /proc/stat | grep cpu in guest or host: cpu 398 16 5049 15754 5490 0 1 46 0 0 cpu0 206 5 450 0 0 0 1 14 0 0 cpu1 81 0 3937 3149 1514 0 0 9 0 0 cpu2 45 6 332 6052 2243 0 0 11 0 0 cpu3 65 2 328 6552 1732 0 0 11 0 0 The idle and iowait states are a weird 0 for cpu0 (housekeeping).
The bug is present in both guest and host kernels, and both show the bogus idle and iowait accounting for cpu0; however, the host kernel's suspend/resume path etc. touches the watchdog, which avoids the softlockup there. - The watchdog will not be touched in the tick_nohz_stop_idle path (it needs to be touched, since the scheduler stall is expected) if the idle_active flags are not detected. - The idle and iowait states will not be accounted when exiting the idle loop (resched or interrupt) if the idle start time and idle_active flags are not set. This patch fixes it by reverting commit 1f3b0f8243cb934, since being unable to stop the idle tick doesn't mean the CPU can't be idle. Fixes: 1f3b0f8243cb934 ("tick/nohz: Optimize nohz idle enter") Signed-off-by: Wanpeng Li Cc: Sanjeev Yadav Cc: Gaurav Jindal Cc: stable@vger.kernel.org Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Peter Zijlstra Cc: Paolo Bonzini Link: http://lkml.kernel.org/r/1472798303-4154-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 204fdc86863d..2ec7c00228f3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -908,10 +908,11 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ktime_t now, expires; int cpu = smp_processor_id(); + now = tick_nohz_start_idle(ts); + if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; - now = tick_nohz_start_idle(ts); ts->idle_calls++; expires = tick_nohz_stop_sched_tick(ts, now, cpu); -- cgit v1.2.3 From 58763148758057ffc447bf990321d3ea86d199a0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 30 Aug 2016 10:15:03 +0200 Subject: perf/core: Remove WARN from perf_event_read() This effectively reverts commit: 71e7bc2bab77 ("perf/core: Check return value of the perf_event_read() IPI") ... and puts in a comment explaining why we ignore the return value. Reported-by: Vegard Nossum Signed-off-by: Peter Zijlstra (Intel) Cc: David Carrillo-Cisneros Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 71e7bc2bab77 ("perf/core: Check return value of the perf_event_read() IPI") Signed-off-by: Ingo Molnar --- kernel/events/core.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3cfabdf7b942..07ac8596a728 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3549,10 +3549,18 @@ static int perf_event_read(struct perf_event *event, bool group) .group = group, .ret = 0, }; - ret = smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); - /* The event must have been read from an online CPU: */ - WARN_ON_ONCE(ret); - ret = ret ? : data.ret; + /* + * Purposely ignore the smp_call_function_single() return + * value. + * + * If event->oncpu isn't a valid CPU it means the event got + * scheduled out and that will have updated the event count. + * + * Therefore, either way, we'll have an up-to-date event count + * after this.
+ */ (void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1); + ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; -- cgit v1.2.3 From 135e8c9250dd5c8c9aae5984fde6f230d0cbfeaf Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 5 Sep 2016 13:16:40 +1000 Subject: sched/core: Fix a race between try_to_wake_up() and a woken up task The origin of the issue I've seen is related to a missing memory barrier between the check for task->state and the check for task->on_rq. The task being woken up is already awake from a schedule() and is doing the following: do { schedule() set_current_state(TASK_(UN)INTERRUPTIBLE); } while (!cond); The waker actually gets stuck doing the following in try_to_wake_up(): while (p->on_cpu) cpu_relax(); Analysis: The instance I've seen involves the following race: CPU1 CPU2 while () { if (cond) break; do { schedule(); set_current_state(TASK_UN..) } while (!cond); wakeup_routine() spin_lock_irqsave(wait_lock) raw_spin_lock_irqsave(wait_lock) wake_up_process() } try_to_wake_up() set_current_state(TASK_RUNNING); .. list_del(&waiter.list); CPU2 wakes up CPU1, but before it can get the wait_lock and set the current state to TASK_RUNNING, the following occurs: CPU3 wakeup_routine() raw_spin_lock_irqsave(wait_lock) if (!list_empty) wake_up_process() try_to_wake_up() raw_spin_lock_irqsave(p->pi_lock) .. if (p->on_rq && ttwu_wakeup()) .. while (p->on_cpu) cpu_relax() .. CPU3 tries to wake up the task on CPU1 again, since it finds it on the wait_queue; CPU1 is spinning on the wait_lock, but immediately after CPU2, CPU3 got it. CPU3 checks the state of p on CPU1; it is TASK_UNINTERRUPTIBLE and the task is spinning on the wait_lock. Interestingly, since p->on_rq is checked under pi_lock, I've noticed that try_to_wake_up() finds p->on_rq to be 0. This was the most confusing bit of the analysis, but p->on_rq is changed under the runqueue lock, rq_lock; the p->on_rq check is not reliable without this fix IMHO. The race is visible (based on the analysis) only when ttwu_queue() does a remote wakeup via ttwu_queue_remote(), in which case the p->on_rq change is not done under the pi_lock. The result is that after a while the entire system locks up on raw_spin_lock_irqsave(wait_lock) and the holder spins infinitely. Reproduction of the issue: The issue can be reproduced after a long run on my system with 80 threads, with available memory tweaked very low, while running the stress-ng mmapfork memory test. It usually takes a long time to reproduce. I am trying to work on a test case that can reproduce the issue faster, but that's work in progress. I am still testing the changes on my system in a loop, and the tests seem OK thus far. Big thanks to Benjamin and Nick for helping debug this as well. Ben helped catch the missing barrier, Nick caught every missing bit in my theory. Signed-off-by: Balbir Singh [ Updated comment to clarify matching barriers. Many architectures do not have a full barrier in switch_to() so that cannot be relied upon.
] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Benjamin Herrenschmidt Cc: Alexey Kardashevskiy Cc: Linus Torvalds Cc: Nicholas Piggin Cc: Nicholas Piggin Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Link: http://lkml.kernel.org/r/e02cce7b-d9ca-1ad0-7a61-ea97c7582b37@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a906f20fba7..44817c640e99 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2016,6 +2016,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) success = 1; /* we're going to change ->state */ cpu = task_cpu(p); + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] P->state + * UNLOCK rq->lock -----. + * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. + */ + smp_rmb(); if (p->on_rq && ttwu_remote(p, wake_flags)) goto stat; -- cgit v1.2.3 From c86d06ba2818c5126078cb0cf4e0175ec381045b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Sep 2016 08:38:13 -0400 Subject: PM / QoS: avoid calling cancel_delayed_work_sync() during early boot of_clk_init() ends up calling into pm_qos_update_request() very early during boot, where irq is expected to stay disabled. pm_qos_update_request() uses cancel_delayed_work_sync(), which correctly assumes that irq is enabled on invocation and unconditionally disables and re-enables it. Gate the cancel_delayed_work_sync() invocation with keventd_up() to avoid enabling irq unexpectedly during early boot. Signed-off-by: Tejun Heo Reported-and-tested-by: Qiao Zhou Link: http://lkml.kernel.org/r/d2501c4c-8e7b-bea3-1b01-000b36b5dfe9@asrmicro.com Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..168ff442ebde 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - cancel_delayed_work_sync(&req->work); + /* + * This function may be called very early during boot, for example, + * from of_clk_init(), where irq needs to stay disabled. + * cancel_delayed_work_sync() assumes that irq is enabled on + * invocation and re-enables it on return. Avoid calling it until + * workqueue is initialized. + */ + if (keventd_up()) + cancel_delayed_work_sync(&req->work); + __pm_qos_update_request(req, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); -- cgit v1.2.3 From 9049771f7d5490a302589976984810064c83ab40 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 7 Sep 2016 08:51:21 -0700 Subject: mm: fix cache mode of dax pmd mappings track_pfn_insert() in vmf_insert_pfn_pmd() is marking dax mappings as uncacheable, rendering them impractical for application usage. DAX-pte mappings are cached, and the goal of establishing DAX-pmd mappings is to attain more performance, not dramatically less (by three orders of magnitude).
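[ For context: a minimal userspace sketch of the kind of mapping this fix is about. The file path and the 2 MiB alignment are illustrative assumptions; whether the kernel actually installs a PMD depends on the filesystem and on the alignment of both the file offset and the virtual address. ]

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define PMD_SIZE (2UL << 20)	/* 2 MiB: x86-64 PMD-sized mapping */

int main(void)
{
	/* Assumed path: any file on a DAX-mounted filesystem. */
	int fd = open("/mnt/dax/data", O_RDWR);
	void *p;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * A 2 MiB-aligned offset and length give the kernel the chance
	 * to install a PMD mapping; writes through it should then hit
	 * write-back-cached memory, not an uncacheable range.
	 */
	p = mmap(NULL, PMD_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}
	memset(p, 0, PMD_SIZE);	/* fault in the mapping */
	munmap(p, PMD_SIZE);
	close(fd);
	return 0;
}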
track_pfn_insert() relies on a previous call to reserve_memtype() to establish the expected page_cache_mode for the range. While memremap() arranges for reserve_memtype() to be called, devm_memremap_pages() does not. So, teach track_pfn_insert() and untrack_pfn() how to handle tracking without a vma, and arrange for devm_memremap_pages() to establish the write-back-cache reservation in the memtype tree. Cc: Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Nilesh Choudhury Cc: Kirill A. Shutemov Reported-by: Toshi Kani Reported-by: Kai Zhang Acked-by: Andrew Morton Signed-off-by: Dan Williams --- arch/x86/mm/pat.c | 17 ++++++++++------- kernel/memremap.c | 9 +++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ecb1b69c1651..170cc4ff057b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -927,9 +927,10 @@ int track_pfn_copy(struct vm_area_struct *vma) } /* - * prot is passed in as a parameter for the new mapping. If the vma has a - * linear pfn mapping for the entire range reserve the entire vma range with - * single reserve_pfn_range call. + * prot is passed in as a parameter for the new mapping. If the vma has + * a linear pfn mapping for the entire range, or no vma is provided, + * reserve the entire pfn + size range with single reserve_pfn_range + * call. */ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) @@ -938,11 +939,12 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, enum page_cache_mode pcm; /* reserve the whole chunk starting from paddr */ - if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + if (!vma || (addr == vma->vm_start + && size == (vma->vm_end - vma->vm_start))) { int ret; ret = reserve_pfn_range(paddr, size, prot, 0); - if (!ret) + if (ret == 0 && vma) vma->vm_flags |= VM_PAT; return ret; } @@ -997,7 +999,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long prot; - if (!(vma->vm_flags & VM_PAT)) + if (vma && !(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ @@ -1011,7 +1013,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); - vma->vm_flags &= ~VM_PAT; + if (vma) + vma->vm_flags &= ~VM_PAT; } /* diff --git a/kernel/memremap.c b/kernel/memremap.c index 251d16b4cb41..b501e390bb34 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); @@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { resource_size_t key, align_start, align_size, align_end; + pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; @@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); + error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, + align_size); + if (error) + goto err_pfn_remap; + 
error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; @@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, return __va(res->start); err_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + err_pfn_remap: err_radix: pgmap_radix_release(res); devres_free(page_map); -- cgit v1.2.3 From 767ae08678c2c796bcd7f582ee457aee20a28a1e Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Sep 2016 16:23:49 +0300 Subject: perf/core: Fix a race between mmap_close() and set_output() of AUX events In the mmap_close() path we need to stop all the AUX events that are writing data to the AUX area that we are unmapping, before we can safely free the pages. To determine if an event needs to be stopped, we're comparing its ->rb against the one that's getting unmapped. However, a SET_OUTPUT ioctl may turn up inside an AUX transaction and swizzle event::rb to some other ring buffer, but the transaction will keep writing data to the old ring buffer until the event gets scheduled out. At this point, mmap_close() will skip over such an event and will proceed to free the AUX area, while it's still being used by this event, which will set off a warning in the mmap_close() path and cause memory corruption. To avoid this, always stop an AUX event before its ->rb is updated; this will release the (potentially) last reference on the AUX area of the buffer. If the event gets restarted, its new ring buffer will be used. If another SET_OUTPUT comes and switches it back to the old ring buffer that's getting unmapped, it's also fine: this ring buffer's aux_mmap_count will be zero and AUX transactions won't start anymore. Reported-by: Vince Weaver Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160906132353.19887-2-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 07ac8596a728..a54f2c2cdb20 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2496,11 +2496,11 @@ static int __perf_event_stop(void *info) return 0; } -static int perf_event_restart(struct perf_event *event) +static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, - .restart = 1, + .restart = restart, }; int ret = 0; @@ -4845,6 +4845,19 @@ static void ring_buffer_attach(struct perf_event *event, spin_unlock_irqrestore(&rb->event_lock, flags); } + /* + * Avoid racing with perf_mmap_close(AUX): stop the event + * before swizzling the event::rb pointer; if it's getting + * unmapped, its aux_mmap_count will be 0 and it won't + * restart. See the comment in __perf_pmu_output_stop(). + * + * Data will inevitably be lost when set_output is done in + * mid-air, but then again, whoever does it like this is + * not in for the data anyway.
+ */ if (has_aux(event)) perf_event_stop(event, 0); + rcu_assign_pointer(event->rb, rb); if (old_rb) { @@ -6120,7 +6133,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) - perf_event_restart(event); + perf_event_stop(event, 1); } void perf_event_exec(void) @@ -6164,7 +6177,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) /* * In case of inheritance, it will be the parent that links to the - * ring-buffer, but it will be the child that's actually using it: + * ring-buffer, but it will be the child that's actually using it. + * + * We are using event::rb to determine if the event should be stopped, + * however this may race with ring_buffer_attach() (through set_output), + * which will make us skip the event that actually needs to be stopped. + * So ring_buffer_attach() has to stop an aux event before re-assigning + * its rb pointer. */ if (rcu_dereference(parent->rb) == rb) ro->err = __perf_event_stop(&sd); @@ -6678,7 +6697,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) - perf_event_restart(event); + perf_event_stop(event, 1); } /* @@ -7867,7 +7886,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) mmput(mm); restart: - perf_event_restart(event); + perf_event_stop(event, 1); } /* -- cgit v1.2.3 From b79ccadd6bb10e72cf784a298ca6dc1398eb9a24 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 6 Sep 2016 16:23:50 +0300 Subject: perf/core: Fix aux_mmap_count vs aux_refcount order The order of accesses to the ring buffer's aux_mmap_count and aux_refcount has to be preserved across the users, namely perf_mmap_close() and perf_aux_output_begin(); otherwise the inversion can result in the latter holding the last reference to the aux buffer and subsequently freeing it in atomic context, triggering a warning. > ------------[ cut here ]------------ > WARNING: CPU: 0 PID: 257 at kernel/events/ring_buffer.c:541 __rb_free_aux+0x11a/0x130 > CPU: 0 PID: 257 Comm: stopbug Not tainted 4.8.0-rc1+ #2596 > Call Trace: > [] __warn+0xcb/0xf0 > [] warn_slowpath_null+0x1d/0x20 > [] __rb_free_aux+0x11a/0x130 > [] rb_free_aux+0x18/0x20 > [] perf_aux_output_begin+0x163/0x1e0 > [] bts_event_start+0x3a/0xd0 > [] bts_event_add+0x5d/0x80 > [] event_sched_in.isra.104+0xf6/0x2f0 > [] group_sched_in+0x6e/0x190 > [] ctx_sched_in+0x2fe/0x5f0 > [] perf_event_sched_in+0x60/0x80 > [] ctx_resched+0x5b/0x90 > [] __perf_event_enable+0x1e1/0x240 > [] event_function+0xa9/0x180 > [] ? perf_cgroup_attach+0x70/0x70 > [] remote_function+0x3f/0x50 > [] flush_smp_call_function_queue+0x83/0x150 > [] generic_smp_call_function_single_interrupt+0x13/0x60 > [] smp_call_function_single_interrupt+0x27/0x40 > [] call_function_single_interrupt+0x89/0x90 > [] finish_task_switch+0xa6/0x210 > [] ? finish_task_switch+0x67/0x210 > [] __schedule+0x3dd/0xb50 > [] schedule+0x35/0x80 > [] sys_sched_yield+0x61/0x70 > [] entry_SYSCALL_64_fastpath+0x18/0xa8 > ---[ end trace 6235f556f5ea83a9 ]--- This patch puts the checks in perf_aux_output_begin() in the same order as in perf_mmap_close().
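[ For readers unfamiliar with the inc-not-zero idiom this fix relies on, a minimal C11 sketch of the pattern, illustration only, not kernel code: a reference is taken only while the count is still non-zero, so an object whose last reference has been dropped can never be resurrected. ]

#include <stdatomic.h>
#include <stdbool.h>

/* Take a reference only if the object is still live (count > 0). */
static bool ref_get_unless_zero(atomic_int *refs)
{
	int old = atomic_load(refs);

	do {
		if (old == 0)
			return false;	/* teardown already owns it */
	} while (!atomic_compare_exchange_weak(refs, &old, old + 1));

	return true;
}

/* Drop a reference; the caller that hits zero frees the object. */
static bool ref_put(atomic_int *refs)
{
	return atomic_fetch_sub(refs, 1) == 1;
}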
Reported-by: Vince Weaver Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: vince@deater.net Link: http://lkml.kernel.org/r/20160906132353.19887-3-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ae9b90dc9a5a..257fa460b846 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!rb) return NULL; - if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) + if (!rb_has_aux(rb)) goto err; /* - * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), - * the aux buffer is in perf_mmap_close(), about to get freed. + * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), + * about to get freed, so we leave immediately. + * + * Checking rb::aux_mmap_count and rb::refcount has to be done in + * the same order, see perf_mmap_close. Otherwise we end up freeing + * aux pages in this path, which is a bug, because in_atomic(). */ if (!atomic_read(&rb->aux_mmap_count)) - goto err_put; + goto err; + + if (!atomic_inc_not_zero(&rb->aux_refcount)) + goto err; /* * Nesting is not supported for AUX area, make sure nested -- cgit v1.2.3