From 610467270fb368584b74567edd21c8cc5104490f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 8 Jul 2017 07:17:02 -0400 Subject: cgroup: don't call migration methods if there are no tasks to migrate Subsystem migration methods shouldn't be called for empty migrations. cgroup_migrate_execute() implements this guarantee by bailing early if there are no source css_sets. This used to be correct before a79a908fd2b0 ("cgroup: introduce cgroup namespaces"), but no longer since the commit because css_sets can stay pinned without tasks in them. This caused cgroup_migrate_execute() call into cpuset migration methods with an empty cgroup_taskset. cpuset migration methods correctly assume that cgroup_taskset_first() never returns NULL; however, due to the bug, it can, leading to the following oops. Unable to handle kernel paging request for data at address 0x00000960 Faulting instruction address: 0xc0000000001d6868 Oops: Kernel access of bad area, sig: 11 [#1] ... CPU: 14 PID: 16947 Comm: kworker/14:0 Tainted: G W 4.12.0-rc4-next-20170609 #2 Workqueue: events cpuset_hotplug_workfn task: c00000000ca60580 task.stack: c00000000c728000 NIP: c0000000001d6868 LR: c0000000001d6858 CTR: c0000000001d6810 REGS: c00000000c72b720 TRAP: 0300 Tainted: GW (4.12.0-rc4-next-20170609) MSR: 8000000000009033 CR: 44722422 XER: 20000000 CFAR: c000000000008710 DAR: 0000000000000960 DSISR: 40000000 SOFTE: 1 GPR00: c0000000001d6858 c00000000c72b9a0 c000000001536e00 0000000000000000 GPR04: c00000000c72b9c0 0000000000000000 c00000000c72bad0 c000000766367678 GPR08: c000000766366d10 c00000000c72b958 c000000001736e00 0000000000000000 GPR12: c0000000001d6810 c00000000e749300 c000000000123ef8 c000000775af4180 GPR16: 0000000000000000 0000000000000000 c00000075480e9c0 c00000075480e9e0 GPR20: c00000075480e8c0 0000000000000001 0000000000000000 c00000000c72ba20 GPR24: c00000000c72baa0 c00000000c72bac0 c000000001407248 c00000000c72ba20 GPR28: c00000000141fc80 c00000000c72bac0 c00000000c6bc790 0000000000000000 NIP [c0000000001d6868] cpuset_can_attach+0x58/0x1b0 LR [c0000000001d6858] cpuset_can_attach+0x48/0x1b0 Call Trace: [c00000000c72b9a0] [c0000000001d6858] cpuset_can_attach+0x48/0x1b0 (unreliable) [c00000000c72ba00] [c0000000001cbe80] cgroup_migrate_execute+0xb0/0x450 [c00000000c72ba80] [c0000000001d3754] cgroup_transfer_tasks+0x1c4/0x360 [c00000000c72bba0] [c0000000001d923c] cpuset_hotplug_workfn+0x86c/0xa20 [c00000000c72bca0] [c00000000011aa44] process_one_work+0x1e4/0x580 [c00000000c72bd30] [c00000000011ae78] worker_thread+0x98/0x5c0 [c00000000c72bdc0] [c000000000124058] kthread+0x168/0x1b0 [c00000000c72be30] [c00000000000b2e8] ret_from_kernel_thread+0x5c/0x74 Instruction dump: f821ffa1 7c7d1b78 60000000 60000000 38810020 7fa3eb78 3f42ffed 4bff4c25 60000000 3b5a0448 3d420020 eb610020 7f43d378 e9290000 f92af200 ---[ end trace dcaaf98fb36d9e64 ]--- This patch fixes the bug by adding an explicit nr_tasks counter to cgroup_taskset and skipping calling the migration methods if the counter is zero. While at it, remove the now spurious check on no source css_sets. Signed-off-by: Tejun Heo Reported-and-tested-by: Abdul Haleem Cc: Roman Gushchin Cc: stable@vger.kernel.org # v4.6+ Fixes: a79a908fd2b0 ("cgroup: introduce cgroup namespaces") Link: http://lkml.kernel.org/r/1497266622.15415.39.camel@abdul.in.ibm.com --- kernel/cgroup/cgroup-internal.h | 3 +++ kernel/cgroup/cgroup.c | 58 ++++++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 793565c05742..8b4c3c2f2509 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -33,6 +33,9 @@ struct cgroup_taskset { struct list_head src_csets; struct list_head dst_csets; + /* the number of tasks in the set */ + int nr_tasks; + /* the subsys currently being processed */ int ssid; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 620794a20a33..cc53111072d8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2006,6 +2006,8 @@ static void cgroup_migrate_add_task(struct task_struct *task, if (!cset->mg_src_cgrp) return; + mgctx->tset.nr_tasks++; + list_move_tail(&task->cg_list, &cset->mg_tasks); if (list_empty(&cset->mg_node)) list_add_tail(&cset->mg_node, @@ -2094,21 +2096,19 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) struct css_set *cset, *tmp_cset; int ssid, failed_ssid, ret; - /* methods shouldn't be called if no task is actually migrating */ - if (list_empty(&tset->src_csets)) - return 0; - /* check that we can legitimately attach to the cgroup */ - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ss->can_attach) { - tset->ssid = ssid; - ret = ss->can_attach(tset); - if (ret) { - failed_ssid = ssid; - goto out_cancel_attach; + if (tset->nr_tasks) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { + if (ss->can_attach) { + tset->ssid = ssid; + ret = ss->can_attach(tset); + if (ret) { + failed_ssid = ssid; + goto out_cancel_attach; + } } - } - } while_each_subsys_mask(); + } while_each_subsys_mask(); + } /* * Now that we're guaranteed success, proceed to move all tasks to @@ -2137,25 +2137,29 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) */ tset->csets = &tset->dst_csets; - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ss->attach) { - tset->ssid = ssid; - ss->attach(tset); - } - } while_each_subsys_mask(); + if (tset->nr_tasks) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { + if (ss->attach) { + tset->ssid = ssid; + ss->attach(tset); + } + } while_each_subsys_mask(); + } ret = 0; goto out_release_tset; out_cancel_attach: - do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { - if (ssid == failed_ssid) - break; - if (ss->cancel_attach) { - tset->ssid = ssid; - ss->cancel_attach(tset); - } - } while_each_subsys_mask(); + if (tset->nr_tasks) { + do_each_subsys_mask(ss, ssid, mgctx->ss_mask) { + if (ssid == failed_ssid) + break; + if (ss->cancel_attach) { + tset->ssid = ssid; + ss->cancel_attach(tset); + } + } while_each_subsys_mask(); + } out_release_tset: spin_lock_irq(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); -- cgit v1.2.3 From 7af608e4f9530372aec6e940552bf76595f2e265 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jul 2017 17:57:46 -0400 Subject: cgroup: create dfl_root files on subsys registration On subsystem registration, css_populate_dir() is not called on the new root css, so the interface files for the subsystem on cgrp_dfl_root aren't created on registration. This is a residue from the days when cgrp_dfl_root was used only as the parking spot for unused subsystems, which no longer is true as it's used as the root for cgroup2. This is often fine as later operations tend to create them as a part of mount (cgroup1) or subtree_control operations (cgroup2); however, it's not difficult to mount cgroup2 with the controller interface files missing as Waiman found out. Fix it by invoking css_populate_dir() on the root css on subsys registration. Signed-off-by: Tejun Heo Reported-and-tested-by: Waiman Long Cc: stable@vger.kernel.org # v4.5+ Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index cc53111072d8..744975947d01 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4673,6 +4673,10 @@ int __init cgroup_init(void) if (ss->bind) ss->bind(init_css_set.subsys[ssid]); + + mutex_lock(&cgroup_mutex); + css_populate_dir(init_css_set.subsys[ssid]); + mutex_unlock(&cgroup_mutex); } /* init_css_set.subsys[] has been updated, re-hash */ -- cgit v1.2.3 From 5c0338c68706be53b3dc472e4308961c36e4ece1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Jul 2017 18:41:52 -0400 Subject: workqueue: restore WQ_UNBOUND/max_active==1 to be ordered The combination of WQ_UNBOUND and max_active == 1 used to imply ordered execution. After NUMA affinity 4c16bd327c74 ("workqueue: implement NUMA affinity for unbound workqueues"), this is no longer true due to per-node worker pools. While the right way to create an ordered workqueue is alloc_ordered_workqueue(), the documentation has been misleading for a long time and people do use WQ_UNBOUND and max_active == 1 for ordered workqueues which can lead to subtle bugs which are very difficult to trigger. It's unlikely that we'd see noticeable performance impact by enforcing ordering on WQ_UNBOUND / max_active == 1 workqueues. Let's automatically set __WQ_ORDERED for those workqueues. Signed-off-by: Tejun Heo Reported-by: Christoph Hellwig Reported-by: Alexei Potashnik Fixes: 4c16bd327c74 ("workqueue: implement NUMA affinity for unbound workqueues") Cc: stable@vger.kernel.org # v3.10+ --- kernel/workqueue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a86688fabc55..abe4a4971c24 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3929,6 +3929,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct workqueue_struct *wq; struct pool_workqueue *pwq; + /* + * Unbound && max_active == 1 used to imply ordered, which is no + * longer the case on NUMA machines due to per-node pools. While + * alloc_ordered_workqueue() is the right way to create an ordered + * workqueue, keep the previous behavior to avoid subtle breakages + * on NUMA. + */ + if ((flags & WQ_UNBOUND) && max_active == 1) + flags |= __WQ_ORDERED; + /* see the comment above the definition of WQ_POWER_EFFICIENT */ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient) flags |= WQ_UNBOUND; -- cgit v1.2.3 From 3c74541777302eec43a0d1327c4d58b8659a776b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jul 2017 08:14:15 -0400 Subject: cgroup: fix error return value from cgroup_subtree_control() While refactoring, f7b2814bb9b6 ("cgroup: factor out cgroup_{apply|finalize}_control() from cgroup_subtree_control_write()") broke error return value from the function. The return value from the last operation is always overridden to zero. Fix it. Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.6+ Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 744975947d01..df2e0f14a95d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3001,11 +3001,11 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, cgrp->subtree_control &= ~disable; ret = cgroup_apply_control(cgrp); - cgroup_finalize_control(cgrp, ret); + if (ret) + goto out_unlock; kernfs_activate(cgrp->kn); - ret = 0; out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; -- cgit v1.2.3 From 9305706c2e808ae59f1eb201867f82f1ddf6d7a6 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 21 Jul 2017 14:37:34 +0100 Subject: bpf/verifier: fix min/max handling in BPF_SUB We have to subtract the src max from the dst min, and vice-versa, since (e.g.) the smallest result comes from the largest subtrahend. Fixes: 484611357c19 ("bpf: allow access into map value arrays") Signed-off-by: Edward Cree Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/verifier.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index af9e84a4944e..664d93972373 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1865,10 +1865,12 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, * do our normal operations to the register, we need to set the values * to the min/max since they are undefined. */ - if (min_val == BPF_REGISTER_MIN_RANGE) - dst_reg->min_value = BPF_REGISTER_MIN_RANGE; - if (max_val == BPF_REGISTER_MAX_RANGE) - dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (opcode != BPF_SUB) { + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + } switch (opcode) { case BPF_ADD: @@ -1879,10 +1881,17 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env, dst_reg->min_align = min(src_align, dst_align); break; case BPF_SUB: + /* If one of our values was at the end of our ranges, then the + * _opposite_ value in the dst_reg goes to the end of our range. + */ + if (min_val == BPF_REGISTER_MIN_RANGE) + dst_reg->max_value = BPF_REGISTER_MAX_RANGE; + if (max_val == BPF_REGISTER_MAX_RANGE) + dst_reg->min_value = BPF_REGISTER_MIN_RANGE; if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE) - dst_reg->min_value -= min_val; + dst_reg->min_value -= max_val; if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE) - dst_reg->max_value -= max_val; + dst_reg->max_value -= min_val; dst_reg->min_align = min(src_align, dst_align); break; case BPF_MUL: -- cgit v1.2.3 From bf50f0e8a03005d19de66d01261d855cdeedf572 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 24 Jul 2017 13:56:28 -0600 Subject: sched/core: Fix some documentation build warnings The kerneldoc comments for try_to_wake_up_local() were out of date, leading to these documentation build warnings: ./kernel/sched/core.c:2080: warning: No description found for parameter 'rf' ./kernel/sched/core.c:2080: warning: Excess function parameter 'cookie' description in 'try_to_wake_up_local' Update the comment to reflect current reality and give us some peace and quiet. Signed-off-by: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20170724135628.695cecfc@lwn.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 17c667b427b4..0869b20fba81 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2069,7 +2069,7 @@ out: /** * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened - * @cookie: context's cookie for pinning + * @rf: request-queue flags for pinning * * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not -- cgit v1.2.3 From 0a94efb5acbb6980d7c9ab604372d93cd507e4d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 23 Jul 2017 08:36:15 -0400 Subject: workqueue: implicit ordered attribute should be overridable 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") automatically enabled ordered attribute for unbound workqueues w/ max_active == 1. Because ordered workqueues reject max_active and some attribute changes, this implicit ordered mode broke cases where the user creates an unbound workqueue w/ max_active == 1 and later explicitly changes the related attributes. This patch distinguishes explicit and implicit ordered setting and overrides from attribute changes if implict. Signed-off-by: Tejun Heo Fixes: 5c0338c68706 ("workqueue: restore WQ_UNBOUND/max_active==1 to be ordered") --- include/linux/workqueue.h | 4 +++- kernel/workqueue.c | 13 +++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index c102ef65cb64..db6dc9dc0482 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -323,6 +323,7 @@ enum { __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ + __WQ_ORDERED_EXPLICIT = 1 << 18, /* internal: alloc_ordered_workqueue() */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ @@ -422,7 +423,8 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ - alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) + alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | \ + __WQ_ORDERED_EXPLICIT | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index abe4a4971c24..7146ea70a62d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3744,8 +3744,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; /* creating multiple pwqs breaks ordering guarantee */ - if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) - return -EINVAL; + if (!list_empty(&wq->pwqs)) { + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) + return -EINVAL; + + wq->flags &= ~__WQ_ORDERED; + } ctx = apply_wqattrs_prepare(wq, attrs); if (!ctx) @@ -4129,13 +4133,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) struct pool_workqueue *pwq; /* disallow meddling with max_active for ordered workqueues */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); mutex_lock(&wq->mutex); + wq->flags &= ~__WQ_ORDERED; wq->saved_max_active = max_active; for_each_pwq(pwq, wq) @@ -5263,7 +5268,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) * attributes breaks ordering guarantee. Disallow exposing ordered * workqueues. */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) + if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT)) return -EINVAL; wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); -- cgit v1.2.3 From 8397913303abc9333f376a518a8368fa22ca5e6e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 27 Jul 2017 12:21:11 +0200 Subject: genirq/cpuhotplug: Revert "Set force affinity flag on hotplug migration" That commit was part of the changes moving x86 to the generic CPU hotplug interrupt migration code. The force flag was required on x86 before the hierarchical irqdomain rework, but invoking set_affinity() with force=true stayed and had no side effects. At some point in the past, the force flag got repurposed to support the exynos timer interrupt affinity setting to a not yet online CPU, so the interrupt controller callback does not verify the supplied affinity mask against cpu_online_mask. Setting the flag in the CPU hotplug code causes the cpu online masking to be blocked on these irq controllers and results in potentially affining an interrupt to the CPU which is unplugged, i.e. instead of moving it away, it's just reassigned to it. As the force flags is not longer needed on x86, it's safe to revert that patch so the ARM irqchips which use the force flag work again. Add comments to that effect, so this won't happen again. Note: The online mask handling should be done in the generic code and the force flag and the masking in the irq chips removed all together, but that's not a change possible for 4.13. Fixes: 77f85e66aa8b ("genirq/cpuhotplug: Set force affinity flag on hotplug migration") Reported-by: Will Deacon Signed-off-by: Thomas Gleixner Acked-by: Will Deacon Cc: Marc Zyngier Cc: Russell King Cc: LAK Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1707271217590.3109@nanos Signed-off-by: Thomas Gleixner --- include/linux/irq.h | 7 ++++++- kernel/irq/cpuhotplug.c | 9 +++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index 00db35b61e9e..d2d543794093 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -388,7 +388,12 @@ static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d) * @irq_mask_ack: ack and mask an interrupt source * @irq_unmask: unmask an interrupt source * @irq_eoi: end of interrupt - * @irq_set_affinity: set the CPU affinity on SMP machines + * @irq_set_affinity: Set the CPU affinity on SMP machines. If the force + * argument is true, it tells the driver to + * unconditionally apply the affinity setting. Sanity + * checks against the supplied affinity mask are not + * required. This is used for CPU hotplug where the + * target CPU is not yet set in the cpu_online_mask. * @irq_retrigger: resend an IRQ to the CPU * @irq_set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ * @irq_set_wake: enable/disable power-management wake-on of an IRQ diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index aee8f7ec40af..638eb9c83d9f 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c @@ -95,8 +95,13 @@ static bool migrate_one_irq(struct irq_desc *desc) affinity = cpu_online_mask; brokeaff = true; } - - err = irq_do_set_affinity(d, affinity, true); + /* + * Do not set the force argument of irq_do_set_affinity() as this + * disables the masking of offline CPUs from the supplied affinity + * mask and therefore might keep/reassign the irq to the outgoing + * CPU. + */ + err = irq_do_set_affinity(d, affinity, false); if (err) { pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n", d->irq, err); -- cgit v1.2.3 From 1ad0f0a7aa1bf3bd42dcd108a96713d255eacd9f Mon Sep 17 00:00:00 2001 From: Michael Bringmann Date: Thu, 27 Jul 2017 16:27:14 -0500 Subject: workqueue: Work around edge cases for calc of pool's cpumask There is an underlying assumption/trade-off in many layers of the Linux system that CPU <-> node mapping is static. This is despite the presence of features like NUMA and 'hotplug' that support the dynamic addition/ removal of fundamental system resources like CPUs and memory. PowerPC systems, however, do provide extensive features for the dynamic change of resources available to a system. Currently, there is little or no synchronization protection around the updating of the CPU <-> node mapping, and the export/update of this information for other layers / modules. In systems which can change this mapping during 'hotplug', like PowerPC, the information is changing underneath all layers that might reference it. This patch attempts to ensure that a valid, usable cpumask attribute is used by the workqueue infrastructure when setting up new resource pools. It prevents a crash that has been observed when an 'empty' cpumask is passed along to the worker/task scheduling code. It is intended as a temporary workaround until a more fundamental review and correction of the issue can be done. [With additions to the patch provided by Tejun Hao ] Signed-off-by: Michael Bringmann Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7146ea70a62d..ca937b0c3a96 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3577,6 +3577,13 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, /* yeap, return possible CPUs in @node that @attrs wants */ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + + if (cpumask_empty(cpumask)) { + pr_warn_once("WARNING: workqueue cpumask: online intersect > " + "possible intersect\n"); + return false; + } + return !cpumask_equal(cpumask, attrs->cpumask); use_dfl: -- cgit v1.2.3 From 89b096898a8450b0a5b97d521e000ae9f94f81f9 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 27 Jul 2017 21:02:46 +0200 Subject: bpf: don't indicate success when copy_from_user fails err in bpf_prog_get_info_by_fd() still holds 0 at that time from prior check_uarg_tail_zero() check. Explicitly return -EFAULT instead, so user space can be notified of buggy behavior. Fixes: 1e2709769086 ("bpf: Add BPF_OBJ_GET_INFO_BY_FD") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 045646da97cc..84bb39975ad4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1289,7 +1289,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info_len = min_t(u32, sizeof(info), info_len); if (copy_from_user(&info, uinfo, info_len)) - return err; + return -EFAULT; info.type = prog->type; info.id = prog->aux->id; -- cgit v1.2.3 From 9975a54b3c9ecf029cbf5dd7a8c9701b1d74029e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 28 Jul 2017 17:05:25 +0200 Subject: bpf: fix bpf_prog_get_info_by_fd to dump correct xlated_prog_len bpf_prog_size(prog->len) is not the correct length we want to dump back to user space. The code in bpf_prog_get_info_by_fd() uses this to copy prog->insnsi to user space, but bpf_prog_size(prog->len) also includes the size of struct bpf_prog itself plus program instructions and is usually used either in context of accounting or for bpf_prog_alloc() et al, thus we copy out of bounds in bpf_prog_get_info_by_fd() potentially. Use the correct bpf_prog_insn_size() instead. Fixes: 1e2709769086 ("bpf: Add BPF_OBJ_GET_INFO_BY_FD") Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 84bb39975ad4..6c772adabad2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1312,7 +1312,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, } ulen = info.xlated_prog_len; - info.xlated_prog_len = bpf_prog_size(prog->len); + info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); -- cgit v1.2.3 From 34f41c0316ed52b0b44542491d89278efdaa70e4 Mon Sep 17 00:00:00 2001 From: Matija Glavinic Pecotic Date: Tue, 1 Aug 2017 09:11:52 +0200 Subject: timers: Fix overflow in get_next_timer_interrupt For e.g. HZ=100, timer being 430 jiffies in the future, and 32 bit unsigned int, there is an overflow on unsigned int right-hand side of the expression which results with wrong values being returned. Type cast the multiplier to 64bit to avoid that issue. Fixes: 46c8f0b077a8 ("timers: Fix get_next_timer_interrupt() computation") Signed-off-by: Matija Glavinic Pecotic Signed-off-by: Thomas Gleixner Reviewed-by: Alexander Sverdlin Cc: khilman@baylibre.com Cc: akpm@linux-foundation.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/a7900f04-2a21-c9fd-67be-ab334d459ee5@nokia.com --- kernel/time/timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 71ce3f4eead3..8f5d1bf18854 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1495,7 +1495,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) base->is_idle = false; } else { if (!is_max_delta) - expires = basem + (nextevt - basej) * TICK_NSEC; + expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* * If we expect to sleep more than a tick, mark the base idle: */ -- cgit v1.2.3 From 4bb0f0e73c8c30917d169c4a0f1ac083690c545b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 1 Aug 2017 12:01:52 -0400 Subject: tracing: Call clear_boot_tracer() at lateinit_sync The clear_boot_tracer function is used to reset the default_bootup_tracer string to prevent it from being accessed after boot, as it originally points to init data. But since clear_boot_tracer() is called via the init_lateinit() call, it races with the initcall for registering the hwlat tracer. If someone adds "ftrace=hwlat" to the kernel command line, depending on how the linker sets up the text, the saved command line may be cleared, and the hwlat tracer never is initialized. Simply have the clear_boot_tracer() be called by initcall_lateinit_sync() as that's for tasks to be called after lateinit. Link: https://bugzilla.kernel.org/show_bug.cgi?id=196551 Cc: stable@vger.kernel.org Fixes: e7c15cd8a ("tracing: Added hardware latency tracer") Reported-by: Zamir SUN Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 42b9355033d4..784fb43b2abe 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8407,4 +8407,4 @@ __init static int clear_boot_tracer(void) } fs_initcall(tracer_init_tracefs); -late_initcall(clear_boot_tracer); +late_initcall_sync(clear_boot_tracer); -- cgit v1.2.3 From 147d88e0b5eb90191bc5c12ca0a3c410b75a13d2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 1 Aug 2017 14:02:01 +0300 Subject: tracing: Missing error code in tracer_alloc_buffers() If ring_buffer_alloc() or one of the next couple function calls fail then we should return -ENOMEM but the current code returns success. Link: http://lkml.kernel.org/r/20170801110201.ajdkct7vwzixahvx@mwanda Cc: Sebastian Andrzej Siewior Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: b32614c03413 ('tracing/rb: Convert to hotplug state machine') Signed-off-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 784fb43b2abe..d815fc317e9d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8293,6 +8293,7 @@ __init static int tracer_alloc_buffers(void) if (ret < 0) goto out_free_cpumask; /* Used for event triggers */ + ret = -ENOMEM; temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE); if (!temp_buffer) goto out_rm_hp_state; -- cgit v1.2.3 From a7e52ad7ed82e21273eccff93d1477a7b313aabb Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 2 Aug 2017 14:20:54 -0400 Subject: ring-buffer: Have ring_buffer_alloc_read_page() return error on offline CPU Chunyu Hu reported: "per_cpu trace directories and files are created for all possible cpus, but only the cpus which have ever been on-lined have their own per cpu ring buffer (allocated by cpuhp threads). While trace_buffers_open, the open handler for trace file 'trace_pipe_raw' is always trying to access field of ring_buffer_per_cpu, and would panic with the NULL pointer. Align the behavior of trace_pipe_raw with trace_pipe, that returns -NODEV when openning it if that cpu does not have trace ring buffer. Reproduce: cat /sys/kernel/debug/tracing/per_cpu/cpu31/trace_pipe_raw (cpu31 is never on-lined, this is a 16 cores x86_64 box) Tested with: 1) boot with maxcpus=14, read trace_pipe_raw of cpu15. Got -NODEV. 2) oneline cpu15, read trace_pipe_raw of cpu15. Get the raw trace data. Call trace: [ 5760.950995] RIP: 0010:ring_buffer_alloc_read_page+0x32/0xe0 [ 5760.961678] tracing_buffers_read+0x1f6/0x230 [ 5760.962695] __vfs_read+0x37/0x160 [ 5760.963498] ? __vfs_read+0x5/0x160 [ 5760.964339] ? security_file_permission+0x9d/0xc0 [ 5760.965451] ? __vfs_read+0x5/0x160 [ 5760.966280] vfs_read+0x8c/0x130 [ 5760.967070] SyS_read+0x55/0xc0 [ 5760.967779] do_syscall_64+0x67/0x150 [ 5760.968687] entry_SYSCALL64_slow_path+0x25/0x25" This was introduced by the addition of the feature to reuse reader pages instead of re-allocating them. The problem is that the allocation of a reader page (which is per cpu) does not check if the cpu is online and set up for the ring buffer. Link: http://lkml.kernel.org/r/1500880866-1177-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 73a757e63114 ("ring-buffer: Return reader page back into existing ring buffer") Reported-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 14 +++++++++----- kernel/trace/ring_buffer_benchmark.c | 2 +- kernel/trace/trace.c | 16 +++++++++++----- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 529cc50d7243..81279c6602ff 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); * the page that was allocated, with the read page of the buffer. * * Returns: - * The page allocated, or NULL on error. + * The page allocated, or ERR_PTR */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { - struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_per_cpu *cpu_buffer; struct buffer_data_page *bpage = NULL; unsigned long flags; struct page *page; + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return ERR_PTR(-ENODEV); + + cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); @@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) - return NULL; + return ERR_PTR(-ENOMEM); bpage = page_address(page); @@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); * * for example: * rpage = ring_buffer_alloc_read_page(buffer, cpu); - * if (!rpage) - * return error; + * if (IS_ERR(rpage)) + * return PTR_ERR(rpage); * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0); * if (ret >= 0) * process_page(rpage, ret); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 9fbcaf567886..68ee79afe31c 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -113,7 +113,7 @@ static enum event_status read_page(int cpu) int i; bpage = ring_buffer_alloc_read_page(buffer, cpu); - if (!bpage) + if (IS_ERR(bpage)) return EVENT_DROPPED; ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d815fc317e9d..44004d8aa3b3 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6598,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, { struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; - ssize_t ret; + ssize_t ret = 0; ssize_t size; if (!count) @@ -6612,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (!info->spare) { info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); - info->spare_cpu = iter->cpu_file; + if (IS_ERR(info->spare)) { + ret = PTR_ERR(info->spare); + info->spare = NULL; + } else { + info->spare_cpu = iter->cpu_file; + } } if (!info->spare) - return -ENOMEM; + return ret; /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) @@ -6790,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ref->ref = 1; ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); - if (!ref->page) { - ret = -ENOMEM; + if (IS_ERR(ref->page)) { + ret = PTR_ERR(ref->page); + ref->page = NULL; kfree(ref); break; } -- cgit v1.2.3 From 27e37d84e55f47bf28f7072fadf8fa2cbc3d9375 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 2 Aug 2017 13:31:50 -0700 Subject: pid: kill pidhash_size in pidhash_init() After commit 3d375d78593c ("mm: update callers to use HASH_ZERO flag"), drop unused pidhash_size in pidhash_init(). Link: http://lkml.kernel.org/r/1500389267-49222-1-git-send-email-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 731c4e528f4e..c69c30d827e5 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -575,13 +575,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - unsigned int pidhash_size; - pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL | HASH_ZERO, &pidhash_shift, NULL, 0, 4096); - pidhash_size = 1U << pidhash_shift; } void __init pidmap_init(void) -- cgit v1.2.3 From 89affbf5d9ebb15c6460596822e8857ea2f9e735 Mon Sep 17 00:00:00 2001 From: Dima Zavin Date: Wed, 2 Aug 2017 13:32:18 -0700 Subject: cpuset: fix a deadlock due to incomplete patching of cpusets_enabled() In codepaths that use the begin/retry interface for reading mems_allowed_seq with irqs disabled, there exists a race condition that stalls the patch process after only modifying a subset of the static_branch call sites. This problem manifested itself as a deadlock in the slub allocator, inside get_any_partial. The loop reads mems_allowed_seq value (via read_mems_allowed_begin), performs the defrag operation, and then verifies the consistency of mem_allowed via the read_mems_allowed_retry and the cookie returned by xxx_begin. The issue here is that both begin and retry first check if cpusets are enabled via cpusets_enabled() static branch. This branch can be rewritted dynamically (via cpuset_inc) if a new cpuset is created. The x86 jump label code fully synchronizes across all CPUs for every entry it rewrites. If it rewrites only one of the callsites (specifically the one in read_mems_allowed_retry) and then waits for the smp_call_function(do_sync_core) to complete while a CPU is inside the begin/retry section with IRQs off and the mems_allowed value is changed, we can hang. This is because begin() will always return 0 (since it wasn't patched yet) while retry() will test the 0 against the actual value of the seq counter. The fix is to use two different static keys: one for begin (pre_enable_key) and one for retry (enable_key). In cpuset_inc(), we first bump the pre_enable key to ensure that cpuset_mems_allowed_begin() always return a valid seqcount if are enabling cpusets. Similarly, when disabling cpusets via cpuset_dec(), we first ensure that callers of cpuset_mems_allowed_retry() will start ignoring the seqcount value before we let cpuset_mems_allowed_begin() return 0. The relevant stack traces of the two stuck threads: CPU: 1 PID: 1415 Comm: mkdir Tainted: G L 4.9.36-00104-g540c51286237 #4 Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017 task: ffff8817f9c28000 task.stack: ffffc9000ffa4000 RIP: smp_call_function_many+0x1f9/0x260 Call Trace: smp_call_function+0x3b/0x70 on_each_cpu+0x2f/0x90 text_poke_bp+0x87/0xd0 arch_jump_label_transform+0x93/0x100 __jump_label_update+0x77/0x90 jump_label_update+0xaa/0xc0 static_key_slow_inc+0x9e/0xb0 cpuset_css_online+0x70/0x2e0 online_css+0x2c/0xa0 cgroup_apply_control_enable+0x27f/0x3d0 cgroup_mkdir+0x2b7/0x420 kernfs_iop_mkdir+0x5a/0x80 vfs_mkdir+0xf6/0x1a0 SyS_mkdir+0xb7/0xe0 entry_SYSCALL_64_fastpath+0x18/0xad ... CPU: 2 PID: 1 Comm: init Tainted: G L 4.9.36-00104-g540c51286237 #4 Hardware name: Default string Default string/Hardware, BIOS 4.29.1-20170526215256 05/26/2017 task: ffff8818087c0000 task.stack: ffffc90000030000 RIP: int3+0x39/0x70 Call Trace: <#DB> ? ___slab_alloc+0x28b/0x5a0 ? copy_process.part.40+0xf7/0x1de0 __slab_alloc.isra.80+0x54/0x90 copy_process.part.40+0xf7/0x1de0 copy_process.part.40+0xf7/0x1de0 kmem_cache_alloc_node+0x8a/0x280 copy_process.part.40+0xf7/0x1de0 _do_fork+0xe7/0x6c0 _raw_spin_unlock_irq+0x2d/0x60 trace_hardirqs_on_caller+0x136/0x1d0 entry_SYSCALL_64_fastpath+0x5/0xad do_syscall_64+0x27/0x350 SyS_clone+0x19/0x20 do_syscall_64+0x60/0x350 entry_SYSCALL64_slow_path+0x25/0x25 Link: http://lkml.kernel.org/r/20170731040113.14197-1-dmitriyz@waymo.com Fixes: 46e700abc44c ("mm, page_alloc: remove unnecessary taking of a seqlock when cpusets are disabled") Signed-off-by: Dima Zavin Reported-by: Cliff Spradlin Acked-by: Vlastimil Babka Cc: Peter Zijlstra Cc: Christopher Lameter Cc: Li Zefan Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 19 +++++++++++++++++-- kernel/cgroup/cpuset.c | 1 + 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 119a3f9604b0..898cfe2eeb42 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -18,6 +18,19 @@ #ifdef CONFIG_CPUSETS +/* + * Static branch rewrites can happen in an arbitrary order for a given + * key. In code paths where we need to loop with read_mems_allowed_begin() and + * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need + * to ensure that begin() always gets rewritten before retry() in the + * disabled -> enabled transition. If not, then if local irqs are disabled + * around the loop, we can deadlock since retry() would always be + * comparing the latest value of the mems_allowed seqcount against 0 as + * begin() still would see cpusets_enabled() as false. The enabled -> disabled + * transition should happen in reverse order for the same reasons (want to stop + * looking at real value of mems_allowed.sequence in retry() first). + */ +extern struct static_key_false cpusets_pre_enable_key; extern struct static_key_false cpusets_enabled_key; static inline bool cpusets_enabled(void) { @@ -32,12 +45,14 @@ static inline int nr_cpusets(void) static inline void cpuset_inc(void) { + static_branch_inc(&cpusets_pre_enable_key); static_branch_inc(&cpusets_enabled_key); } static inline void cpuset_dec(void) { static_branch_dec(&cpusets_enabled_key); + static_branch_dec(&cpusets_pre_enable_key); } extern int cpuset_init(void); @@ -115,7 +130,7 @@ extern void cpuset_print_current_mems_allowed(void); */ static inline unsigned int read_mems_allowed_begin(void) { - if (!cpusets_enabled()) + if (!static_branch_unlikely(&cpusets_pre_enable_key)) return 0; return read_seqcount_begin(¤t->mems_allowed_seq); @@ -129,7 +144,7 @@ static inline unsigned int read_mems_allowed_begin(void) */ static inline bool read_mems_allowed_retry(unsigned int seq) { - if (!cpusets_enabled()) + if (!static_branch_unlikely(&cpusets_enabled_key)) return false; return read_seqcount_retry(¤t->mems_allowed_seq, seq); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca8376e5008c..8d5151688504 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -63,6 +63,7 @@ #include #include +DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ -- cgit v1.2.3 From fbb77611e95d3d5b2af86a59754a3130877cb667 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Sat, 5 Aug 2017 23:00:50 +0300 Subject: Fix compat_sys_sigpending breakage The latest change of compat_sys_sigpending in commit 8f13621abced ("sigpending(): move compat to native") has broken it in two ways. First, it tries to write 4 bytes more than userspace expects: sizeof(old_sigset_t) == sizeof(long) == 8 instead of sizeof(compat_old_sigset_t) == sizeof(u32) == 4. Second, on big endian architectures these bytes are being written in the wrong order. This bug was found by strace test suite. Reported-by: Anatoly Pugachev Inspired-by: Eugene Syromyatnikov Fixes: 8f13621abced ("sigpending(): move compat to native") Signed-off-by: Dmitry V. Levin Acked-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/signal.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index caed9133ae52..7e33f8c583e6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3303,12 +3303,15 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32) { +#ifdef __BIG_ENDIAN sigset_t set; - int err = do_sigpending(&set, sizeof(old_sigset_t)); - if (err == 0) - if (copy_to_user(set32, &set, sizeof(old_sigset_t))) - err = -EFAULT; + int err = do_sigpending(&set, sizeof(set.sig[0])); + if (!err) + err = put_user(set.sig[0], set32); return err; +#else + return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32)); +#endif } #endif -- cgit v1.2.3 From 48fb6f4db940e92cfb16cd878cddd59ea6120d06 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 9 Aug 2017 08:27:11 +0100 Subject: futex: Remove unnecessary warning from get_futex_key Commit 65d8fc777f6d ("futex: Remove requirement for lock_page() in get_futex_key()") removed an unnecessary lock_page() with the side-effect that page->mapping needed to be treated very carefully. Two defensive warnings were added in case any assumption was missed and the first warning assumed a correct application would not alter a mapping backing a futex key. Since merging, it has not triggered for any unexpected case but Mark Rutland reported the following bug triggering due to the first warning. kernel BUG at kernel/futex.c:679! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 3695 Comm: syz-executor1 Not tainted 4.13.0-rc3-00020-g307fec773ba3 #3 Hardware name: linux,dummy-virt (DT) task: ffff80001e271780 task.stack: ffff000010908000 PC is at get_futex_key+0x6a4/0xcf0 kernel/futex.c:679 LR is at get_futex_key+0x6a4/0xcf0 kernel/futex.c:679 pc : [] lr : [] pstate: 80000145 The fact that it's a bug instead of a warning was due to an unrelated arm64 problem, but the warning itself triggered because the underlying mapping changed. This is an application issue but from a kernel perspective it's a recoverable situation and the warning is unnecessary so this patch removes the warning. The warning may potentially be triggered with the following test program from Mark although it may be necessary to adjust NR_FUTEX_THREADS to be a value smaller than the number of CPUs in the system. #include #include #include #include #include #include #include #include #define NR_FUTEX_THREADS 16 pthread_t threads[NR_FUTEX_THREADS]; void *mem; #define MEM_PROT (PROT_READ | PROT_WRITE) #define MEM_SIZE 65536 static int futex_wrapper(int *uaddr, int op, int val, const struct timespec *timeout, int *uaddr2, int val3) { syscall(SYS_futex, uaddr, op, val, timeout, uaddr2, val3); } void *poll_futex(void *unused) { for (;;) { futex_wrapper(mem, FUTEX_CMP_REQUEUE_PI, 1, NULL, mem + 4, 1); } } int main(int argc, char *argv[]) { int i; mem = mmap(NULL, MEM_SIZE, MEM_PROT, MAP_SHARED | MAP_ANONYMOUS, -1, 0); printf("Mapping @ %p\n", mem); printf("Creating futex threads...\n"); for (i = 0; i < NR_FUTEX_THREADS; i++) pthread_create(&threads[i], NULL, poll_futex, NULL); printf("Flipping mapping...\n"); for (;;) { mmap(mem, MEM_SIZE, MEM_PROT, MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, -1, 0); } return 0; } Reported-and-tested-by: Mark Rutland Signed-off-by: Mel Gorman Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Linus Torvalds --- kernel/futex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 16dbe4c93895..f50b434756c1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -670,13 +670,14 @@ again: * this reference was taken by ihold under the page lock * pinning the inode in place so i_lock was unnecessary. The * only way for this check to fail is if the inode was - * truncated in parallel so warn for now if this happens. + * truncated in parallel which is almost certainly an + * application bug. In such a case, just retry. * * We are not calling into get_futex_key_refs() in file-backed * cases, therefore a successful atomic_inc return below will * guarantee that get_futex_key() will still imply smp_mb(); (B). */ - if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) { + if (!atomic_inc_not_zero(&inode->i_count)) { rcu_read_unlock(); put_page(page); -- cgit v1.2.3 From bfe334924ccd9f4a53f30240c03cf2f43f5b2df1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 2 Aug 2017 19:39:30 +0200 Subject: perf/x86: Fix RDPMC vs. mm_struct tracking Vince reported the following rdpmc() testcase failure: > Failing test case: > > fd=perf_event_open(); > addr=mmap(fd); > exec() // without closing or unmapping the event > fd=perf_event_open(); > addr=mmap(fd); > rdpmc() // GPFs due to rdpmc being disabled The problem is of course that exec() plays tricks with what is current->mm, only destroying the old mappings after having installed the new mm. Fix this confusion by passing along vma->vm_mm instead of relying on current->mm. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: stable@vger.kernel.org Fixes: 1e0fb9ec679c ("perf: Add pmu callbacks to track event mapping and unmapping") Link: http://lkml.kernel.org/r/20170802173930.cstykcqefmqt7jau@hirez.programming.kicks-ass.net [ Minor cleanups. ] Signed-off-by: Ingo Molnar --- arch/x86/events/core.c | 16 +++++++--------- include/linux/perf_event.h | 4 ++-- kernel/events/core.c | 6 +++--- 3 files changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 8e3db8f642a7..af12e294caed 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2114,7 +2114,7 @@ static void refresh_pce(void *ignored) load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm)); } -static void x86_pmu_event_mapped(struct perf_event *event) +static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; @@ -2129,22 +2129,20 @@ static void x86_pmu_event_mapped(struct perf_event *event) * For now, this can't happen because all callers hold mmap_sem * for write. If this changes, we'll need a different solution. */ - lockdep_assert_held_exclusive(¤t->mm->mmap_sem); + lockdep_assert_held_exclusive(&mm->mmap_sem); - if (atomic_inc_return(¤t->mm->context.perf_rdpmc_allowed) == 1) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); + if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) + on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); } -static void x86_pmu_event_unmapped(struct perf_event *event) +static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) { - if (!current->mm) - return; if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) return; - if (atomic_dec_and_test(¤t->mm->context.perf_rdpmc_allowed)) - on_each_cpu_mask(mm_cpumask(current->mm), refresh_pce, NULL, 1); + if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) + on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a3b873fc59e4..b14095bcf4bb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -310,8 +310,8 @@ struct pmu { * Notification that the event was mapped or unmapped. Called * in the context of the mapping task. */ - void (*event_mapped) (struct perf_event *event); /*optional*/ - void (*event_unmapped) (struct perf_event *event); /*optional*/ + void (*event_mapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ + void (*event_unmapped) (struct perf_event *event, struct mm_struct *mm); /* optional */ /* * Flags for ->add()/->del()/ ->start()/->stop(). There are diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ffba16d..a654b8a3586f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5090,7 +5090,7 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->rb->aux_mmap_count); if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); } static void perf_pmu_output_stop(struct perf_event *event); @@ -5113,7 +5113,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) unsigned long size = perf_data_size(rb); if (event->pmu->event_unmapped) - event->pmu->event_unmapped(event); + event->pmu->event_unmapped(event, vma->vm_mm); /* * rb->aux_mmap_count will always drop before rb->mmap_count and @@ -5411,7 +5411,7 @@ aux_unlock: vma->vm_ops = &perf_mmap_vmops; if (event->pmu->event_mapped) - event->pmu->event_mapped(event); + event->pmu->event_mapped(event, vma->vm_mm); return ret; } -- cgit v1.2.3 From 9b231d9f47c6114d317ce28cff92a74ad80547f5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Aug 2017 15:42:09 +0200 Subject: perf/core: Fix time on IOC_ENABLE Vince reported that when we do IOC_ENABLE/IOC_DISABLE while the task is SIGSTOP'ed state the timestamps go wobbly. It turns out we indeed fail to correctly account time while in 'OFF' state and doing IOC_ENABLE without getting scheduled in exposes the problem. Further thinking about this problem, it occurred to me that we can suffer a similar fate when we migrate an uncore event between CPUs. The perf_event_install() on the 'new' CPU will do add_event_to_ctx() which will reset all the time stamp, resulting in a subsequent update_event_times() to overwrite the total_time_* fields with smaller values. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/events/core.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a654b8a3586f..ee20d4c546b5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2217,6 +2217,33 @@ static int group_can_go_on(struct perf_event *event, return can_add_hw; } +/* + * Complement to update_event_times(). This computes the tstamp_* values to + * continue 'enabled' state from @now, and effectively discards the time + * between the prior tstamp_stopped and now (as we were in the OFF state, or + * just switched (context) time base). + * + * This further assumes '@event->state == INACTIVE' (we just came from OFF) and + * cannot have been scheduled in yet. And going into INACTIVE state means + * '@event->tstamp_stopped = @now'. + * + * Thus given the rules of update_event_times(): + * + * total_time_enabled = tstamp_stopped - tstamp_enabled + * total_time_running = tstamp_stopped - tstamp_running + * + * We can insert 'tstamp_stopped == now' and reverse them to compute new + * tstamp_* values. + */ +static void __perf_event_enable_time(struct perf_event *event, u64 now) +{ + WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE); + + event->tstamp_stopped = now; + event->tstamp_enabled = now - event->total_time_enabled; + event->tstamp_running = now - event->total_time_running; +} + static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { @@ -2224,9 +2251,12 @@ static void add_event_to_ctx(struct perf_event *event, list_add_event(event, ctx); perf_group_attach(event); - event->tstamp_enabled = tstamp; - event->tstamp_running = tstamp; - event->tstamp_stopped = tstamp; + /* + * We can be called with event->state == STATE_OFF when we create with + * .disabled = 1. In that case the IOC_ENABLE will call this function. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) + __perf_event_enable_time(event, tstamp); } static void ctx_sched_out(struct perf_event_context *ctx, @@ -2471,10 +2501,11 @@ static void __perf_event_mark_enabled(struct perf_event *event) u64 tstamp = perf_event_time(event); event->state = PERF_EVENT_STATE_INACTIVE; - event->tstamp_enabled = tstamp - event->total_time_enabled; + __perf_event_enable_time(event, tstamp); list_for_each_entry(sub, &event->sibling_list, group_entry) { + /* XXX should not be > INACTIVE if event isn't */ if (sub->state >= PERF_EVENT_STATE_INACTIVE) - sub->tstamp_enabled = tstamp - sub->total_time_enabled; + __perf_event_enable_time(sub, tstamp); } } -- cgit v1.2.3 From d507e2ebd2c7be9138e5cf5c0cb1931c90c42ab1 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 10 Aug 2017 15:23:31 -0700 Subject: mm: fix global NR_SLAB_.*CLAIMABLE counter reads As Tetsuo points out: "Commit 385386cff4c6 ("mm: vmstat: move slab statistics from zone to node counters") broke "Slab:" field of /proc/meminfo . It shows nearly 0kB" In addition to /proc/meminfo, this problem also affects the slab counters OOM/allocation failure info dumps, can cause early -ENOMEM from overcommit protection, and miscalculate image size requirements during suspend-to-disk. This is because the patch in question switched the slab counters from the zone level to the node level, but forgot to update the global accessor functions to read the aggregate node data instead of the aggregate zone data. Use global_node_page_state() to access the global slab counters. Fixes: 385386cff4c6 ("mm: vmstat: move slab statistics from zone to node counters") Link: http://lkml.kernel.org/r/20170801134256.5400-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Tetsuo Handa Acked-by: Michal Hocko Cc: Josef Bacik Cc: Vladimir Davydov Cc: Stefan Agner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 8 ++++---- kernel/power/snapshot.c | 2 +- mm/page_alloc.c | 9 +++++---- mm/util.c | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8a428498d6b2..509a61668d90 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -106,13 +106,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_FILE_MAPPED)); show_val_kb(m, "Shmem: ", i.sharedram); show_val_kb(m, "Slab: ", - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)); + global_node_page_state(NR_SLAB_RECLAIMABLE) + + global_node_page_state(NR_SLAB_UNRECLAIMABLE)); show_val_kb(m, "SReclaimable: ", - global_page_state(NR_SLAB_RECLAIMABLE)); + global_node_page_state(NR_SLAB_RECLAIMABLE)); show_val_kb(m, "SUnreclaim: ", - global_page_state(NR_SLAB_UNRECLAIMABLE)); + global_node_page_state(NR_SLAB_UNRECLAIMABLE)); seq_printf(m, "KernelStack: %8lu kB\n", global_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 222317721c5a..0972a8e09d08 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1650,7 +1650,7 @@ static unsigned long minimum_image_size(unsigned long saveable) { unsigned long size; - size = global_page_state(NR_SLAB_RECLAIMABLE) + size = global_node_page_state(NR_SLAB_RECLAIMABLE) + global_node_page_state(NR_ACTIVE_ANON) + global_node_page_state(NR_INACTIVE_ANON) + global_node_page_state(NR_ACTIVE_FILE) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc32aa81f359..626a430e32d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4458,8 +4458,9 @@ long si_mem_available(void) * Part of the reclaimable slab consists of items that are in use, * and cannot be freed. Cap this estimate at the low watermark. */ - available += global_page_state(NR_SLAB_RECLAIMABLE) - - min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + available += global_node_page_state(NR_SLAB_RECLAIMABLE) - + min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, + wmark_low); if (available < 0) available = 0; @@ -4602,8 +4603,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), global_node_page_state(NR_UNSTABLE_NFS), - global_page_state(NR_SLAB_RECLAIMABLE), - global_page_state(NR_SLAB_UNRECLAIMABLE), + global_node_page_state(NR_SLAB_RECLAIMABLE), + global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), diff --git a/mm/util.c b/mm/util.c index 7b07ec852e01..9ecddf568fe3 100644 --- a/mm/util.c +++ b/mm/util.c @@ -633,7 +633,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * which are reclaimable, under pressure. The dentry * cache and most inode caches should fall into this */ - free += global_page_state(NR_SLAB_RECLAIMABLE); + free += global_node_page_state(NR_SLAB_RECLAIMABLE); /* * Leave reserved pages. The pages are not for anonymous pages. -- cgit v1.2.3 From 16af97dc5a8975371a83d9e30a64038b48f40a2d Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 10 Aug 2017 15:23:56 -0700 Subject: mm: migrate: prevent racy access to tlb_flush_pending Patch series "fixes of TLB batching races", v6. It turns out that Linux TLB batching mechanism suffers from various races. Races that are caused due to batching during reclamation were recently handled by Mel and this patch-set deals with others. The more fundamental issue is that concurrent updates of the page-tables allow for TLB flushes to be batched on one core, while another core changes the page-tables. This other core may assume a PTE change does not require a flush based on the updated PTE value, while it is unaware that TLB flushes are still pending. This behavior affects KSM (which may result in memory corruption) and MADV_FREE and MADV_DONTNEED (which may result in incorrect behavior). A proof-of-concept can easily produce the wrong behavior of MADV_DONTNEED. Memory corruption in KSM is harder to produce in practice, but was observed by hacking the kernel and adding a delay before flushing and replacing the KSM page. Finally, there is also one memory barrier missing, which may affect architectures with weak memory model. This patch (of 7): Setting and clearing mm->tlb_flush_pending can be performed by multiple threads, since mmap_sem may only be acquired for read in task_numa_work(). If this happens, tlb_flush_pending might be cleared while one of the threads still changes PTEs and batches TLB flushes. This can lead to the same race between migration and change_protection_range() that led to the introduction of tlb_flush_pending. The result of this race was data corruption, which means that this patch also addresses a theoretically possible data corruption. An actual data corruption was not observed, yet the race was was confirmed by adding assertion to check tlb_flush_pending is not set by two threads, adding artificial latency in change_protection_range() and using sysctl to reduce kernel.numa_balancing_scan_delay_ms. Link: http://lkml.kernel.org/r/20170802000818.4760-2-namit@vmware.com Fixes: 20841405940e ("mm: fix TLB flush race between migration, and change_protection_range") Signed-off-by: Nadav Amit Acked-by: Mel Gorman Acked-by: Rik van Riel Acked-by: Minchan Kim Cc: Andy Lutomirski Cc: Hugh Dickins Cc: "David S. Miller" Cc: Andrea Arcangeli Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jeff Dike Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Russell King Cc: Sergey Senozhatsky Cc: Tony Luck Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 31 ++++++++++++++++++++++--------- kernel/fork.c | 2 +- mm/debug.c | 2 +- mm/mprotect.c | 4 ++-- 4 files changed, 26 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7f384bb62d8e..f58f76ee1dfa 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -493,7 +493,7 @@ struct mm_struct { * can move process memory needs to flush the TLB when moving a * PROT_NONE or PROT_NUMA mapped page. */ - bool tlb_flush_pending; + atomic_t tlb_flush_pending; #endif #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ @@ -532,33 +532,46 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm) static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { barrier(); - return mm->tlb_flush_pending; + return atomic_read(&mm->tlb_flush_pending) > 0; } -static inline void set_tlb_flush_pending(struct mm_struct *mm) + +static inline void init_tlb_flush_pending(struct mm_struct *mm) { - mm->tlb_flush_pending = true; + atomic_set(&mm->tlb_flush_pending, 0); +} + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_inc(&mm->tlb_flush_pending); /* - * Guarantee that the tlb_flush_pending store does not leak into the + * Guarantee that the tlb_flush_pending increase does not leak into the * critical section updating the page tables */ smp_mb__before_spinlock(); } + /* Clearing is done after a TLB flush, which also provides a barrier. */ -static inline void clear_tlb_flush_pending(struct mm_struct *mm) +static inline void dec_tlb_flush_pending(struct mm_struct *mm) { barrier(); - mm->tlb_flush_pending = false; + atomic_dec(&mm->tlb_flush_pending); } #else static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { return false; } -static inline void set_tlb_flush_pending(struct mm_struct *mm) + +static inline void init_tlb_flush_pending(struct mm_struct *mm) { } -static inline void clear_tlb_flush_pending(struct mm_struct *mm) + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ +} + +static inline void dec_tlb_flush_pending(struct mm_struct *mm) { } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 17921b0390b4..e075b7780421 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -807,7 +807,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_aio(mm); mm_init_owner(mm, p); mmu_notifier_mm_init(mm); - clear_tlb_flush_pending(mm); + init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif diff --git a/mm/debug.c b/mm/debug.c index db1cd26d8752..d70103bb4731 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -159,7 +159,7 @@ void dump_mm(const struct mm_struct *mm) mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq, #endif #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) - mm->tlb_flush_pending, + atomic_read(&mm->tlb_flush_pending), #endif mm->def_flags, &mm->def_flags ); diff --git a/mm/mprotect.c b/mm/mprotect.c index 4180ad8cc9c5..bd0f409922cb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -244,7 +244,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, BUG_ON(addr >= end); pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); - set_tlb_flush_pending(mm); + inc_tlb_flush_pending(mm); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) @@ -256,7 +256,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, /* Only flush the TLB if we actually modified any entries: */ if (pages) flush_tlb_range(vma, start, end); - clear_tlb_flush_pending(mm); + dec_tlb_flush_pending(mm); return pages; } -- cgit v1.2.3 From d76036ab47eafa6ce52b69482e91ca3ba337d6d6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Aug 2017 13:00:36 +0200 Subject: audit: Fix use after free in audit_remove_watch_rule() audit_remove_watch_rule() drops watch's reference to parent but then continues to work with it. That is not safe as parent can get freed once we drop our reference. The following is a trivial reproducer: mount -o loop image /mnt touch /mnt/file auditctl -w /mnt/file -p wax umount /mnt auditctl -D Grab our own reference in audit_remove_watch_rule() earlier to make sure mark does not get freed under us. CC: stable@vger.kernel.org Reported-by: Tony Jones Signed-off-by: Jan Kara Tested-by: Tony Jones Signed-off-by: Paul Moore --- kernel/audit_watch.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index e0656bd63036..1c7ded42f82f 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule) list_del(&krule->rlist); if (list_empty(&watch->rules)) { + /* + * audit_remove_watch() drops our reference to 'parent' which + * can get freed. Grab our own reference to be safe. + */ + audit_get_parent(parent); audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - audit_get_parent(parent); + if (list_empty(&parent->watches)) fsnotify_destroy_mark(&parent->mark, audit_watch_group); - audit_put_parent(parent); - } + audit_put_parent(parent); } } -- cgit v1.2.3 From b5fed474b98332559f2590c6bc90388a0899e793 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Aug 2017 13:00:37 +0200 Subject: audit: Receive unmount event Although audit_watch_handle_event() can handle FS_UNMOUNT event, it is not part of AUDIT_FS_WATCH mask and thus such event never gets to audit_watch_handle_event(). Thus fsnotify marks are deleted by fsnotify subsystem on unmount without audit being notified about that which leads to a strange state of existing audit rules with dead fsnotify marks. Add FS_UNMOUNT to the mask of events to be received so that audit can clean up its state accordingly. Signed-off-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_watch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 1c7ded42f82f..d1b5857b7e33 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group; /* fsnotify events we care about. */ #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\ - FS_MOVE_SELF | FS_EVENT_ON_CHILD) + FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT) static void audit_free_parent(struct audit_parent *parent) { -- cgit v1.2.3 From 88a5c690b66110ad255380d8f629c629cf6ca559 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 16 Aug 2017 01:45:33 +0200 Subject: bpf: fix bpf_trace_printk on 32 bit archs James reported that on MIPS32 bpf_trace_printk() is currently broken while MIPS64 works fine: bpf_trace_printk() uses conditional operators to attempt to pass different types to __trace_printk() depending on the format operators. This doesn't work as intended on 32-bit architectures where u32 and long are passed differently to u64, since the result of C conditional operators follows the "usual arithmetic conversions" rules, such that the values passed to __trace_printk() will always be u64 [causing issues later in the va_list handling for vscnprintf()]. For example the samples/bpf/tracex5 test printed lines like below on MIPS32, where the fd and buf have come from the u64 fd argument, and the size from the buf argument: [...] 1180.941542: 0x00000001: write(fd=1, buf= (null), size=6258688) Instead of this: [...] 1625.616026: 0x00000001: write(fd=1, buf=009e4000, size=512) One way to get it working is to expand various combinations of argument types into 8 different combinations for 32 bit and 64 bit kernels. Fix tested by James on MIPS32 and MIPS64 as well that it resolves the issue. Fixes: 9c959c863f82 ("tracing: Allow BPF programs to call bpf_trace_printk()") Reported-by: James Hogan Tested-by: James Hogan Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/trace/bpf_trace.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 37385193a608..dc498b605d5d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -204,10 +204,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, fmt_cnt++; } - return __trace_printk(1/* fake ip will not be printed */, fmt, - mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1, - mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2, - mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3); +/* Horrid workaround for getting va_list handling working with different + * argument type combinations generically for 32 and 64 bit archs. + */ +#define __BPF_TP_EMIT() __BPF_ARG3_TP() +#define __BPF_TP(...) \ + __trace_printk(1 /* Fake ip will not be printed. */, \ + fmt, ##__VA_ARGS__) + +#define __BPF_ARG1_TP(...) \ + ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_TP(arg1, ##__VA_ARGS__) \ + : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_TP((long)arg1, ##__VA_ARGS__) \ + : __BPF_TP((u32)arg1, ##__VA_ARGS__))) + +#define __BPF_ARG2_TP(...) \ + ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \ + : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \ + : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__))) + +#define __BPF_ARG3_TP(...) \ + ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \ + ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \ + : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \ + ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \ + : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__))) + + return __BPF_TP_EMIT(); } static const struct bpf_func_proto bpf_trace_printk_proto = { -- cgit v1.2.3 From e8f241893dfbbebe2813c01eac54f263e6a5e59c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Aug 2017 10:53:45 +0100 Subject: genirq: Restore trigger settings in irq_modify_status() irq_modify_status starts by clearing the trigger settings from irq_data before applying the new settings, but doesn't restore them, leaving them to IRQ_TYPE_NONE. That's pretty confusing to the potential request_irq() that could follow. Instead, snapshot the settings before clearing them, and restore them if the irq_modify_status() invocation was not changing the trigger. Fixes: 1e2a7d78499e ("irqdomain: Don't set type when mapping an IRQ") Reported-and-tested-by: jeffy Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Cc: Jon Hunter Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170818095345.12378-1-marc.zyngier@arm.com --- kernel/irq/chip.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index a3cc37c0c85e..3675c6004f2a 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -1000,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { - unsigned long flags; + unsigned long flags, trigger, tmp; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) @@ -1014,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) irq_settings_clr_and_set(desc, clr, set); + trigger = irqd_get_trigger_type(&desc->irq_data); + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); if (irq_settings_has_no_balance_set(desc)) @@ -1025,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) if (irq_settings_is_level(desc)) irqd_set(&desc->irq_data, IRQD_LEVEL); - irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); + tmp = irq_settings_get_trigger_mask(desc); + if (tmp != IRQ_TYPE_NONE) + trigger = tmp; + + irqd_set(&desc->irq_data, trigger); irq_put_desc_unlock(desc, flags); } -- cgit v1.2.3 From 7edaeb6841dfb27e362288ab8466ebdc4972e867 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Aug 2017 09:50:13 +0200 Subject: kernel/watchdog: Prevent false positives with turbo modes The hardlockup detector on x86 uses a performance counter based on unhalted CPU cycles and a periodic hrtimer. The hrtimer period is about 2/5 of the performance counter period, so the hrtimer should fire 2-3 times before the performance counter NMI fires. The NMI code checks whether the hrtimer fired since the last invocation. If not, it assumess a hard lockup. The calculation of those periods is based on the nominal CPU frequency. Turbo modes increase the CPU clock frequency and therefore shorten the period of the perf/NMI watchdog. With extreme Turbo-modes (3x nominal frequency) the perf/NMI period is shorter than the hrtimer period which leads to false positives. A simple fix would be to shorten the hrtimer period, but that comes with the side effect of more frequent hrtimer and softlockup thread wakeups, which is not desired. Implement a low pass filter, which checks the perf/NMI period against kernel time. If the perf/NMI fires before 4/5 of the watchdog period has elapsed then the event is ignored and postponed to the next perf/NMI. That solves the problem and avoids the overhead of shorter hrtimer periods and more frequent softlockup thread wakeups. Fixes: 58687acba592 ("lockup_detector: Combine nmi_watchdog and softlockup detector") Reported-and-tested-by: Kan Liang Signed-off-by: Thomas Gleixner Cc: dzickus@redhat.com Cc: prarit@redhat.com Cc: ak@linux.intel.com Cc: babu.moger@oracle.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: acme@redhat.com Cc: stable@vger.kernel.org Cc: atomlin@redhat.com Cc: akpm@linux-foundation.org Cc: torvalds@linux-foundation.org Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1708150931310.1886@nanos --- arch/x86/Kconfig | 1 + include/linux/nmi.h | 8 +++++++ kernel/watchdog.c | 1 + kernel/watchdog_hld.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 7 ++++++ 5 files changed, 76 insertions(+) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 781521b7cf9e..9101bfc85539 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -100,6 +100,7 @@ config X86 select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL + select HARDLOCKUP_CHECK_TIMESTAMP if X86_64 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 8aa01fd859fb..a36abe2da13e 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -168,6 +168,14 @@ extern int sysctl_hardlockup_all_cpu_backtrace; #define sysctl_softlockup_all_cpu_backtrace 0 #define sysctl_hardlockup_all_cpu_backtrace 0 #endif + +#if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \ + defined(CONFIG_HARDLOCKUP_DETECTOR) +void watchdog_update_hrtimer_threshold(u64 period); +#else +static inline void watchdog_update_hrtimer_threshold(u64 period) { } +#endif + extern bool is_hardlockup(void); struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 06d3389bca0d..f5d52024f6b7 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -240,6 +240,7 @@ static void set_sample_period(void) * hardlockup detector generates a warning */ sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5); + watchdog_update_hrtimer_threshold(sample_period); } /* Commands for resetting the watchdog */ diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 295a0d84934c..3a09ea1b1d3d 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void) } EXPORT_SYMBOL(arch_touch_nmi_watchdog); +#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP +static DEFINE_PER_CPU(ktime_t, last_timestamp); +static DEFINE_PER_CPU(unsigned int, nmi_rearmed); +static ktime_t watchdog_hrtimer_sample_threshold __read_mostly; + +void watchdog_update_hrtimer_threshold(u64 period) +{ + /* + * The hrtimer runs with a period of (watchdog_threshold * 2) / 5 + * + * So it runs effectively with 2.5 times the rate of the NMI + * watchdog. That means the hrtimer should fire 2-3 times before + * the NMI watchdog expires. The NMI watchdog on x86 is based on + * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles + * might run way faster than expected and the NMI fires in a + * smaller period than the one deduced from the nominal CPU + * frequency. Depending on the Turbo-Mode factor this might be fast + * enough to get the NMI period smaller than the hrtimer watchdog + * period and trigger false positives. + * + * The sample threshold is used to check in the NMI handler whether + * the minimum time between two NMI samples has elapsed. That + * prevents false positives. + * + * Set this to 4/5 of the actual watchdog threshold period so the + * hrtimer is guaranteed to fire at least once within the real + * watchdog threshold. + */ + watchdog_hrtimer_sample_threshold = period * 2; +} + +static bool watchdog_check_timestamp(void) +{ + ktime_t delta, now = ktime_get_mono_fast_ns(); + + delta = now - __this_cpu_read(last_timestamp); + if (delta < watchdog_hrtimer_sample_threshold) { + /* + * If ktime is jiffies based, a stalled timer would prevent + * jiffies from being incremented and the filter would look + * at a stale timestamp and never trigger. + */ + if (__this_cpu_inc_return(nmi_rearmed) < 10) + return false; + } + __this_cpu_write(nmi_rearmed, 0); + __this_cpu_write(last_timestamp, now); + return true; +} +#else +static inline bool watchdog_check_timestamp(void) +{ + return true; +} +#endif + static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, @@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } + if (!watchdog_check_timestamp()) + return; + /* check for a hardlockup * This is done by making sure our timer interrupt * is incrementing. The timer interrupt should have diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 98fe715522e8..c617b9d1d6cb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -797,6 +797,13 @@ config HARDLOCKUP_DETECTOR_PERF bool select SOFTLOCKUP_DETECTOR +# +# Enables a timestamp based low pass filter to compensate for perf based +# hard lockup detection which runs too fast due to turbo modes. +# +config HARDLOCKUP_CHECK_TIMESTAMP + bool + # # arch/ can define HAVE_HARDLOCKUP_DETECTOR_ARCH to provide their own hard # lockup detector rather than the perf based detector. -- cgit v1.2.3 From 2ba293c9e7db150943f06b12d3eb7213e7fae624 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 18 Aug 2017 15:15:58 -0700 Subject: kmod: fix wait on recursive loop Recursive loops with module loading were previously handled in kmod by restricting the number of modprobe calls to 50 and if that limit was breached request_module() would return an error and a user would see the following on their kernel dmesg: request_module: runaway loop modprobe binfmt-464c Starting init:/sbin/init exists but couldn't execute it (error -8) This issue could happen for instance when a 64-bit kernel boots a 32-bit userspace on some architectures and has no 32-bit binary format hanlders. This is visible, for instance, when a CONFIG_MODULES enabled 64-bit MIPS kernel boots a into o32 root filesystem and the binfmt handler for o32 binaries is not built-in. After commit 6d7964a722af ("kmod: throttle kmod thread limit") we now don't have any visible signs of an error and the kernel just waits for the loop to end somehow. Although this *particular* recursive loop could also be addressed by doing a sanity check on search_binary_handler() and disallowing a modular binfmt to be required for modprobe, a generic solution for any recursive kernel kmod issues is still needed. This should catch these loops. We can investigate each loop and address each one separately as they come in, this however puts a stop gap for them as before. Link: http://lkml.kernel.org/r/20170809234635.13443-3-mcgrof@kernel.org Fixes: 6d7964a722af ("kmod: throttle kmod thread limit") Signed-off-by: Luis R. Rodriguez Reported-by: Matt Redfearn Tested-by: Matt Redfearn Cc: "Eric W. Biederman" Cc: Colin Ian King Cc: Dan Carpenter Cc: Daniel Mentz Cc: David Binderman Cc: Dmitry Torokhov Cc: Ingo Molnar Cc: Jessica Yu Cc: Josh Poimboeuf Cc: Kees Cook Cc: Michal Marek Cc: Miroslav Benes Cc: Peter Zijlstra (Intel) Cc: Petr Mladek Cc: Rusty Russell Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 6d016c5d97c8..2f37acde640b 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -70,6 +70,18 @@ static DECLARE_RWSEM(umhelper_sem); static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT); static DECLARE_WAIT_QUEUE_HEAD(kmod_wq); +/* + * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads + * running at the same time without returning. When this happens we + * believe you've somehow ended up with a recursive module dependency + * creating a loop. + * + * We have no option but to fail. + * + * Userspace should proactively try to detect and prevent these. + */ +#define MAX_KMOD_ALL_BUSY_TIMEOUT 5 + /* modprobe_path is set via /proc/sys. */ @@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...) pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...", atomic_read(&kmod_concurrent_max), MAX_KMOD_CONCURRENT, module_name); - wait_event_interruptible(kmod_wq, - atomic_dec_if_positive(&kmod_concurrent_max) >= 0); + ret = wait_event_killable_timeout(kmod_wq, + atomic_dec_if_positive(&kmod_concurrent_max) >= 0, + MAX_KMOD_ALL_BUSY_TIMEOUT * HZ); + if (!ret) { + pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now", + module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT); + return -ETIME; + } else if (ret == -ERESTARTSYS) { + pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name); + return ret; + } } trace_module_request(module_name, wait, _RET_IP_); -- cgit v1.2.3 From eb61b5911bdc923875cde99eb25203a0e2b06d43 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Fri, 18 Aug 2017 15:16:18 -0700 Subject: signal: don't remove SIGNAL_UNKILLABLE for traced tasks. When forcing a signal, SIGNAL_UNKILLABLE is removed to prevent recursive faults, but this is undesirable when tracing. For example, debugging an init process (whether global or namespace), hitting a breakpoint and SIGTRAP will force SIGTRAP and then remove SIGNAL_UNKILLABLE. Everything continues fine, but then once debugging has finished, the init process is left killable which is unlikely what the user expects, resulting in either an accidentally killed init or an init that stops reaping zombies. Link: http://lkml.kernel.org/r/20170815112806.10728-1-jamie.iles@oracle.com Signed-off-by: Jamie Iles Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7e33f8c583e6..ed804a470dcd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) recalc_sigpending_and_wake(t); } } - if (action->sa.sa_handler == SIG_DFL) + /* + * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect + * debugging to leave init killable. + */ + if (action->sa.sa_handler == SIG_DFL && !t->ptrace) t->signal->flags &= ~SIGNAL_UNKILLABLE; ret = specific_send_sig_info(sig, info, t); spin_unlock_irqrestore(&t->sighand->siglock, flags); -- cgit v1.2.3 From 8fbbe2d7cc478d1544f41f2271787c993c23a4f6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 19 Aug 2017 12:57:51 +0300 Subject: genirq/ipi: Fixup checks against nr_cpu_ids Valid CPU ids are [0, nr_cpu_ids-1] inclusive. Fixes: 3b8e29a82dd1 ("genirq: Implement ipi_send_mask/single()") Fixes: f9bce791ae2a ("genirq: Add a new function to get IPI reverse mapping") Signed-off-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170819095751.GB27864@avx2 --- kernel/irq/ipi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c index 1a9abc1c8ea0..259a22aa9934 100644 --- a/kernel/irq/ipi.c +++ b/kernel/irq/ipi.c @@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) struct irq_data *data = irq_get_irq_data(irq); struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; - if (!data || !ipimask || cpu > nr_cpu_ids) + if (!data || !ipimask || cpu >= nr_cpu_ids) return INVALID_HWIRQ; if (!cpumask_test_cpu(cpu, ipimask)) @@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, if (!chip->ipi_send_single && !chip->ipi_send_mask) return -EINVAL; - if (cpu > nr_cpu_ids) + if (cpu >= nr_cpu_ids) return -EINVAL; if (dest) { -- cgit v1.2.3 From dd1c1f2f2028a7b851f701fc6a8ebe39dcb95e7c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2017 17:35:02 +0200 Subject: pids: make task_tgid_nr_ns() safe This was reported many times, and this was even mentioned in commit 52ee2dfdd4f5 ("pids: refactor vnr/nr_ns helpers to make them safe") but somehow nobody bothered to fix the obvious problem: task_tgid_nr_ns() is not safe because task->group_leader points to nowhere after the exiting task passes exit_notify(), rcu_read_lock() can not help. We really need to change __unhash_process() to nullify group_leader, parent, and real_parent, but this needs some cleanups. Until then we can turn task_tgid_nr_ns() into another user of __task_pid_nr_ns() and fix the problem. Reported-by: Troy Kensinger Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- include/linux/pid.h | 4 +++- include/linux/sched.h | 51 +++++++++++++++++++++++++++------------------------ kernel/pid.c | 11 ++++------- 3 files changed, 34 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/pid.h b/include/linux/pid.h index 4d179316e431..719582744a2e 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -8,7 +8,9 @@ enum pid_type PIDTYPE_PID, PIDTYPE_PGID, PIDTYPE_SID, - PIDTYPE_MAX + PIDTYPE_MAX, + /* only valid to __task_pid_nr_ns() */ + __PIDTYPE_TGID }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2db0bb2..c05ac5f5aa03 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1163,13 +1163,6 @@ static inline pid_t task_tgid_nr(struct task_struct *tsk) return tsk->tgid; } -extern pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); - -static inline pid_t task_tgid_vnr(struct task_struct *tsk) -{ - return pid_vnr(task_tgid(tsk)); -} - /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. @@ -1185,23 +1178,6 @@ static inline int pid_alive(const struct task_struct *p) return p->pids[PIDTYPE_PID].pid != NULL; } -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) -{ - pid_t pid = 0; - - rcu_read_lock(); - if (pid_alive(tsk)) - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); - rcu_read_unlock(); - - return pid; -} - -static inline pid_t task_ppid_nr(const struct task_struct *tsk) -{ - return task_ppid_nr_ns(tsk, &init_pid_ns); -} - static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns); @@ -1223,6 +1199,33 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns); +} + +static inline pid_t task_tgid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, NULL); +} + +static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) +{ + pid_t pid = 0; + + rcu_read_lock(); + if (pid_alive(tsk)) + pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); + rcu_read_unlock(); + + return pid; +} + +static inline pid_t task_ppid_nr(const struct task_struct *tsk) +{ + return task_ppid_nr_ns(tsk, &init_pid_ns); +} + /* Obsolete, do not use: */ static inline pid_t task_pgrp_nr(struct task_struct *tsk) { diff --git a/kernel/pid.c b/kernel/pid.c index c69c30d827e5..020dedbdf066 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) + if (type != PIDTYPE_PID) { + if (type == __PIDTYPE_TGID) + type = PIDTYPE_PID; task = task->group_leader; + } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); @@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, } EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); -- cgit v1.2.3 From 2fe59f507a65dbd734b990a11ebc7488f6f87a24 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 22 Aug 2017 18:43:48 +1000 Subject: timers: Fix excessive granularity of new timers after a nohz idle When a timer base is idle, it is forwarded when a new timer is added to ensure that granularity does not become excessive. When not idle, the timer tick is expected to increment the base. However there are several problems: - If an existing timer is modified, the base is forwarded only after the index is calculated. - The base is not forwarded by add_timer_on. - There is a window after a timer is restarted from a nohz idle, after it is marked not-idle and before the timer tick on this CPU, where a timer may be added but the ancient base does not get forwarded. These result in excessive granularity (a 1 jiffy timeout can blow out to 100s of jiffies), which cause the rcu lockup detector to trigger, among other things. Fix this by keeping track of whether the timer base has been idle since it was last run or forwarded, and if so then forward it before adding a new timer. There is still a case where mod_timer optimises the case of a pending timer mod with the same expiry time, where the timer can see excessive granularity relative to the new, shorter interval. A comment is added, but it's not changed because it is an important fastpath for networking. This has been tested and found to fix the RCU softlockup messages. Testing was also done with tracing to measure requested versus achieved wakeup latencies for all non-deferrable timers in an idle system (with no lockup watchdogs running). Wakeup latency relative to absolute latency is calculated (note this suffers from round-up skew at low absolute times) and analysed: max avg std upstream 506.0 1.20 4.68 patched 2.0 1.08 0.15 The bug was noticed due to the lockup detector Kconfig changes dropping it out of people's .configs and resulting in larger base clk skew When the lockup detectors are enabled, no CPU can go idle for longer than 4 seconds, which limits the granularity errors. Sub-optimal timer behaviour is observable on a smaller scale in that case: max avg std upstream 9.0 1.05 0.19 patched 2.0 1.04 0.11 Fixes: Fixes: a683f390b93f ("timers: Forward the wheel clock whenever possible") Signed-off-by: Nicholas Piggin Signed-off-by: Thomas Gleixner Tested-by: Jonathan Cameron Tested-by: David Miller Cc: dzickus@redhat.com Cc: sfr@canb.auug.org.au Cc: mpe@ellerman.id.au Cc: Stephen Boyd Cc: linuxarm@huawei.com Cc: abdhalee@linux.vnet.ibm.com Cc: John Stultz Cc: akpm@linux-foundation.org Cc: paulmck@linux.vnet.ibm.com Cc: torvalds@linux-foundation.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170822084348.21436-1-npiggin@gmail.com --- kernel/time/timer.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8f5d1bf18854..f2674a056c26 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -203,6 +203,7 @@ struct timer_base { bool migration_enabled; bool nohz_active; bool is_idle; + bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { - unsigned long jnow = READ_ONCE(jiffies); + unsigned long jnow; /* - * We only forward the base when it's idle and we have a delta between - * base clock and jiffies. + * We only forward the base when we are idle or have just come out of + * idle (must_forward_clk logic), and have a delta between base clock + * and jiffies. In the common case, run_timers will take care of it. */ - if (!base->is_idle || (long) (jnow - base->clk) < 2) + if (likely(!base->must_forward_clk)) + return; + + jnow = READ_ONCE(jiffies); + base->must_forward_clk = base->is_idle; + if ((long)(jnow - base->clk) < 2) return; /* @@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * same array bucket then just return: */ if (timer_pending(timer)) { + /* + * The downside of this optimization is that it can result in + * larger granularity than you would get from adding a new + * timer with this expiry. + */ if (timer->expires == expires) return 1; @@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * dequeue/enqueue dance. */ base = lock_timer_base(timer, &flags); + forward_timer_base(base); clk = base->clk; idx = calc_wheel_index(expires, clk); @@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } else { base = lock_timer_base(timer, &flags); + forward_timer_base(base); } ret = detach_if_pending(timer, base, false); @@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); + forward_timer_base(base); } } - /* Try to forward a stale timer base clock */ - forward_timer_base(base); - timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance @@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu) WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } + forward_timer_base(base); debug_activate(timer, timer->expires); internal_add_timer(base, timer); @@ -1497,10 +1510,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (!is_max_delta) expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* - * If we expect to sleep more than a tick, mark the base idle: + * If we expect to sleep more than a tick, mark the base idle. + * Also the tick is stopped so any added timer must forward + * the base clk itself to keep granularity small. This idle + * logic is only maintained for the BASE_STD base, deferrable + * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) + if ((expires - basem) > TICK_NSEC) { + base->must_forward_clk = true; base->is_idle = true; + } } raw_spin_unlock(&base->lock); @@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + /* + * must_forward_clk must be cleared before running timers so that any + * timer functions that call mod_timer will not try to forward the + * base. idle trcking / clock forwarding logic is only used with + * BASE_STD timers. + * + * The deferrable base does not do idle tracking at all, so we do + * not forward it. This can result in very large variations in + * granularity for deferrable timers, but they can be deferred for + * long periods due to idle. + */ + base->must_forward_clk = false; + __run_timers(base); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); -- cgit v1.2.3 From a8f0f9e49956a74718874b800251455680085600 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 17 Aug 2017 16:37:25 -0400 Subject: ftrace: Check for null ret_stack on profile function graph entry function There's a small race when function graph shutsdown and the calling of the registered function graph entry callback. The callback must not reference the task's ret_stack without first checking that it is not NULL. Note, when a ret_stack is allocated for a task, it stays allocated until the task exits. The problem here, is that function_graph is shutdown, and a new task was created, which doesn't have its ret_stack allocated. But since some of the functions are still being traced, the callbacks can still be called. The normal function_graph code handles this, but starting with commit 8861dd303c ("ftrace: Access ret_stack->subtime only in the function profiler") the profiler code references the ret_stack on function entry, but doesn't check if it is NULL first. Link: https://bugzilla.kernel.org/show_bug.cgi?id=196611 Cc: stable@vger.kernel.org Fixes: 8861dd303c ("ftrace: Access ret_stack->subtime only in the function profiler") Reported-by: lilydjwg@gmail.com Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02004ae91860..96cea88fa00f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -889,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) function_profile_call(trace->func, 0, NULL, NULL); + /* If function graph is shutting down, ret_stack can be NULL */ + if (!current->ret_stack) + return 0; + if (index >= 0 && index < FTRACE_RETFUNC_DEPTH) current->ret_stack[index].subtime = 0; -- cgit v1.2.3 From 475bb3c69ab05df2a6ecef6acc2393703d134180 Mon Sep 17 00:00:00 2001 From: Chunyu Hu Date: Mon, 14 Aug 2017 18:18:17 +0800 Subject: tracing: Fix kmemleak in tracing_map_array_free() kmemleak reported the below leak when I was doing clear of the hist trigger. With this patch, the kmeamleak is gone. unreferenced object 0xffff94322b63d760 (size 32): comm "bash", pid 1522, jiffies 4403687962 (age 2442.311s) hex dump (first 32 bytes): 00 01 00 00 04 00 00 00 08 00 00 00 ff 00 00 00 ................ 10 00 00 00 00 00 00 00 80 a8 7a f2 31 94 ff ff ..........z.1... backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_trace+0xca/0x1d0 [] tracing_map_array_alloc+0x26/0x140 [] kretprobe_trampoline+0x0/0x50 [] create_hist_data+0x535/0x750 [] event_hist_trigger_func+0x1f7/0x420 [] event_trigger_write+0xfd/0x1a0 [] __vfs_write+0x37/0x170 [] vfs_write+0xb2/0x1b0 [] SyS_write+0x55/0xc0 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff unreferenced object 0xffff9431f27aa880 (size 128): comm "bash", pid 1522, jiffies 4403687962 (age 2442.311s) hex dump (first 32 bytes): 00 00 8c 2a 32 94 ff ff 00 f0 8b 2a 32 94 ff ff ...*2......*2... 00 e0 8b 2a 32 94 ff ff 00 d0 8b 2a 32 94 ff ff ...*2......*2... backtrace: [] kmemleak_alloc+0x4a/0xa0 [] __kmalloc+0xe8/0x220 [] tracing_map_array_alloc+0xb1/0x140 [] kretprobe_trampoline+0x0/0x50 [] create_hist_data+0x535/0x750 [] event_hist_trigger_func+0x1f7/0x420 [] event_trigger_write+0xfd/0x1a0 [] __vfs_write+0x37/0x170 [] vfs_write+0xb2/0x1b0 [] SyS_write+0x55/0xc0 [] do_syscall_64+0x67/0x150 [] return_from_SYSCALL_64+0x0/0x6a [] 0xffffffffffffffff Link: http://lkml.kernel.org/r/1502705898-27571-1-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 08d43a5fa063 ("tracing: Add lock-free tracing_map") Signed-off-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 0a689bbb78ef..305039b122fa 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a) if (!a) return; - if (!a->pages) { - kfree(a); - return; - } + if (!a->pages) + goto free; for (i = 0; i < a->n_pages; i++) { if (!a->pages[i]) break; free_page((unsigned long)a->pages[i]); } + + kfree(a->pages); + + free: + kfree(a); } struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts, -- cgit v1.2.3 From 8b0db1a5bdfcee0dbfa89607672598ae203c9045 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 23 Aug 2017 12:46:27 -0400 Subject: tracing: Fix freeing of filter in create_filter() when set_str is false Performing the following task with kmemleak enabled: # cd /sys/kernel/tracing/events/irq/irq_handler_entry/ # echo 'enable_event:kmem:kmalloc:3 if irq >' > trigger # echo 'enable_event:kmem:kmalloc:3 if irq > 31' > trigger # echo scan > /sys/kernel/debug/kmemleak # cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800b9290308 (size 32): comm "bash", pid 1114, jiffies 4294848451 (age 141.139s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_trace+0x158/0x290 [] create_filter_start.constprop.28+0x99/0x940 [] create_filter+0xa9/0x160 [] create_event_filter+0xc/0x10 [] set_trigger_filter+0xe5/0x210 [] event_enable_trigger_func+0x324/0x490 [] event_trigger_write+0x1a2/0x260 [] __vfs_write+0xd7/0x380 [] vfs_write+0x101/0x260 [] SyS_write+0xab/0x130 [] entry_SYSCALL_64_fastpath+0x1f/0xbe [] 0xffffffffffffffff The function create_filter() is passed a 'filterp' pointer that gets allocated, and if "set_str" is true, it is up to the caller to free it, even on error. The problem is that the pointer is not freed by create_filter() when set_str is false. This is a bug, and it is not up to the caller to free the filter on error if it doesn't care about the string. Link: http://lkml.kernel.org/r/1502705898-27571-2-git-send-email-chuhu@redhat.com Cc: stable@vger.kernel.org Fixes: 38b78eb85 ("tracing: Factorize filter creation") Reported-by: Chunyu Hu Tested-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 59a411ff60c7..181e139a8057 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call, if (err && set_str) append_filter_err(ps, filter); } + if (err && !set_str) { + free_event_filter(filter); + filter = NULL; + } create_filter_finish(ps); *filterp = filter; -- cgit v1.2.3 From 64aee2a965cf2954a038b5522f11d2cd2f0f8f3e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 22 Jun 2017 15:41:38 +0100 Subject: perf/core: Fix group {cpu,task} validation Regardless of which events form a group, it does not make sense for the events to target different tasks and/or CPUs, as this leaves the group inconsistent and impossible to schedule. The core perf code assumes that these are consistent across (successfully intialised) groups. Core perf code only verifies this when moving SW events into a HW context. Thus, we can violate this requirement for pure SW groups and pure HW groups, unless the relevant PMU driver happens to perform this verification itself. These mismatched groups subsequently wreak havoc elsewhere. For example, we handle watchpoints as SW events, and reserve watchpoint HW on a per-CPU basis at pmu::event_init() time to ensure that any event that is initialised is guaranteed to have a slot at pmu::add() time. However, the core code only checks the group leader's cpu filter (via event_filter_match()), and can thus install follower events onto CPUs violating thier (mismatched) CPU filters, potentially installing them into a CPU without sufficient reserved slots. This can be triggered with the below test case, resulting in warnings from arch backends. #define _GNU_SOURCE #include #include #include #include #include #include #include static int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); } char watched_char; struct perf_event_attr wp_attr = { .type = PERF_TYPE_BREAKPOINT, .bp_type = HW_BREAKPOINT_RW, .bp_addr = (unsigned long)&watched_char, .bp_len = 1, .size = sizeof(wp_attr), }; int main(int argc, char *argv[]) { int leader, ret; cpu_set_t cpus; /* * Force use of CPU0 to ensure our CPU0-bound events get scheduled. */ CPU_ZERO(&cpus); CPU_SET(0, &cpus); ret = sched_setaffinity(0, sizeof(cpus), &cpus); if (ret) { printf("Unable to set cpu affinity\n"); return 1; } /* open leader event, bound to this task, CPU0 only */ leader = perf_event_open(&wp_attr, 0, 0, -1, 0); if (leader < 0) { printf("Couldn't open leader: %d\n", leader); return 1; } /* * Open a follower event that is bound to the same task, but a * different CPU. This means that the group should never be possible to * schedule. */ ret = perf_event_open(&wp_attr, 0, 1, leader, 0); if (ret < 0) { printf("Couldn't open mismatched follower: %d\n", ret); return 1; } else { printf("Opened leader/follower with mismastched CPUs\n"); } /* * Open as many independent events as we can, all bound to the same * task, CPU0 only. */ do { ret = perf_event_open(&wp_attr, 0, 0, -1, 0); } while (ret >= 0); /* * Force enable/disble all events to trigger the erronoeous * installation of the follower event. */ printf("Opened all events. Toggling..\n"); for (;;) { prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0); prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0); } return 0; } Fix this by validating this requirement regardless of whether we're moving events. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Zhou Chengming Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1498142498-15758-1-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ee20d4c546b5..3504125871d2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10032,28 +10032,27 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; /* - * Do not allow to attach to a group in a different - * task or CPU context: + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. */ - if (move_group) { - /* - * Make sure we're both on the same task, or both - * per-cpu events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + if (group_leader->cpu != event->cpu) + goto err_context; - /* - * Make sure we're both events for the same CPU; - * grouping events for different CPUs is broken; since - * you can never concurrently schedule them anyhow. - */ - if (group_leader->cpu != event->cpu) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } + /* + * Make sure we're both on the same task, or both + * per-CPU events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Do not allow to attach to a group in a different task + * or CPU context. If we're moving SW events, we'll fix + * this up later, so allow that. + */ + if (!move_group && group_leader->ctx != ctx) + goto err_context; /* * Only a group leader can be exclusive or pinned -- cgit v1.2.3 From 2b7e8665b4ff51c034c55df3cff76518d1a9ee3a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 25 Aug 2017 15:55:43 -0700 Subject: fork: fix incorrect fput of ->exe_file causing use-after-free Commit 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") made it possible to kill a forking task while it is waiting to acquire its ->mmap_sem for write, in dup_mmap(). However, it was overlooked that this introduced an new error path before a reference is taken on the mm_struct's ->exe_file. Since the ->exe_file of the new mm_struct was already set to the old ->exe_file by the memcpy() in dup_mm(), it was possible for the mmput() in the error path of dup_mm() to drop a reference to ->exe_file which was never taken. This caused the struct file to later be freed prematurely. Fix it by updating mm_init() to NULL out the ->exe_file, in the same place it clears other things like the list of mmaps. This bug was found by syzkaller. It can be reproduced using the following C program: #define _GNU_SOURCE #include #include #include #include #include #include static void *mmap_thread(void *_arg) { for (;;) { mmap(NULL, 0x1000000, PROT_READ, MAP_POPULATE|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); } } static void *fork_thread(void *_arg) { usleep(rand() % 10000); fork(); } int main(void) { fork(); fork(); fork(); for (;;) { if (fork() == 0) { pthread_t t; pthread_create(&t, NULL, mmap_thread, NULL); pthread_create(&t, NULL, fork_thread, NULL); usleep(rand() % 10000); syscall(__NR_exit_group, 0); } wait(NULL); } } No special kernel config options are needed. It usually causes a NULL pointer dereference in __remove_shared_vm_struct() during exit, or in dup_mmap() (which is usually inlined into copy_process()) during fork. Both are due to a vm_area_struct's ->vm_file being used after it's already been freed. Google Bug Id: 64772007 Link: http://lkml.kernel.org/r/20170823211408.31198-1-ebiggers3@gmail.com Fixes: 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") Signed-off-by: Eric Biggers Tested-by: Mark Rutland Acked-by: Michal Hocko Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Konstantin Khlebnikov Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Vlastimil Babka Cc: [v4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e075b7780421..cbbea277b3fb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -806,6 +806,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS -- cgit v1.2.3 From 3510ca20ece0150af6b10c77a74ff1b5c198e3e2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 27 Aug 2017 13:55:12 -0700 Subject: Minor page waitqueue cleanups Tim Chen and Kan Liang have been battling a customer load that shows extremely long page wakeup lists. The cause seems to be constant NUMA migration of a hot page that is shared across a lot of threads, but the actual root cause for the exact behavior has not been found. Tim has a patch that batches the wait list traversal at wakeup time, so that we at least don't get long uninterruptible cases where we traverse and wake up thousands of processes and get nasty latency spikes. That is likely 4.14 material, but we're still discussing the page waitqueue specific parts of it. In the meantime, I've tried to look at making the page wait queues less expensive, and failing miserably. If you have thousands of threads waiting for the same page, it will be painful. We'll need to try to figure out the NUMA balancing issue some day, in addition to avoiding the excessive spinlock hold times. That said, having tried to rewrite the page wait queues, I can at least fix up some of the braindamage in the current situation. In particular: (a) we don't want to continue walking the page wait list if the bit we're waiting for already got set again (which seems to be one of the patterns of the bad load). That makes no progress and just causes pointless cache pollution chasing the pointers. (b) we don't want to put the non-locking waiters always on the front of the queue, and the locking waiters always on the back. Not only is that unfair, it means that we wake up thousands of reading threads that will just end up being blocked by the writer later anyway. Also add a comment about the layout of 'struct wait_page_key' - there is an external user of it in the cachefiles code that means that it has to match the layout of 'struct wait_bit_key' in the two first members. It so happens to match, because 'struct page *' and 'unsigned long *' end up having the same values simply because the page flags are the first member in struct page. Cc: Tim Chen Cc: Kan Liang Cc: Mel Gorman Cc: Christopher Lameter Cc: Andi Kleen Cc: Davidlohr Bueso Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/sched/wait.c | 7 ++++--- mm/filemap.c | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 17f11c6b0a9f..d6afed6d0752 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, list_for_each_entry_safe(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; - - if (curr->func(curr, mode, wake_flags, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + int ret = curr->func(curr, mode, wake_flags, key); + if (ret < 0) + break; + if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } } diff --git a/mm/filemap.c b/mm/filemap.c index a49702445ce0..baba290c276b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -885,6 +885,7 @@ void __init pagecache_init(void) page_writeback_init(); } +/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ struct wait_page_key { struct page *page; int bit_nr; @@ -909,8 +910,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, if (wait_page->bit_nr != key->bit_nr) return 0; + + /* Stop walking if it's locked */ if (test_bit(key->bit_nr, &key->page->flags)) - return 0; + return -1; return autoremove_wake_function(wait, mode, sync, key); } @@ -964,6 +967,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, int ret = 0; init_wait(wait); + wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; wait_page.page = page; wait_page.bit_nr = bit_nr; @@ -972,10 +976,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, spin_lock_irq(&q->lock); if (likely(list_empty(&wait->entry))) { - if (lock) - __add_wait_queue_entry_tail_exclusive(q, wait); - else - __add_wait_queue(q, wait); + __add_wait_queue_entry_tail(q, wait); SetPageWaiters(page); } -- cgit v1.2.3