summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2021-03-25 15:31:22 -0700
committerDavid S. Miller <davem@davemloft.net>2021-03-25 15:31:22 -0700
commitefd13b71a3fa31413f8d15342e01d44b60b0a432 (patch)
tree2ed1b299e25538c5a60485a1047507b49d3e0ecf /kernel
parent84c7f6c33f42a12eb036ebf0f0e3670799304120 (diff)
parent002322402dafd846c424ffa9240a937f49b48c42 (diff)
downloadlwn-efd13b71a3fa31413f8d15342e01d44b60b0a432.tar.gz
lwn-efd13b71a3fa31413f8d15342e01d44b60b0a432.zip
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/bpf_inode_storage.c2
-rw-r--r--kernel/bpf/bpf_struct_ops.c2
-rw-r--r--kernel/bpf/core.c4
-rw-r--r--kernel/bpf/preload/bpf_preload_kern.c19
-rw-r--r--kernel/bpf/syscall.c5
-rw-r--r--kernel/bpf/trampoline.c218
-rw-r--r--kernel/bpf/verifier.c37
-rw-r--r--kernel/events/core.c42
-rw-r--r--kernel/fork.c8
-rw-r--r--kernel/freezer.c2
-rw-r--r--kernel/futex.c3
-rw-r--r--kernel/gcov/clang.c69
-rw-r--r--kernel/irq/irq_sim.c4
-rw-r--r--kernel/irq/irqdomain.c9
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/jump_label.c8
-rw-r--r--kernel/locking/mutex.c25
-rw-r--r--kernel/reboot.c2
-rw-r--r--kernel/sched/core.c126
-rw-r--r--kernel/sched/membarrier.c4
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/static_call.c47
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/hrtimer.c62
-rw-r--r--kernel/time/posix-cpu-timers.c2
-rw-r--r--kernel/trace/ftrace.c43
-rw-r--r--kernel/usermode_driver.c21
28 files changed, 557 insertions, 221 deletions
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index da753721457c..2921ca39a93e 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -109,7 +109,7 @@ static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
fd = *(int *)key;
f = fget_raw(fd);
if (!f)
- return NULL;
+ return ERR_PTR(-EBADF);
sdata = inode_storage_lookup(f->f_inode, map, true);
fput(f);
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 1a666a975416..70f6fd4fa305 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -430,7 +430,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
- err = arch_prepare_bpf_trampoline(image,
+ err = arch_prepare_bpf_trampoline(NULL, image,
st_map->image + PAGE_SIZE,
&st_ops->func_models[i], 0,
tprogs, NULL);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3a283bf97f2f..75244ecb2389 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -827,7 +827,7 @@ static int __init bpf_jit_charge_init(void)
}
pure_initcall(bpf_jit_charge_init);
-static int bpf_jit_charge_modmem(u32 pages)
+int bpf_jit_charge_modmem(u32 pages)
{
if (atomic_long_add_return(pages, &bpf_jit_current) >
(bpf_jit_limit >> PAGE_SHIFT)) {
@@ -840,7 +840,7 @@ static int bpf_jit_charge_modmem(u32 pages)
return 0;
}
-static void bpf_jit_uncharge_modmem(u32 pages)
+void bpf_jit_uncharge_modmem(u32 pages)
{
atomic_long_sub(pages, &bpf_jit_current);
}
diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c
index 79c5772465f1..53736e52c1df 100644
--- a/kernel/bpf/preload/bpf_preload_kern.c
+++ b/kernel/bpf/preload/bpf_preload_kern.c
@@ -60,9 +60,12 @@ static int finish(void)
&magic, sizeof(magic), &pos);
if (n != sizeof(magic))
return -EPIPE;
+
tgid = umd_ops.info.tgid;
- wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
- umd_ops.info.tgid = NULL;
+ if (tgid) {
+ wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+ umd_cleanup_helper(&umd_ops.info);
+ }
return 0;
}
@@ -80,10 +83,18 @@ static int __init load_umd(void)
static void __exit fini_umd(void)
{
+ struct pid *tgid;
+
bpf_preload_ops = NULL;
+
/* kill UMD in case it's still there due to earlier error */
- kill_pid(umd_ops.info.tgid, SIGKILL, 1);
- umd_ops.info.tgid = NULL;
+ tgid = umd_ops.info.tgid;
+ if (tgid) {
+ kill_pid(tgid, SIGKILL, 1);
+
+ wait_event(tgid->wait_pidfd, thread_group_exited(tgid));
+ umd_cleanup_helper(&umd_ops.info);
+ }
umd_unload_blob(&umd_ops.info);
}
late_initcall(load_umd);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c859bc46d06c..250503482cda 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -854,6 +854,11 @@ static int map_create(union bpf_attr *attr)
err = PTR_ERR(btf);
goto free_map;
}
+ if (btf_is_kernel(btf)) {
+ btf_put(btf);
+ err = -EACCES;
+ goto free_map;
+ }
map->btf = btf;
if (attr->btf_value_type_id) {
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 7bc3b3209224..1f3a4be4b175 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -57,19 +57,10 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym)
PAGE_SIZE, true, ksym->name);
}
-static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr)
-{
- struct bpf_ksym *ksym = &tr->ksym;
-
- snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key);
- bpf_image_ksym_add(tr->image, ksym);
-}
-
static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
struct bpf_trampoline *tr;
struct hlist_head *head;
- void *image;
int i;
mutex_lock(&trampoline_mutex);
@@ -84,14 +75,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
if (!tr)
goto out;
- /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */
- image = bpf_jit_alloc_exec_page();
- if (!image) {
- kfree(tr);
- tr = NULL;
- goto out;
- }
-
tr->key = key;
INIT_HLIST_NODE(&tr->hlist);
hlist_add_head(&tr->hlist, head);
@@ -99,9 +82,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
mutex_init(&tr->mutex);
for (i = 0; i < BPF_TRAMP_MAX; i++)
INIT_HLIST_HEAD(&tr->progs_hlist[i]);
- tr->image = image;
- INIT_LIST_HEAD_RCU(&tr->ksym.lnode);
- bpf_trampoline_ksym_add(tr);
out:
mutex_unlock(&trampoline_mutex);
return tr;
@@ -185,10 +165,142 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
return tprogs;
}
+static void __bpf_tramp_image_put_deferred(struct work_struct *work)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(work, struct bpf_tramp_image, work);
+ bpf_image_ksym_del(&im->ksym);
+ bpf_jit_free_exec(im->image);
+ bpf_jit_uncharge_modmem(1);
+ percpu_ref_exit(&im->pcref);
+ kfree_rcu(im, rcu);
+}
+
+/* callback, fexit step 3 or fentry step 2 */
+static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(rcu, struct bpf_tramp_image, rcu);
+ INIT_WORK(&im->work, __bpf_tramp_image_put_deferred);
+ schedule_work(&im->work);
+}
+
+/* callback, fexit step 2. Called after percpu_ref_kill confirms. */
+static void __bpf_tramp_image_release(struct percpu_ref *pcref)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(pcref, struct bpf_tramp_image, pcref);
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+/* callback, fexit or fentry step 1 */
+static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu)
+{
+ struct bpf_tramp_image *im;
+
+ im = container_of(rcu, struct bpf_tramp_image, rcu);
+ if (im->ip_after_call)
+ /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
+ percpu_ref_kill(&im->pcref);
+ else
+ /* the case of fentry trampoline */
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
+}
+
+static void bpf_tramp_image_put(struct bpf_tramp_image *im)
+{
+ /* The trampoline image that calls original function is using:
+ * rcu_read_lock_trace to protect sleepable bpf progs
+ * rcu_read_lock to protect normal bpf progs
+ * percpu_ref to protect trampoline itself
+ * rcu tasks to protect trampoline asm not covered by percpu_ref
+ * (which are few asm insns before __bpf_tramp_enter and
+ * after __bpf_tramp_exit)
+ *
+ * The trampoline is unreachable before bpf_tramp_image_put().
+ *
+ * First, patch the trampoline to avoid calling into fexit progs.
+ * The progs will be freed even if the original function is still
+ * executing or sleeping.
+ * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on
+ * first few asm instructions to execute and call into
+ * __bpf_tramp_enter->percpu_ref_get.
+ * Then use percpu_ref_kill to wait for the trampoline and the original
+ * function to finish.
+ * Then use call_rcu_tasks() to make sure few asm insns in
+ * the trampoline epilogue are done as well.
+ *
+ * In !PREEMPT case the task that got interrupted in the first asm
+ * insns won't go through an RCU quiescent state which the
+ * percpu_ref_kill will be waiting for. Hence the first
+ * call_rcu_tasks() is not necessary.
+ */
+ if (im->ip_after_call) {
+ int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
+ NULL, im->ip_epilogue);
+ WARN_ON(err);
+ if (IS_ENABLED(CONFIG_PREEMPTION))
+ call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+ else
+ percpu_ref_kill(&im->pcref);
+ return;
+ }
+
+ /* The trampoline without fexit and fmod_ret progs doesn't call original
+ * function and doesn't use percpu_ref.
+ * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
+ * Then use call_rcu_tasks() to wait for the rest of trampoline asm
+ * and normal progs.
+ */
+ call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
+}
+
+static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
+{
+ struct bpf_tramp_image *im;
+ struct bpf_ksym *ksym;
+ void *image;
+ int err = -ENOMEM;
+
+ im = kzalloc(sizeof(*im), GFP_KERNEL);
+ if (!im)
+ goto out;
+
+ err = bpf_jit_charge_modmem(1);
+ if (err)
+ goto out_free_im;
+
+ err = -ENOMEM;
+ im->image = image = bpf_jit_alloc_exec_page();
+ if (!image)
+ goto out_uncharge;
+
+ err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
+ if (err)
+ goto out_free_image;
+
+ ksym = &im->ksym;
+ INIT_LIST_HEAD_RCU(&ksym->lnode);
+ snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
+ bpf_image_ksym_add(image, ksym);
+ return im;
+
+out_free_image:
+ bpf_jit_free_exec(im->image);
+out_uncharge:
+ bpf_jit_uncharge_modmem(1);
+out_free_im:
+ kfree(im);
+out:
+ return ERR_PTR(err);
+}
+
static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
- void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2;
- void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2;
+ struct bpf_tramp_image *im;
struct bpf_tramp_progs *tprogs;
u32 flags = BPF_TRAMP_F_RESTORE_REGS;
int err, total;
@@ -198,41 +310,42 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
return PTR_ERR(tprogs);
if (total == 0) {
- err = unregister_fentry(tr, old_image);
+ err = unregister_fentry(tr, tr->cur_image->image);
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = NULL;
tr->selector = 0;
goto out;
}
+ im = bpf_tramp_image_alloc(tr->key, tr->selector);
+ if (IS_ERR(im)) {
+ err = PTR_ERR(im);
+ goto out;
+ }
+
if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
- /* Though the second half of trampoline page is unused a task could be
- * preempted in the middle of the first half of trampoline and two
- * updates to trampoline would change the code from underneath the
- * preempted task. Hence wait for tasks to voluntarily schedule or go
- * to userspace.
- * The same trampoline can hold both sleepable and non-sleepable progs.
- * synchronize_rcu_tasks_trace() is needed to make sure all sleepable
- * programs finish executing.
- * Wait for these two grace periods together.
- */
- synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace);
-
- err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2,
+ err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
&tr->func.model, flags, tprogs,
tr->func.addr);
if (err < 0)
goto out;
- if (tr->selector)
+ WARN_ON(tr->cur_image && tr->selector == 0);
+ WARN_ON(!tr->cur_image && tr->selector);
+ if (tr->cur_image)
/* progs already running at this address */
- err = modify_fentry(tr, old_image, new_image);
+ err = modify_fentry(tr, tr->cur_image->image, im->image);
else
/* first time registering */
- err = register_fentry(tr, new_image);
+ err = register_fentry(tr, im->image);
if (err)
goto out;
+ if (tr->cur_image)
+ bpf_tramp_image_put(tr->cur_image);
+ tr->cur_image = im;
tr->selector++;
out:
kfree(tprogs);
@@ -364,17 +477,12 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
goto out;
if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT])))
goto out;
- bpf_image_ksym_del(&tr->ksym);
- /* This code will be executed when all bpf progs (both sleepable and
- * non-sleepable) went through
- * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred().
- * Hence no need for another synchronize_rcu_tasks_trace() here,
- * but synchronize_rcu_tasks() is still needed, since trampoline
- * may not have had any sleepable programs and we need to wait
- * for tasks to get out of trampoline code before freeing it.
+ /* This code will be executed even when the last bpf_tramp_image
+ * is alive. All progs are detached from the trampoline and the
+ * trampoline image is patched with jmp into epilogue to skip
+ * fexit progs. The fentry-only trampoline will be freed via
+ * multiple rcu callbacks.
*/
- synchronize_rcu_tasks();
- bpf_jit_free_exec(tr->image);
hlist_del(&tr->hlist);
kfree(tr);
out:
@@ -478,8 +586,18 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start)
rcu_read_unlock_trace();
}
+void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
+{
+ percpu_ref_get(&tr->pcref);
+}
+
+void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
+{
+ percpu_ref_put(&tr->pcref);
+}
+
int __weak
-arch_prepare_bpf_trampoline(void *image, void *image_end,
+arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_tramp_progs *tprogs,
void *orig_call)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f9096b049cd6..999bf36ffeb1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -6045,10 +6045,14 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
{
bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
(opcode == BPF_SUB && !off_is_neg);
- u32 off;
+ u32 off, max;
switch (ptr_reg->type) {
case PTR_TO_STACK:
+ /* Offset 0 is out-of-bounds, but acceptable start for the
+ * left direction, see BPF_REG_FP.
+ */
+ max = MAX_BPF_STACK + mask_to_left;
/* Indirect variable offset stack access is prohibited in
* unprivileged mode so it's not handled here.
*/
@@ -6056,8 +6060,8 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
if (mask_to_left)
*ptr_limit = MAX_BPF_STACK + off;
else
- *ptr_limit = -off;
- return 0;
+ *ptr_limit = -off - 1;
+ return *ptr_limit >= max ? -ERANGE : 0;
case PTR_TO_MAP_KEY:
/* Currently, this code is not exercised as the only use
* is bpf_for_each_map_elem() helper which requires
@@ -6072,13 +6076,14 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
}
return 0;
case PTR_TO_MAP_VALUE:
+ max = ptr_reg->map_ptr->value_size;
if (mask_to_left) {
*ptr_limit = ptr_reg->umax_value + ptr_reg->off;
} else {
off = ptr_reg->smin_value + ptr_reg->off;
- *ptr_limit = ptr_reg->map_ptr->value_size - off;
+ *ptr_limit = ptr_reg->map_ptr->value_size - off - 1;
}
- return 0;
+ return *ptr_limit >= max ? -ERANGE : 0;
default:
return -EINVAL;
}
@@ -6131,6 +6136,7 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
u32 alu_state, alu_limit;
struct bpf_reg_state tmp;
bool ret;
+ int err;
if (can_skip_alu_sanitation(env, insn))
return 0;
@@ -6146,10 +6152,13 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env,
alu_state |= ptr_is_dst_reg ?
BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
- if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg))
- return 0;
- if (update_alu_sanitation_state(aux, alu_state, alu_limit))
- return -EACCES;
+ err = retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg);
+ if (err < 0)
+ return err;
+
+ err = update_alu_sanitation_state(aux, alu_state, alu_limit);
+ if (err < 0)
+ return err;
do_sim:
/* Simulate and find potential out-of-bounds access under
* speculative execution from truncation as a result of
@@ -6301,7 +6310,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case BPF_ADD:
ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
if (ret < 0) {
- verbose(env, "R%d tried to add from different maps or paths\n", dst);
+ verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst);
return ret;
}
/* We can take a fixed offset as long as it doesn't overflow
@@ -6356,7 +6365,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
case BPF_SUB:
ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
if (ret < 0) {
- verbose(env, "R%d tried to sub from different maps or paths\n", dst);
+ verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst);
return ret;
}
if (dst_reg == off_reg) {
@@ -9276,6 +9285,10 @@ static int check_btf_info(struct bpf_verifier_env *env,
btf = btf_get_by_fd(attr->prog_btf_fd);
if (IS_ERR(btf))
return PTR_ERR(btf);
+ if (btf_is_kernel(btf)) {
+ btf_put(btf);
+ return -EACCES;
+ }
env->prog->aux->btf = btf;
err = check_btf_func(env, attr, uattr);
@@ -11916,7 +11929,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
off_reg = issrc ? insn->src_reg : insn->dst_reg;
if (isneg)
*patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
- *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
*patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
*patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0aeca5f3c0ac..03db40f6cba9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -386,6 +386,7 @@ static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -3461,11 +3462,16 @@ unlock:
}
}
+static DEFINE_PER_CPU(struct list_head, sched_cb_list);
+
void perf_sched_cb_dec(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- --cpuctx->sched_cb_usage;
+ this_cpu_dec(perf_sched_cb_usages);
+
+ if (!--cpuctx->sched_cb_usage)
+ list_del(&cpuctx->sched_cb_entry);
}
@@ -3473,7 +3479,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
- cpuctx->sched_cb_usage++;
+ if (!cpuctx->sched_cb_usage++)
+ list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
+
+ this_cpu_inc(perf_sched_cb_usages);
}
/*
@@ -3502,6 +3511,24 @@ static void __perf_pmu_sched_task(struct perf_cpu_context *cpuctx, bool sched_in
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
+static void perf_pmu_sched_task(struct task_struct *prev,
+ struct task_struct *next,
+ bool sched_in)
+{
+ struct perf_cpu_context *cpuctx;
+
+ if (prev == next)
+ return;
+
+ list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
+ /* will be handled in perf_event_context_sched_in/out */
+ if (cpuctx->task_ctx)
+ continue;
+
+ __perf_pmu_sched_task(cpuctx, sched_in);
+ }
+}
+
static void perf_event_switch(struct task_struct *task,
struct task_struct *next_prev, bool sched_in);
@@ -3524,6 +3551,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
{
int ctxn;
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(task, next, false);
+
if (atomic_read(&nr_switch_events))
perf_event_switch(task, next, false);
@@ -3832,6 +3862,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
+
+ if (__this_cpu_read(perf_sched_cb_usages))
+ perf_pmu_sched_task(prev, task, true);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -4656,7 +4689,7 @@ static void unaccount_event(struct perf_event *event)
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
@@ -11175,7 +11208,7 @@ static void account_event(struct perf_event *event)
if (event->parent)
return;
- if (event->attach_state & PERF_ATTACH_TASK)
+ if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
@@ -12972,6 +13005,7 @@ static void __init perf_event_init_all_cpus(void)
#ifdef CONFIG_CGROUP_PERF
INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
#endif
+ INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
}
}
diff --git a/kernel/fork.c b/kernel/fork.c
index b94391a58708..50209691f21a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -996,6 +996,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
+static void mm_init_pasid(struct mm_struct *mm)
+{
+#ifdef CONFIG_IOMMU_SUPPORT
+ mm->pasid = INIT_PASID;
+#endif
+}
+
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
@@ -1026,6 +1033,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ mm_init_pasid(mm);
RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_subscriptions_init(mm);
init_tlb_flush_pending(mm);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index dc520f01f99d..1a2d57d1327c 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -134,7 +134,7 @@ bool freeze_task(struct task_struct *p)
return false;
}
- if (!(p->flags & PF_KTHREAD))
+ if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))
fake_signal_wake_up(p);
else
wake_up_state(p, TASK_INTERRUPTIBLE);
diff --git a/kernel/futex.c b/kernel/futex.c
index e68db7745039..00febd6dea9c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2728,14 +2728,13 @@ retry:
goto out;
restart = &current->restart_block;
- restart->fn = futex_wait_restart;
restart->futex.uaddr = uaddr;
restart->futex.val = val;
restart->futex.time = *abs_time;
restart->futex.bitset = bitset;
restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
- ret = -ERESTART_RESTARTBLOCK;
+ ret = set_restart_fn(restart, futex_wait_restart);
out:
if (to) {
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
index c94b820a1b62..8743150db2ac 100644
--- a/kernel/gcov/clang.c
+++ b/kernel/gcov/clang.c
@@ -75,7 +75,9 @@ struct gcov_fn_info {
u32 num_counters;
u64 *counters;
+#if CONFIG_CLANG_VERSION < 110000
const char *function_name;
+#endif
};
static struct gcov_info *current_info;
@@ -105,6 +107,7 @@ void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush)
}
EXPORT_SYMBOL(llvm_gcov_init);
+#if CONFIG_CLANG_VERSION < 110000
void llvm_gcda_start_file(const char *orig_filename, const char version[4],
u32 checksum)
{
@@ -113,7 +116,17 @@ void llvm_gcda_start_file(const char *orig_filename, const char version[4],
current_info->checksum = checksum;
}
EXPORT_SYMBOL(llvm_gcda_start_file);
+#else
+void llvm_gcda_start_file(const char *orig_filename, u32 version, u32 checksum)
+{
+ current_info->filename = orig_filename;
+ current_info->version = version;
+ current_info->checksum = checksum;
+}
+EXPORT_SYMBOL(llvm_gcda_start_file);
+#endif
+#if CONFIG_CLANG_VERSION < 110000
void llvm_gcda_emit_function(u32 ident, const char *function_name,
u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum)
{
@@ -133,6 +146,24 @@ void llvm_gcda_emit_function(u32 ident, const char *function_name,
list_add_tail(&info->head, &current_info->functions);
}
EXPORT_SYMBOL(llvm_gcda_emit_function);
+#else
+void llvm_gcda_emit_function(u32 ident, u32 func_checksum,
+ u8 use_extra_checksum, u32 cfg_checksum)
+{
+ struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL);
+
+ if (!info)
+ return;
+
+ INIT_LIST_HEAD(&info->head);
+ info->ident = ident;
+ info->checksum = func_checksum;
+ info->use_extra_checksum = use_extra_checksum;
+ info->cfg_checksum = cfg_checksum;
+ list_add_tail(&info->head, &current_info->functions);
+}
+EXPORT_SYMBOL(llvm_gcda_emit_function);
+#endif
void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters)
{
@@ -295,6 +326,7 @@ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
}
}
+#if CONFIG_CLANG_VERSION < 110000
static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
{
size_t cv_size; /* counter values size */
@@ -322,6 +354,28 @@ err_name:
kfree(fn_dup);
return NULL;
}
+#else
+static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn)
+{
+ size_t cv_size; /* counter values size */
+ struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn),
+ GFP_KERNEL);
+ if (!fn_dup)
+ return NULL;
+ INIT_LIST_HEAD(&fn_dup->head);
+
+ cv_size = fn->num_counters * sizeof(fn->counters[0]);
+ fn_dup->counters = vmalloc(cv_size);
+ if (!fn_dup->counters) {
+ kfree(fn_dup);
+ return NULL;
+ }
+
+ memcpy(fn_dup->counters, fn->counters, cv_size);
+
+ return fn_dup;
+}
+#endif
/**
* gcov_info_dup - duplicate profiling data set
@@ -362,6 +416,7 @@ err:
* gcov_info_free - release memory for profiling data set duplicate
* @info: profiling data set duplicate to free
*/
+#if CONFIG_CLANG_VERSION < 110000
void gcov_info_free(struct gcov_info *info)
{
struct gcov_fn_info *fn, *tmp;
@@ -375,6 +430,20 @@ void gcov_info_free(struct gcov_info *info)
kfree(info->filename);
kfree(info);
}
+#else
+void gcov_info_free(struct gcov_info *info)
+{
+ struct gcov_fn_info *fn, *tmp;
+
+ list_for_each_entry_safe(fn, tmp, &info->functions, head) {
+ vfree(fn->counters);
+ list_del(&fn->head);
+ kfree(fn);
+ }
+ kfree(info->filename);
+ kfree(info);
+}
+#endif
#define ITER_STRIDE PAGE_SIZE
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 48006608baf0..40880c350b95 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -159,7 +159,7 @@ static const struct irq_domain_ops irq_sim_domain_ops = {
* irq_domain_create_sim - Create a new interrupt simulator irq_domain and
* allocate a range of dummy interrupts.
*
- * @fnode: struct fwnode_handle to be associated with this domain.
+ * @fwnode: struct fwnode_handle to be associated with this domain.
* @num_irqs: Number of interrupts to allocate.
*
* On success: return a new irq_domain object.
@@ -228,7 +228,7 @@ static void devm_irq_domain_release_sim(struct device *dev, void *res)
* a managed device.
*
* @dev: Device to initialize the simulator object for.
- * @fnode: struct fwnode_handle to be associated with this domain.
+ * @fwnode: struct fwnode_handle to be associated with this domain.
* @num_irqs: Number of interrupts to allocate
*
* On success: return a new irq_domain object.
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 288151393a06..d10ab1d689d5 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1898,16 +1898,15 @@ DEFINE_SHOW_ATTRIBUTE(irq_domain_debug);
static void debugfs_add_domain_dir(struct irq_domain *d)
{
- if (!d->name || !domain_dir || d->debugfs_file)
+ if (!d->name || !domain_dir)
return;
- d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d,
- &irq_domain_debug_fops);
+ debugfs_create_file(d->name, 0444, domain_dir, d,
+ &irq_domain_debug_fops);
}
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- debugfs_remove(d->debugfs_file);
- d->debugfs_file = NULL;
+ debugfs_remove(debugfs_lookup(d->name, domain_dir));
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index dec3f73e8db9..21ea370fccda 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1142,11 +1142,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
irqreturn_t ret;
local_bh_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
ret = action->thread_fn(action->irq, action->dev_id);
if (ret == IRQ_HANDLED)
atomic_inc(&desc->threads_handled);
irq_finalize_oneshot(desc, action);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
local_bh_enable();
return ret;
}
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index c6a39d662935..ba39fbb1f8e7 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -407,6 +407,14 @@ static bool jump_label_can_update(struct jump_entry *entry, bool init)
return false;
if (!kernel_text_address(jump_entry_code(entry))) {
+ /*
+ * This skips patching built-in __exit, which
+ * is part of init_section_contains() but is
+ * not part of kernel_text_address().
+ *
+ * Skipping built-in __exit is fine since it
+ * will never be executed.
+ */
WARN_ONCE(!jump_entry_is_init(entry),
"can't patch jump_label at %pS",
(void *)jump_entry_code(entry));
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index adb935090768..622ebdfcd083 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -626,7 +626,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
*/
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
if (!waiter) {
/*
@@ -702,7 +702,7 @@ fail:
#else
static __always_inline bool
mutex_optimistic_spin(struct mutex *lock, struct ww_acquire_ctx *ww_ctx,
- const bool use_ww_ctx, struct mutex_waiter *waiter)
+ struct mutex_waiter *waiter)
{
return false;
}
@@ -922,6 +922,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
struct ww_mutex *ww;
int ret;
+ if (!use_ww_ctx)
+ ww_ctx = NULL;
+
might_sleep();
#ifdef CONFIG_DEBUG_MUTEXES
@@ -929,7 +932,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
#endif
ww = container_of(lock, struct ww_mutex, base);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
return -EALREADY;
@@ -946,10 +949,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
if (__mutex_trylock(lock) ||
- mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, NULL)) {
+ mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
preempt_enable();
return 0;
@@ -960,7 +963,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* After waiting to acquire the wait_lock, try again.
*/
if (__mutex_trylock(lock)) {
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
__ww_mutex_check_waiters(lock, ww_ctx);
goto skip_wait;
@@ -1013,7 +1016,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
goto err;
}
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
ret = __ww_mutex_check_kill(lock, &waiter, ww_ctx);
if (ret)
goto err;
@@ -1026,7 +1029,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* ww_mutex needs to always recheck its position since its waiter
* list is not FIFO ordered.
*/
- if ((use_ww_ctx && ww_ctx) || !first) {
+ if (ww_ctx || !first) {
first = __mutex_waiter_is_first(lock, &waiter);
if (first)
__mutex_set_flag(lock, MUTEX_FLAG_HANDOFF);
@@ -1039,7 +1042,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* or we must see its unlock and acquire.
*/
if (__mutex_trylock(lock) ||
- (first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, &waiter)))
+ (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
break;
spin_lock(&lock->wait_lock);
@@ -1048,7 +1051,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
acquired:
__set_current_state(TASK_RUNNING);
- if (use_ww_ctx && ww_ctx) {
+ if (ww_ctx) {
/*
* Wound-Wait; we stole the lock (!first_waiter), check the
* waiters as anyone might want to wound us.
@@ -1068,7 +1071,7 @@ skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
- if (use_ww_ctx && ww_ctx)
+ if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
spin_unlock(&lock->wait_lock);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index eb1b15850761..a6ad5eb2fa73 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -244,8 +244,6 @@ void migrate_to_reboot_cpu(void)
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
- if (pm_power_off_prepare)
- pm_power_off_prepare();
migrate_to_reboot_cpu();
syscore_shutdown();
if (!cmd)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ca2bb629595f..98191218d891 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1862,8 +1862,13 @@ struct migration_arg {
struct set_affinity_pending *pending;
};
+/*
+ * @refs: number of wait_for_completion()
+ * @stop_pending: is @stop_work in use
+ */
struct set_affinity_pending {
refcount_t refs;
+ unsigned int stop_pending;
struct completion done;
struct cpu_stop_work stop_work;
struct migration_arg arg;
@@ -1898,8 +1903,8 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
*/
static int migration_cpu_stop(void *data)
{
- struct set_affinity_pending *pending;
struct migration_arg *arg = data;
+ struct set_affinity_pending *pending = arg->pending;
struct task_struct *p = arg->task;
int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
@@ -1921,7 +1926,6 @@ static int migration_cpu_stop(void *data)
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
- pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
@@ -1932,21 +1936,14 @@ static int migration_cpu_stop(void *data)
goto out;
if (pending) {
- p->migration_pending = NULL;
+ if (p->migration_pending == pending)
+ p->migration_pending = NULL;
complete = true;
}
- /* migrate_enable() -- we must not race against SCA */
if (dest_cpu < 0) {
- /*
- * When this was migrate_enable() but we no longer
- * have a @pending, a concurrent SCA 'fixed' things
- * and we should be valid again. Nothing to do.
- */
- if (!pending) {
- WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
+ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
goto out;
- }
dest_cpu = cpumask_any_distribute(&p->cpus_mask);
}
@@ -1956,7 +1953,14 @@ static int migration_cpu_stop(void *data)
else
p->wake_cpu = dest_cpu;
- } else if (dest_cpu < 0 || pending) {
+ /*
+ * XXX __migrate_task() can fail, at which point we might end
+ * up running on a dodgy CPU, AFAICT this can only happen
+ * during CPU hotplug, at which point we'll get pushed out
+ * anyway, so it's probably not a big deal.
+ */
+
+ } else if (pending) {
/*
* This happens when we get migrated between migrate_enable()'s
* preempt_enable() and scheduling the stopper task. At that
@@ -1971,43 +1975,32 @@ static int migration_cpu_stop(void *data)
* ->pi_lock, so the allowed mask is stable - if it got
* somewhere allowed, we're done.
*/
- if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
- p->migration_pending = NULL;
+ if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+ if (p->migration_pending == pending)
+ p->migration_pending = NULL;
complete = true;
goto out;
}
/*
- * When this was migrate_enable() but we no longer have an
- * @pending, a concurrent SCA 'fixed' things and we should be
- * valid again. Nothing to do.
- */
- if (!pending) {
- WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
- goto out;
- }
-
- /*
* When migrate_enable() hits a rq mis-match we can't reliably
* determine is_migration_disabled() and so have to chase after
* it.
*/
+ WARN_ON_ONCE(!pending->stop_pending);
task_rq_unlock(rq, p, &rf);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
return 0;
}
out:
+ if (pending)
+ pending->stop_pending = false;
task_rq_unlock(rq, p, &rf);
if (complete)
complete_all(&pending->done);
- /* For pending->{arg,stop_work} */
- pending = arg->pending;
- if (pending && refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
-
return 0;
}
@@ -2194,11 +2187,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
int dest_cpu, unsigned int flags)
{
struct set_affinity_pending my_pending = { }, *pending = NULL;
- struct migration_arg arg = {
- .task = p,
- .dest_cpu = dest_cpu,
- };
- bool complete = false;
+ bool stop_pending, complete = false;
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
@@ -2210,12 +2199,16 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
push_task = get_task_struct(p);
}
+ /*
+ * If there are pending waiters, but no pending stop_work,
+ * then complete now.
+ */
pending = p->migration_pending;
- if (pending) {
- refcount_inc(&pending->refs);
+ if (pending && !pending->stop_pending) {
p->migration_pending = NULL;
complete = true;
}
+
task_rq_unlock(rq, p, rf);
if (push_task) {
@@ -2224,7 +2217,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
}
if (complete)
- goto do_complete;
+ complete_all(&pending->done);
return 0;
}
@@ -2235,6 +2228,12 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
/* Install the request */
refcount_set(&my_pending.refs, 1);
init_completion(&my_pending.done);
+ my_pending.arg = (struct migration_arg) {
+ .task = p,
+ .dest_cpu = -1, /* any */
+ .pending = &my_pending,
+ };
+
p->migration_pending = &my_pending;
} else {
pending = p->migration_pending;
@@ -2259,45 +2258,41 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
return -EINVAL;
}
- if (flags & SCA_MIGRATE_ENABLE) {
-
- refcount_inc(&pending->refs); /* pending->{arg,stop_work} */
- p->migration_flags &= ~MDF_PUSH;
- task_rq_unlock(rq, p, rf);
-
- pending->arg = (struct migration_arg) {
- .task = p,
- .dest_cpu = -1,
- .pending = pending,
- };
-
- stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
- &pending->arg, &pending->stop_work);
-
- return 0;
- }
-
if (task_running(rq, p) || p->state == TASK_WAKING) {
/*
- * Lessen races (and headaches) by delegating
- * is_migration_disabled(p) checks to the stopper, which will
- * run on the same CPU as said p.
+ * MIGRATE_ENABLE gets here because 'p == current', but for
+ * anything else we cannot do is_migration_disabled(), punt
+ * and have the stopper function handle it all race-free.
*/
+ stop_pending = pending->stop_pending;
+ if (!stop_pending)
+ pending->stop_pending = true;
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ p->migration_flags &= ~MDF_PUSH;
+
task_rq_unlock(rq, p, rf);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ if (!stop_pending) {
+ stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ }
+
+ if (flags & SCA_MIGRATE_ENABLE)
+ return 0;
} else {
if (!is_migration_disabled(p)) {
if (task_on_rq_queued(p))
rq = move_queued_task(rq, rf, p, dest_cpu);
- p->migration_pending = NULL;
- complete = true;
+ if (!pending->stop_pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
}
task_rq_unlock(rq, p, rf);
-do_complete:
if (complete)
complete_all(&pending->done);
}
@@ -2305,7 +2300,7 @@ do_complete:
wait_for_completion(&pending->done);
if (refcount_dec_and_test(&pending->refs))
- wake_up_var(&pending->refs);
+ wake_up_var(&pending->refs); /* No UaF, just an address */
/*
* Block the original owner of &pending until all subsequent callers
@@ -2313,6 +2308,9 @@ do_complete:
*/
wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));
+ /* ARGH */
+ WARN_ON_ONCE(my_pending.stop_pending);
+
return 0;
}
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index acdae625c636..b5add64d9698 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -471,9 +471,7 @@ static int sync_runqueues_membarrier_state(struct mm_struct *mm)
}
rcu_read_unlock();
- preempt_disable();
- smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
- preempt_enable();
+ on_each_cpu_mask(tmpmask, ipi_sync_rq_state, mm, true);
free_cpumask_var(tmpmask);
cpus_read_unlock();
diff --git a/kernel/signal.c b/kernel/signal.c
index ba4d1ef39a9e..f2a1b898da29 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -288,7 +288,8 @@ bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING));
BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));
- if (unlikely(fatal_signal_pending(task) || (task->flags & PF_EXITING)))
+ if (unlikely(fatal_signal_pending(task) ||
+ (task->flags & (PF_EXITING | PF_IO_WORKER))))
return false;
if (mask & JOBCTL_STOP_SIGMASK)
@@ -833,6 +834,9 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info,
if (!valid_signal(sig))
return -EINVAL;
+ /* PF_IO_WORKER threads don't take any signals */
+ if (t->flags & PF_IO_WORKER)
+ return -ESRCH;
if (!si_fromuser(info))
return 0;
diff --git a/kernel/static_call.c b/kernel/static_call.c
index 6906c6ec4c97..2c5950b0b90e 100644
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -35,27 +35,30 @@ static inline void *static_call_addr(struct static_call_site *site)
return (void *)((long)site->addr + (long)&site->addr);
}
+static inline unsigned long __static_call_key(const struct static_call_site *site)
+{
+ return (long)site->key + (long)&site->key;
+}
static inline struct static_call_key *static_call_key(const struct static_call_site *site)
{
- return (struct static_call_key *)
- (((long)site->key + (long)&site->key) & ~STATIC_CALL_SITE_FLAGS);
+ return (void *)(__static_call_key(site) & ~STATIC_CALL_SITE_FLAGS);
}
/* These assume the key is word-aligned. */
static inline bool static_call_is_init(struct static_call_site *site)
{
- return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_INIT;
+ return __static_call_key(site) & STATIC_CALL_SITE_INIT;
}
static inline bool static_call_is_tail(struct static_call_site *site)
{
- return ((long)site->key + (long)&site->key) & STATIC_CALL_SITE_TAIL;
+ return __static_call_key(site) & STATIC_CALL_SITE_TAIL;
}
static inline void static_call_set_init(struct static_call_site *site)
{
- site->key = ((long)static_call_key(site) | STATIC_CALL_SITE_INIT) -
+ site->key = (__static_call_key(site) | STATIC_CALL_SITE_INIT) -
(long)&site->key;
}
@@ -146,6 +149,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
};
for (site_mod = &first; site_mod; site_mod = site_mod->next) {
+ bool init = system_state < SYSTEM_RUNNING;
struct module *mod = site_mod->mod;
if (!site_mod->sites) {
@@ -165,6 +169,7 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
if (mod) {
stop = mod->static_call_sites +
mod->num_static_call_sites;
+ init = mod->state == MODULE_STATE_COMING;
}
#endif
@@ -172,25 +177,26 @@ void __static_call_update(struct static_call_key *key, void *tramp, void *func)
site < stop && static_call_key(site) == key; site++) {
void *site_addr = static_call_addr(site);
- if (static_call_is_init(site)) {
- /*
- * Don't write to call sites which were in
- * initmem and have since been freed.
- */
- if (!mod && system_state >= SYSTEM_RUNNING)
- continue;
- if (mod && !within_module_init((unsigned long)site_addr, mod))
- continue;
- }
+ if (!init && static_call_is_init(site))
+ continue;
if (!kernel_text_address((unsigned long)site_addr)) {
- WARN_ONCE(1, "can't patch static call site at %pS",
+ /*
+ * This skips patching built-in __exit, which
+ * is part of init_section_contains() but is
+ * not part of kernel_text_address().
+ *
+ * Skipping built-in __exit is fine since it
+ * will never be executed.
+ */
+ WARN_ONCE(!static_call_is_init(site),
+ "can't patch static call site at %pS",
site_addr);
continue;
}
arch_static_call_transform(site_addr, NULL, func,
- static_call_is_tail(site));
+ static_call_is_tail(site));
}
}
@@ -349,7 +355,8 @@ static int static_call_add_module(struct module *mod)
struct static_call_site *site;
for (site = start; site != stop; site++) {
- unsigned long addr = (unsigned long)static_call_key(site);
+ unsigned long s_key = __static_call_key(site);
+ unsigned long addr = s_key & ~STATIC_CALL_SITE_FLAGS;
unsigned long key;
/*
@@ -373,8 +380,8 @@ static int static_call_add_module(struct module *mod)
return -EINVAL;
}
- site->key = (key - (long)&site->key) |
- (site->key & STATIC_CALL_SITE_FLAGS);
+ key |= s_key & STATIC_CALL_SITE_FLAGS;
+ site->key = key - (long)&site->key;
}
return __static_call_init(mod, start, stop);
diff --git a/kernel/sys.c b/kernel/sys.c
index b09fe21e88ff..2e2e3f378d97 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2079,7 +2079,7 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
* up to the caller to provide sane values here, otherwise userspace
* tools which use this vector might be unhappy.
*/
- unsigned long user_auxv[AT_VECTOR_SIZE];
+ unsigned long user_auxv[AT_VECTOR_SIZE] = {};
if (len > sizeof(user_auxv))
return -EINVAL;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 98d7a15e8cf6..4d94e2b5499d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -854,9 +854,9 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
if (flags == TIMER_ABSTIME)
return -ERESTARTNOHAND;
- restart->fn = alarm_timer_nsleep_restart;
restart->nanosleep.clockid = type;
restart->nanosleep.expires = exp;
+ set_restart_fn(restart, alarm_timer_nsleep_restart);
return ret;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 743c852e10f2..5c9d968187ae 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -546,8 +546,11 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
}
/*
- * Recomputes cpu_base::*next_timer and returns the earliest expires_next but
- * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram.
+ * Recomputes cpu_base::*next_timer and returns the earliest expires_next
+ * but does not set cpu_base::*expires_next, that is done by
+ * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
+ * cpu_base::*expires_next right away, reprogramming logic would no longer
+ * work.
*
* When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
* those timers will get run whenever the softirq gets handled, at the end of
@@ -588,6 +591,37 @@ __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_
return expires_next;
}
+static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
+{
+ ktime_t expires_next, soft = KTIME_MAX;
+
+ /*
+ * If the soft interrupt has already been activated, ignore the
+ * soft bases. They will be handled in the already raised soft
+ * interrupt.
+ */
+ if (!cpu_base->softirq_activated) {
+ soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
+ /*
+ * Update the soft expiry time. clock_settime() might have
+ * affected it.
+ */
+ cpu_base->softirq_expires_next = soft;
+ }
+
+ expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
+ /*
+ * If a softirq timer is expiring first, update cpu_base->next_timer
+ * and program the hardware with the soft expiry time.
+ */
+ if (expires_next > soft) {
+ cpu_base->next_timer = cpu_base->softirq_next_timer;
+ expires_next = soft;
+ }
+
+ return expires_next;
+}
+
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
@@ -628,23 +662,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
ktime_t expires_next;
- /*
- * Find the current next expiration time.
- */
- expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
-
- if (cpu_base->next_timer && cpu_base->next_timer->is_soft) {
- /*
- * When the softirq is activated, hrtimer has to be
- * programmed with the first hard hrtimer because soft
- * timer interrupt could occur too late.
- */
- if (cpu_base->softirq_activated)
- expires_next = __hrtimer_get_next_event(cpu_base,
- HRTIMER_ACTIVE_HARD);
- else
- cpu_base->softirq_expires_next = expires_next;
- }
+ expires_next = hrtimer_update_next_event(cpu_base);
if (skip_equal && expires_next == cpu_base->expires_next)
return;
@@ -1644,8 +1662,8 @@ retry:
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
- /* Reevaluate the clock bases for the next expiry */
- expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
+ /* Reevaluate the clock bases for the [soft] next expiry */
+ expires_next = hrtimer_update_next_event(cpu_base);
/*
* Store the new expiry value so the migration code can verify
* against it.
@@ -1939,9 +1957,9 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
}
restart = &current->restart_block;
- restart->fn = hrtimer_nanosleep_restart;
restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+ set_restart_fn(restart, hrtimer_nanosleep_restart);
out:
destroy_hrtimer_on_stack(&t.timer);
return ret;
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index a71758e34e45..9abe15255bc4 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1480,8 +1480,8 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
if (flags & TIMER_ABSTIME)
return -ERESTARTNOHAND;
- restart_block->fn = posix_cpu_nsleep_restart;
restart_block->nanosleep.clockid = which_clock;
+ set_restart_fn(restart_block, posix_cpu_nsleep_restart);
}
return error;
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4d8e35575549..b7e29db127fa 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -5045,6 +5045,20 @@ struct ftrace_direct_func *ftrace_find_direct_func(unsigned long addr)
return NULL;
}
+static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr)
+{
+ struct ftrace_direct_func *direct;
+
+ direct = kmalloc(sizeof(*direct), GFP_KERNEL);
+ if (!direct)
+ return NULL;
+ direct->addr = addr;
+ direct->count = 0;
+ list_add_rcu(&direct->next, &ftrace_direct_funcs);
+ ftrace_direct_func_count++;
+ return direct;
+}
+
/**
* register_ftrace_direct - Call a custom trampoline directly
* @ip: The address of the nop at the beginning of a function
@@ -5120,15 +5134,11 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
direct = ftrace_find_direct_func(addr);
if (!direct) {
- direct = kmalloc(sizeof(*direct), GFP_KERNEL);
+ direct = ftrace_alloc_direct_func(addr);
if (!direct) {
kfree(entry);
goto out_unlock;
}
- direct->addr = addr;
- direct->count = 0;
- list_add_rcu(&direct->next, &ftrace_direct_funcs);
- ftrace_direct_func_count++;
}
entry->ip = ip;
@@ -5329,6 +5339,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry,
int modify_ftrace_direct(unsigned long ip,
unsigned long old_addr, unsigned long new_addr)
{
+ struct ftrace_direct_func *direct, *new_direct = NULL;
struct ftrace_func_entry *entry;
struct dyn_ftrace *rec;
int ret = -ENODEV;
@@ -5344,6 +5355,20 @@ int modify_ftrace_direct(unsigned long ip,
if (entry->direct != old_addr)
goto out_unlock;
+ direct = ftrace_find_direct_func(old_addr);
+ if (WARN_ON(!direct))
+ goto out_unlock;
+ if (direct->count > 1) {
+ ret = -ENOMEM;
+ new_direct = ftrace_alloc_direct_func(new_addr);
+ if (!new_direct)
+ goto out_unlock;
+ direct->count--;
+ new_direct->count++;
+ } else {
+ direct->addr = new_addr;
+ }
+
/*
* If there's no other ftrace callback on the rec->ip location,
* then it can be changed directly by the architecture.
@@ -5357,6 +5382,14 @@ int modify_ftrace_direct(unsigned long ip,
ret = 0;
}
+ if (unlikely(ret && new_direct)) {
+ direct->count++;
+ list_del_rcu(&new_direct->next);
+ synchronize_rcu_tasks();
+ kfree(new_direct);
+ ftrace_direct_func_count--;
+ }
+
out_unlock:
mutex_unlock(&ftrace_lock);
mutex_unlock(&direct_mutex);
diff --git a/kernel/usermode_driver.c b/kernel/usermode_driver.c
index 0b35212ffc3d..bb7bb3b478ab 100644
--- a/kernel/usermode_driver.c
+++ b/kernel/usermode_driver.c
@@ -139,13 +139,22 @@ static void umd_cleanup(struct subprocess_info *info)
struct umd_info *umd_info = info->data;
/* cleanup if umh_setup() was successful but exec failed */
- if (info->retval) {
- fput(umd_info->pipe_to_umh);
- fput(umd_info->pipe_from_umh);
- put_pid(umd_info->tgid);
- umd_info->tgid = NULL;
- }
+ if (info->retval)
+ umd_cleanup_helper(umd_info);
+}
+
+/**
+ * umd_cleanup_helper - release the resources which were allocated in umd_setup
+ * @info: information about usermode driver
+ */
+void umd_cleanup_helper(struct umd_info *info)
+{
+ fput(info->pipe_to_umh);
+ fput(info->pipe_from_umh);
+ put_pid(info->tgid);
+ info->tgid = NULL;
}
+EXPORT_SYMBOL_GPL(umd_cleanup_helper);
/**
* fork_usermode_driver - fork a usermode driver