summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/audit_watch.c12
-rw-r--r--kernel/auditsc.c12
-rw-r--r--kernel/bpf/helpers.c3
-rw-r--r--kernel/bpf/inode.c8
-rw-r--r--kernel/bpf/verifier.c4
-rw-r--r--kernel/cfi.c4
-rw-r--r--kernel/cgroup/cgroup-internal.h1
-rw-r--r--kernel/cgroup/cgroup-v1.c7
-rw-r--r--kernel/cgroup/cgroup.c6
-rw-r--r--kernel/cgroup/cpuset-v1.c49
-rw-r--r--kernel/cgroup/cpuset.c79
-rw-r--r--kernel/cgroup/legacy_freezer.c6
-rw-r--r--kernel/cgroup/misc.c16
-rw-r--r--kernel/cgroup/rstat.c116
-rw-r--r--kernel/configs/hardening.config2
-rw-r--r--kernel/context_tracking.c9
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/crash_core.c2
-rw-r--r--kernel/dma/direct.c28
-rw-r--r--kernel/entry/Makefile3
-rw-r--r--kernel/entry/common.c2
-rw-r--r--kernel/events/callchain.c38
-rw-r--r--kernel/events/core.c1083
-rw-r--r--kernel/events/hw_breakpoint.c5
-rw-r--r--kernel/events/ring_buffer.c3
-rw-r--r--kernel/events/uprobes.c12
-rw-r--r--kernel/exit.c64
-rw-r--r--kernel/fork.c25
-rw-r--r--kernel/futex/core.c21
-rw-r--r--kernel/iomem.c5
-rw-r--r--kernel/irq/chip.c30
-rw-r--r--kernel/irq/internals.h11
-rw-r--r--kernel/irq/irqdesc.c2
-rw-r--r--kernel/irq/irqdomain.c5
-rw-r--r--kernel/irq/manage.c7
-rw-r--r--kernel/irq/migration.c20
-rw-r--r--kernel/irq/msi.c182
-rw-r--r--kernel/kallsyms.c12
-rw-r--r--kernel/kcmp.c2
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/lock_events_list.h28
-rw-r--r--kernel/locking/lockdep.c17
-rw-r--r--kernel/locking/mutex.c2
-rw-r--r--kernel/locking/rtmutex.c29
-rw-r--r--kernel/module/main.c81
-rw-r--r--kernel/module/strict_rwx.c9
-rw-r--r--kernel/padata.c2
-rw-r--r--kernel/pid.c106
-rw-r--r--kernel/power/Kconfig3
-rw-r--r--kernel/power/energy_model.c67
-rw-r--r--kernel/power/hibernate.c6
-rw-r--r--kernel/power/snapshot.c16
-rw-r--r--kernel/power/suspend.c14
-rw-r--r--kernel/printk/printk.c4
-rw-r--r--kernel/rcu/Kconfig35
-rw-r--r--kernel/rcu/Kconfig.debug18
-rw-r--r--kernel/rcu/rcu.h13
-rw-r--r--kernel/rcu/rcutorture.c124
-rw-r--r--kernel/rcu/refscale.c32
-rw-r--r--kernel/rcu/srcutiny.c20
-rw-r--r--kernel/rcu/srcutree.c207
-rw-r--r--kernel/rcu/tasks.h5
-rw-r--r--kernel/rcu/tiny.c29
-rw-r--r--kernel/rcu/tree.c73
-rw-r--r--kernel/rcu/tree_exp.h6
-rw-r--r--kernel/rcu/tree_nocb.h20
-rw-r--r--kernel/rcu/tree_plugin.h22
-rw-r--r--kernel/reboot.c1
-rw-r--r--kernel/rseq.c140
-rw-r--r--kernel/sched/Makefile5
-rw-r--r--kernel/sched/build_policy.c1
-rw-r--r--kernel/sched/build_utility.c4
-rw-r--r--kernel/sched/core.c149
-rw-r--r--kernel/sched/core_sched.c2
-rw-r--r--kernel/sched/deadline.c57
-rw-r--r--kernel/sched/debug.c18
-rw-r--r--kernel/sched/ext.c1087
-rw-r--r--kernel/sched/ext.h10
-rw-r--r--kernel/sched/ext_idle.c1171
-rw-r--r--kernel/sched/ext_idle.h35
-rw-r--r--kernel/sched/fair.c139
-rw-r--r--kernel/sched/rt.c14
-rw-r--r--kernel/sched/sched.h128
-rw-r--r--kernel/sched/stats.h2
-rw-r--r--kernel/sched/syscalls.c12
-rw-r--r--kernel/sched/topology.c45
-rw-r--r--kernel/seccomp.c49
-rw-r--r--kernel/signal.c110
-rw-r--r--kernel/stop_machine.c1
-rw-r--r--kernel/sys.c19
-rw-r--r--kernel/sysctl.c120
-rw-r--r--kernel/time/Makefile6
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/hrtimer.c34
-rw-r--r--kernel/time/namespace.c24
-rw-r--r--kernel/time/ntp.c3
-rw-r--r--kernel/time/posix-clock.c24
-rw-r--r--kernel/time/posix-timers.c558
-rw-r--r--kernel/time/sched_clock.c3
-rw-r--r--kernel/time/tick-broadcast-hrtimer.c3
-rw-r--r--kernel/time/tick-sched.c6
-rw-r--r--kernel/time/timekeeping.c94
-rw-r--r--kernel/time/timer_list.c4
-rw-r--r--kernel/time/vsyscall.c66
-rw-r--r--kernel/torture.c12
-rw-r--r--kernel/trace/bpf_trace.c21
-rw-r--r--kernel/trace/trace.c7
-rw-r--r--kernel/trace/trace_event_perf.c4
-rw-r--r--kernel/trace/trace_osnoise.c38
-rw-r--r--kernel/user_namespace.c26
-rw-r--r--kernel/watch_queue.c16
-rw-r--r--kernel/watchdog.c28
-rw-r--r--kernel/watchdog_perf.c29
114 files changed, 4247 insertions, 2978 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 87866b037fbe..434929de17ef 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,11 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif
+# Branch profiling isn't noinstr-safe
+ifdef CONFIG_TRACE_BRANCH_PROFILING
+CFLAGS_context_tracking.o += -DDISABLE_BRANCH_PROFILING
+endif
+
# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
# in coverage traces.
KCOV_INSTRUMENT_softirq.o := n
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 7f358740e958..367eaf2c78b7 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -350,11 +350,10 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
struct dentry *d = kern_path_locked(watch->path, parent);
if (IS_ERR(d))
return PTR_ERR(d);
- if (d_is_positive(d)) {
- /* update watch filter fields */
- watch->dev = d->d_sb->s_dev;
- watch->ino = d_backing_inode(d)->i_ino;
- }
+ /* update watch filter fields */
+ watch->dev = d->d_sb->s_dev;
+ watch->ino = d_backing_inode(d)->i_ino;
+
inode_unlock(d_backing_inode(parent->dentry));
dput(d);
return 0;
@@ -419,10 +418,11 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
/* caller expects mutex locked */
mutex_lock(&audit_filter_mutex);
- if (ret) {
+ if (ret && ret != -ENOENT) {
audit_put_watch(watch);
return ret;
}
+ ret = 0;
/* either find an old parent or attach a new one */
parent = audit_find_parent(d_backing_inode(parent_path.dentry));
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9c853cde9abe..78fd876a5473 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2207,10 +2207,8 @@ __audit_reusename(const __user char *uptr)
list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
- if (n->name->uptr == uptr) {
- atomic_inc(&n->name->refcnt);
- return n->name;
- }
+ if (n->name->uptr == uptr)
+ return refname(n->name);
}
return NULL;
}
@@ -2237,7 +2235,7 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
- atomic_inc(&name->refcnt);
+ refname(name);
}
static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2369,7 +2367,7 @@ out_alloc:
return;
if (name) {
n->name = name;
- atomic_inc(&name->refcnt);
+ refname(name);
}
out:
@@ -2496,7 +2494,7 @@ void __audit_inode_child(struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- atomic_inc(&found_child->name->refcnt);
+ refname(found_child->name);
}
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index f27ce162427a..672abe111282 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1284,8 +1284,7 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
atomic_set(&t->cancelling, 0);
INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
- hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
- t->timer.function = bpf_timer_cb;
+ hrtimer_setup(&t->timer, bpf_timer_cb, clockid, HRTIMER_MODE_REL_SOFT);
cb->value = (void *)async - map->record->timer_off;
break;
case BPF_ASYNC_TYPE_WQ:
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9aaf5124648b..dc3aa91a6ba0 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -150,14 +150,14 @@ static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
-static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
if (IS_ERR(inode))
- return PTR_ERR(inode);
+ return ERR_CAST(inode);
inode->i_op = &bpf_dir_iops;
inode->i_fop = &simple_dir_operations;
@@ -166,7 +166,7 @@ static int bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
inc_nlink(dir);
bpf_dentry_finalize(dentry, inode, dir);
- return 0;
+ return NULL;
}
struct map_iter {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 60611df77957..6e604caa870c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21702,12 +21702,12 @@ patch_map_ops_generic:
if (insn->imm == BPF_FUNC_get_smp_processor_id &&
verifier_inlines_helper_call(env, insn->imm)) {
/* BPF_FUNC_get_smp_processor_id inlining is an
- * optimization, so if pcpu_hot.cpu_number is ever
+ * optimization, so if cpu_number is ever
* changed in some incompatible and hard to support
* way, it's fine to back out this inlining logic
*/
#ifdef CONFIG_SMP
- insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number);
+ insn_buf[0] = BPF_MOV64_IMM(BPF_REG_0, (u32)(unsigned long)&cpu_number);
insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
cnt = 3;
diff --git a/kernel/cfi.c b/kernel/cfi.c
index 08caad776717..19be79639542 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -7,6 +7,8 @@
#include <linux/cfi.h>
+bool cfi_warn __ro_after_init = IS_ENABLED(CONFIG_CFI_PERMISSIVE);
+
enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
unsigned long *target, u32 type)
{
@@ -17,7 +19,7 @@ enum bug_trap_type report_cfi_failure(struct pt_regs *regs, unsigned long addr,
pr_err("CFI failure at %pS (no target information)\n",
(void *)addr);
- if (IS_ENABLED(CONFIG_CFI_PERMISSIVE)) {
+ if (cfi_warn) {
__warn(NULL, 0, (void *)addr, 0, regs, NULL);
return BUG_TRAP_TYPE_WARN;
}
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index c964dd7ff967..95ab39e1ec8f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -168,6 +168,7 @@ struct cgroup_mgctx {
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
+extern bool cgrp_dfl_visible;
/* iterate across the hierarchies */
#define for_each_root(root) \
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index e28d5f0d20ed..11ea8d24ac72 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -673,6 +673,7 @@ struct cftype cgroup1_base_files[] = {
int proc_cgroupstats_show(struct seq_file *m, void *v)
{
struct cgroup_subsys *ss;
+ bool cgrp_v1_visible = false;
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -684,12 +685,18 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
for_each_subsys(ss, i) {
if (cgroup1_subsys_absent(ss))
continue;
+ cgrp_v1_visible |= ss->root != &cgrp_dfl_root;
+
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
}
+ if (cgrp_dfl_visible && !cgrp_v1_visible)
+ pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n");
+
+
return 0;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index afc665b7b1fe..f231fe3a0744 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
* The default hierarchy always exists but is hidden until mounted for the
* first time. This is for backward compatibility.
*/
-static bool cgrp_dfl_visible;
+bool cgrp_dfl_visible;
/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;
@@ -4447,7 +4447,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
-static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
@@ -5831,7 +5831,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
}
/*
- * This extra ref will be put in cgroup_free_fn() and guarantees
+ * This extra ref will be put in css_free_rwork_fn() and guarantees
* that @cgrp->kn is always accessible.
*/
kernfs_get(cgrp->kn);
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 25c1d7b77e2f..b69a7db67090 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include "cgroup-internal.h"
#include "cpuset-internal.h"
/*
@@ -175,6 +176,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = update_relax_domain_level(cs, val);
break;
default:
@@ -373,6 +375,46 @@ out:
return ret;
}
+#ifdef CONFIG_PROC_PID_CPUSET
+/*
+ * proc_cpuset_show()
+ * - Print tasks cpuset path into seq_file.
+ * - Used for /proc/<pid>/cpuset.
+ */
+int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ char *buf;
+ struct cgroup_subsys_state *css;
+ int retval;
+
+ retval = -ENOMEM;
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ rcu_read_lock();
+ spin_lock_irq(&css_set_lock);
+ css = task_css(tsk, cpuset_cgrp_id);
+ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ spin_unlock_irq(&css_set_lock);
+ rcu_read_unlock();
+
+ if (retval == -E2BIG)
+ retval = -ENAMETOOLONG;
+ if (retval < 0)
+ goto out_free;
+ seq_puts(m, buf);
+ seq_putc(m, '\n');
+ retval = 0;
+out_free:
+ kfree(buf);
+out:
+ return retval;
+}
+#endif /* CONFIG_PROC_PID_CPUSET */
+
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct cpuset *cs = css_cs(css);
@@ -424,24 +466,31 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
break;
case FILE_MEM_EXCLUSIVE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
break;
case FILE_MEM_HARDWALL:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
break;
case FILE_SCHED_LOAD_BALANCE:
+ pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name);
retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
break;
case FILE_MEMORY_MIGRATE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
break;
case FILE_MEMORY_PRESSURE_ENABLED:
+ pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name);
cpuset_memory_pressure_enabled = !!val;
break;
case FILE_SPREAD_PAGE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
break;
case FILE_SPREAD_SLAB:
+ pr_warn_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
break;
default:
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f910c828973..39c1fc643d77 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -21,7 +21,6 @@
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
-#include "cgroup-internal.h"
#include "cpuset-internal.h"
#include <linux/init.h>
@@ -954,10 +953,12 @@ static void dl_update_tasks_root_domain(struct cpuset *cs)
css_task_iter_end(&it);
}
-static void dl_rebuild_rd_accounting(void)
+void dl_rebuild_rd_accounting(void)
{
struct cpuset *cs = NULL;
struct cgroup_subsys_state *pos_css;
+ int cpu;
+ u64 cookie = ++dl_cookie;
lockdep_assert_held(&cpuset_mutex);
lockdep_assert_cpus_held();
@@ -965,11 +966,12 @@ static void dl_rebuild_rd_accounting(void)
rcu_read_lock();
- /*
- * Clear default root domain DL accounting, it will be computed again
- * if a task belongs to it.
- */
- dl_clear_root_domain(&def_root_domain);
+ for_each_possible_cpu(cpu) {
+ if (dl_bw_visited(cpu, cookie))
+ continue;
+
+ dl_clear_root_domain_cpu(cpu);
+ }
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
@@ -990,16 +992,6 @@ static void dl_rebuild_rd_accounting(void)
rcu_read_unlock();
}
-static void
-partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
-{
- mutex_lock(&sched_domains_mutex);
- partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
- dl_rebuild_rd_accounting();
- mutex_unlock(&sched_domains_mutex);
-}
-
/*
* Rebuild scheduler domains.
*
@@ -1061,7 +1053,7 @@ void rebuild_sched_domains_locked(void)
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
- partition_and_rebuild_sched_domains(ndoms, doms, attr);
+ partition_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
void rebuild_sched_domains_locked(void)
@@ -1083,6 +1075,13 @@ void rebuild_sched_domains(void)
cpus_read_unlock();
}
+void cpuset_reset_sched_domains(void)
+{
+ mutex_lock(&cpuset_mutex);
+ partition_sched_domains(1, NULL, NULL);
+ mutex_unlock(&cpuset_mutex);
+}
+
/**
* cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -4244,50 +4243,6 @@ void cpuset_print_current_mems_allowed(void)
rcu_read_unlock();
}
-#ifdef CONFIG_PROC_PID_CPUSET
-/*
- * proc_cpuset_show()
- * - Print tasks cpuset path into seq_file.
- * - Used for /proc/<pid>/cpuset.
- * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
- * doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_mutex, keeping cpuset_attach() from changing it
- * anyway.
- */
-int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *tsk)
-{
- char *buf;
- struct cgroup_subsys_state *css;
- int retval;
-
- retval = -ENOMEM;
- buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!buf)
- goto out;
-
- rcu_read_lock();
- spin_lock_irq(&css_set_lock);
- css = task_css(tsk, cpuset_cgrp_id);
- retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
- current->nsproxy->cgroup_ns);
- spin_unlock_irq(&css_set_lock);
- rcu_read_unlock();
-
- if (retval == -E2BIG)
- retval = -ENAMETOOLONG;
- if (retval < 0)
- goto out_free;
- seq_puts(m, buf);
- seq_putc(m, '\n');
- retval = 0;
-out_free:
- kfree(buf);
-out:
- return retval;
-}
-#endif /* CONFIG_PROC_PID_CPUSET */
-
/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 074653f964c1..039d1eb2f215 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -430,9 +430,11 @@ static ssize_t freezer_write(struct kernfs_open_file *of,
if (strcmp(buf, freezer_state_strs(0)) == 0)
freeze = false;
- else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) {
+ pr_info_once("Freezing with imperfect legacy cgroup freezer. "
+ "See cgroup.freeze of cgroup v2\n");
freeze = true;
- else
+ } else
return -EINVAL;
freezer_change_state(css_freezer(of_css(of)), freeze);
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 0e26068995a6..2fa3a4fb2aaf 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -68,22 +68,6 @@ static inline bool valid_type(enum misc_res_type type)
}
/**
- * misc_cg_res_total_usage() - Get the current total usage of the resource.
- * @type: misc res type.
- *
- * Context: Any context.
- * Return: Current total usage of the resource.
- */
-u64 misc_cg_res_total_usage(enum misc_res_type type)
-{
- if (valid_type(type))
- return atomic64_read(&root_cg.res[type].usage);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
-
-/**
* misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
* @type: Type of the misc res.
* @capacity: Supported capacity of the misc res on the host.
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index aac91466279f..4bb587d5d34f 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -299,40 +299,6 @@ static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
spin_unlock_irq(&cgroup_rstat_lock);
}
-/* see cgroup_rstat_flush() */
-static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
- __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
-{
- int cpu;
-
- lockdep_assert_held(&cgroup_rstat_lock);
-
- for_each_possible_cpu(cpu) {
- struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
-
- for (; pos; pos = pos->rstat_flush_next) {
- struct cgroup_subsys_state *css;
-
- cgroup_base_stat_flush(pos, cpu);
- bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
-
- rcu_read_lock();
- list_for_each_entry_rcu(css, &pos->rstat_css_list,
- rstat_css_node)
- css->ss->css_rstat_flush(css, cpu);
- rcu_read_unlock();
- }
-
- /* play nice and yield if necessary */
- if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
- __cgroup_rstat_unlock(cgrp, cpu);
- if (!cond_resched())
- cpu_relax();
- __cgroup_rstat_lock(cgrp, cpu);
- }
- }
-}
-
/**
* cgroup_rstat_flush - flush stats in @cgrp's subtree
* @cgrp: target cgroup
@@ -348,38 +314,30 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
*/
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
+ int cpu;
+
might_sleep();
+ for_each_possible_cpu(cpu) {
+ struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
- __cgroup_rstat_lock(cgrp, -1);
- cgroup_rstat_flush_locked(cgrp);
- __cgroup_rstat_unlock(cgrp, -1);
-}
+ /* Reacquire for each CPU to avoid disabling IRQs too long */
+ __cgroup_rstat_lock(cgrp, cpu);
+ for (; pos; pos = pos->rstat_flush_next) {
+ struct cgroup_subsys_state *css;
-/**
- * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
- * @cgrp: target cgroup
- *
- * Flush stats in @cgrp's subtree and prevent further flushes. Must be
- * paired with cgroup_rstat_flush_release().
- *
- * This function may block.
- */
-void cgroup_rstat_flush_hold(struct cgroup *cgrp)
- __acquires(&cgroup_rstat_lock)
-{
- might_sleep();
- __cgroup_rstat_lock(cgrp, -1);
- cgroup_rstat_flush_locked(cgrp);
-}
+ cgroup_base_stat_flush(pos, cpu);
+ bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
-/**
- * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
- * @cgrp: cgroup used by tracepoint
- */
-void cgroup_rstat_flush_release(struct cgroup *cgrp)
- __releases(&cgroup_rstat_lock)
-{
- __cgroup_rstat_unlock(cgrp, -1);
+ rcu_read_lock();
+ list_for_each_entry_rcu(css, &pos->rstat_css_list,
+ rstat_css_node)
+ css->ss->css_rstat_flush(css, cpu);
+ rcu_read_unlock();
+ }
+ __cgroup_rstat_unlock(cgrp, cpu);
+ if (!cond_resched())
+ cpu_relax();
+ }
}
int cgroup_rstat_init(struct cgroup *cgrp)
@@ -612,36 +570,34 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- u64 usage, utime, stime, ntime;
+ struct cgroup_base_stat bstat;
if (cgroup_parent(cgrp)) {
- cgroup_rstat_flush_hold(cgrp);
- usage = cgrp->bstat.cputime.sum_exec_runtime;
+ cgroup_rstat_flush(cgrp);
+ __cgroup_rstat_lock(cgrp, -1);
+ bstat = cgrp->bstat;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
- &utime, &stime);
- ntime = cgrp->bstat.ntime;
- cgroup_rstat_flush_release(cgrp);
+ &bstat.cputime.utime, &bstat.cputime.stime);
+ __cgroup_rstat_unlock(cgrp, -1);
} else {
- /* cgrp->bstat of root is not actually used, reuse it */
- root_cgroup_cputime(&cgrp->bstat);
- usage = cgrp->bstat.cputime.sum_exec_runtime;
- utime = cgrp->bstat.cputime.utime;
- stime = cgrp->bstat.cputime.stime;
- ntime = cgrp->bstat.ntime;
+ root_cgroup_cputime(&bstat);
}
- do_div(usage, NSEC_PER_USEC);
- do_div(utime, NSEC_PER_USEC);
- do_div(stime, NSEC_PER_USEC);
- do_div(ntime, NSEC_PER_USEC);
+ do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
+ do_div(bstat.cputime.utime, NSEC_PER_USEC);
+ do_div(bstat.cputime.stime, NSEC_PER_USEC);
+ do_div(bstat.ntime, NSEC_PER_USEC);
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n"
"nice_usec %llu\n",
- usage, utime, stime, ntime);
+ bstat.cputime.sum_exec_runtime,
+ bstat.cputime.utime,
+ bstat.cputime.stime,
+ bstat.ntime);
- cgroup_force_idle_show(seq, &cgrp->bstat);
+ cgroup_force_idle_show(seq, &bstat);
}
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
diff --git a/kernel/configs/hardening.config b/kernel/configs/hardening.config
index 3fabb8f55ef6..dd7c32fb5ac1 100644
--- a/kernel/configs/hardening.config
+++ b/kernel/configs/hardening.config
@@ -46,7 +46,7 @@ CONFIG_UBSAN_BOUNDS=y
# CONFIG_UBSAN_SHIFT is not set
# CONFIG_UBSAN_DIV_ZERO is not set
# CONFIG_UBSAN_UNREACHABLE is not set
-# CONFIG_UBSAN_SIGNED_WRAP is not set
+# CONFIG_UBSAN_INTEGER_WRAP is not set
# CONFIG_UBSAN_BOOL is not set
# CONFIG_UBSAN_ENUM is not set
# CONFIG_UBSAN_ALIGNMENT is not set
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 938c48952d26..fb5be6e9b423 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -80,17 +80,16 @@ static __always_inline void rcu_task_trace_heavyweight_exit(void)
*/
static noinstr void ct_kernel_exit_state(int offset)
{
- int seq;
-
/*
* CPUs seeing atomic_add_return() must see prior RCU read-side
* critical sections, and we also must force ordering with the
* next idle sojourn.
*/
rcu_task_trace_heavyweight_enter(); // Before CT state update!
- seq = ct_state_inc(offset);
- // RCU is no longer watching. Better be in extended quiescent state!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & CT_RCU_WATCHING));
+ // RCU is still watching. Better not be in extended quiescent state!
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !rcu_is_watching_curr_cpu());
+ (void)ct_state_inc(offset);
+ // RCU is no longer watching.
}
/*
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 07455d25329c..b08bb34b1718 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -526,6 +526,7 @@ void lockdep_assert_cpus_held(void)
percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
+EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held);
#ifdef CONFIG_LOCKDEP
int lockdep_is_cpus_held(void)
@@ -1453,11 +1454,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
out:
cpus_write_unlock();
- /*
- * Do post unplug cleanup. This is still protected against
- * concurrent CPU hotplug via cpu_add_remove_lock.
- */
- lockup_detector_cleanup();
arch_smt_update();
return ret;
}
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 078fe5bc5a74..335b8425dd4b 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -436,7 +436,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
memset(&prstatus, 0, sizeof(prstatus));
prstatus.common.pr_pid = current->pid;
elf_core_copy_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ buf = append_elf_note(buf, NN_PRSTATUS, NT_PRSTATUS,
&prstatus, sizeof(prstatus));
final_note(buf);
}
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 5b4e6d3bf7bc..b8fe0b3d0ffb 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -584,6 +584,22 @@ int dma_direct_supported(struct device *dev, u64 mask)
return mask >= phys_to_dma_unencrypted(dev, min_mask);
}
+static const struct bus_dma_region *dma_find_range(struct device *dev,
+ unsigned long start_pfn)
+{
+ const struct bus_dma_region *m;
+
+ for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) {
+ unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start);
+
+ if (start_pfn >= cpu_start_pfn &&
+ start_pfn - cpu_start_pfn < PFN_DOWN(m->size))
+ return m;
+ }
+
+ return NULL;
+}
+
/*
* To check whether all ram resource ranges are covered by dma range map
* Returns 0 when further check is needed
@@ -593,20 +609,12 @@ static int check_ram_in_range_map(unsigned long start_pfn,
unsigned long nr_pages, void *data)
{
unsigned long end_pfn = start_pfn + nr_pages;
- const struct bus_dma_region *bdr = NULL;
- const struct bus_dma_region *m;
struct device *dev = data;
while (start_pfn < end_pfn) {
- for (m = dev->dma_range_map; PFN_DOWN(m->size); m++) {
- unsigned long cpu_start_pfn = PFN_DOWN(m->cpu_start);
+ const struct bus_dma_region *bdr;
- if (start_pfn >= cpu_start_pfn &&
- start_pfn - cpu_start_pfn < PFN_DOWN(m->size)) {
- bdr = m;
- break;
- }
- }
+ bdr = dma_find_range(dev, start_pfn);
if (!bdr)
return 1;
diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile
index 095c775e001e..d4b8bd0af79b 100644
--- a/kernel/entry/Makefile
+++ b/kernel/entry/Makefile
@@ -6,6 +6,9 @@ KASAN_SANITIZE := n
UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
+# Branch profiling isn't noinstr-safe
+ccflags-$(CONFIG_TRACE_BRANCH_PROFILING) += -DDISABLE_BRANCH_PROFILING
+
CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong
CFLAGS_common.o += -fno-stack-protector
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index e33691d5adf7..20154572ede9 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -49,7 +49,7 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Do seccomp after ptrace, to catch any tracer changes. */
if (work & SYSCALL_WORK_SECCOMP) {
- ret = __secure_computing(NULL);
+ ret = __secure_computing();
if (ret == -1L)
return ret;
}
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 8a47e52a454f..6c83ad674d01 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -22,6 +22,7 @@ struct callchain_cpus_entries {
int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH;
int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK;
+static const int six_hundred_forty_kb = 640 * 1024;
static inline size_t perf_callchain_entry__sizeof(void)
{
@@ -266,12 +267,8 @@ exit_put:
return entry;
}
-/*
- * Used for sysctl_perf_event_max_stack and
- * sysctl_perf_event_max_contexts_per_stack.
- */
-int perf_event_max_stack_handler(const struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int perf_event_max_stack_handler(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *value = table->data;
int new_value = *value, ret;
@@ -292,3 +289,32 @@ int perf_event_max_stack_handler(const struct ctl_table *table, int write,
return ret;
}
+
+static const struct ctl_table callchain_sysctl_table[] = {
+ {
+ .procname = "perf_event_max_stack",
+ .data = &sysctl_perf_event_max_stack,
+ .maxlen = sizeof(sysctl_perf_event_max_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = (void *)&six_hundred_forty_kb,
+ },
+ {
+ .procname = "perf_event_max_contexts_per_stack",
+ .data = &sysctl_perf_event_max_contexts_per_stack,
+ .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack),
+ .mode = 0644,
+ .proc_handler = perf_event_max_stack_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_THOUSAND,
+ },
+};
+
+static int __init init_callchain_sysctls(void)
+{
+ register_sysctl_init("kernel", callchain_sysctl_table);
+ return 0;
+}
+core_initcall(init_callchain_sysctls);
+
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 823aa0824916..0bb21659e252 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -55,6 +55,7 @@
#include <linux/pgtable.h>
#include <linux/buildid.h>
#include <linux/task_work.h>
+#include <linux/percpu-rwsem.h>
#include "internal.h"
@@ -452,8 +453,8 @@ static struct kmem_cache *perf_event_cache;
*/
int sysctl_perf_event_paranoid __read_mostly = 2;
-/* Minimum for 512 kiB + 1 user control page */
-int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
+/* Minimum for 512 kiB + 1 user control page. 'free' kiB per user. */
+static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);
/*
* max perf event sample rate
@@ -463,6 +464,7 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
#define DEFAULT_CPU_TIME_MAX_PERCENT 25
int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
@@ -484,7 +486,7 @@ static void update_perf_cpu_limits(void)
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);
-int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
+static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -506,9 +508,7 @@ int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
return 0;
}
-int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
-
-int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
+static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -528,6 +528,52 @@ int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
return 0;
}
+static const struct ctl_table events_core_sysctl_table[] = {
+ /*
+ * User-space relies on this file as a feature check for
+ * perf_events being enabled. It's an ABI, do not remove!
+ */
+ {
+ .procname = "perf_event_paranoid",
+ .data = &sysctl_perf_event_paranoid,
+ .maxlen = sizeof(sysctl_perf_event_paranoid),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "perf_event_mlock_kb",
+ .data = &sysctl_perf_event_mlock,
+ .maxlen = sizeof(sysctl_perf_event_mlock),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "perf_event_max_sample_rate",
+ .data = &sysctl_perf_event_sample_rate,
+ .maxlen = sizeof(sysctl_perf_event_sample_rate),
+ .mode = 0644,
+ .proc_handler = perf_event_max_sample_rate_handler,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "perf_cpu_time_max_percent",
+ .data = &sysctl_perf_cpu_time_max_percent,
+ .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
+ .mode = 0644,
+ .proc_handler = perf_cpu_time_max_percent_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_HUNDRED,
+ },
+};
+
+static int __init init_events_core_sysctls(void)
+{
+ register_sysctl_init("kernel", events_core_sysctl_table);
+ return 0;
+}
+core_initcall(init_events_core_sysctls);
+
+
/*
* perf samples are done in some very critical code paths (NMIs).
* If they take too much CPU time, the system can lock up and not
@@ -1147,8 +1193,8 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
raw_spin_lock_init(&cpc->hrtimer_lock);
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- timer->function = perf_mux_hrtimer_handler;
+ hrtimer_setup(timer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED_HARD);
}
static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
@@ -1172,42 +1218,40 @@ static int perf_mux_hrtimer_restart_ipi(void *arg)
return perf_mux_hrtimer_restart(arg);
}
+static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu)
+{
+ return *this_cpu_ptr(pmu->cpu_pmu_context);
+}
+
void perf_pmu_disable(struct pmu *pmu)
{
- int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
if (!(*count)++)
pmu->pmu_disable(pmu);
}
void perf_pmu_enable(struct pmu *pmu)
{
- int *count = this_cpu_ptr(pmu->pmu_disable_count);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
if (!--(*count))
pmu->pmu_enable(pmu);
}
static void perf_assert_pmu_disabled(struct pmu *pmu)
{
- WARN_ON_ONCE(*this_cpu_ptr(pmu->pmu_disable_count) == 0);
-}
-
-static void get_ctx(struct perf_event_context *ctx)
-{
- refcount_inc(&ctx->refcount);
+ int *count = &this_cpc(pmu)->pmu_disable_count;
+ WARN_ON_ONCE(*count == 0);
}
-static void *alloc_task_ctx_data(struct pmu *pmu)
+static inline void perf_pmu_read(struct perf_event *event)
{
- if (pmu->task_ctx_cache)
- return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
-
- return NULL;
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
}
-static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
+static void get_ctx(struct perf_event_context *ctx)
{
- if (pmu->task_ctx_cache && task_ctx_data)
- kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
+ refcount_inc(&ctx->refcount);
}
static void free_ctx(struct rcu_head *head)
@@ -2303,7 +2347,7 @@ static void
event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_event_pmu_context *epc = event->pmu_ctx;
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
// XXX cpc serialization, probably per-cpu IRQ disabled
@@ -2444,9 +2488,8 @@ __perf_remove_from_context(struct perf_event *event,
pmu_ctx->rotate_necessary = 0;
if (ctx->task && ctx->is_active) {
- struct perf_cpu_pmu_context *cpc;
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu);
- cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
cpc->task_epc = NULL;
}
@@ -2584,7 +2627,7 @@ static int
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
struct perf_event_pmu_context *epc = event->pmu_ctx;
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
int ret = 0;
WARN_ON_ONCE(event->ctx != ctx);
@@ -2691,7 +2734,7 @@ error:
static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
struct perf_event_pmu_context *epc = event->pmu_ctx;
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(epc->pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
/*
* Groups consisting entirely of software events can always go on.
@@ -3314,9 +3357,8 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
struct pmu *pmu = pmu_ctx->pmu;
if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
- struct perf_cpu_pmu_context *cpc;
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
- cpc = this_cpu_ptr(pmu->cpu_pmu_context);
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
cpc->task_epc = NULL;
}
@@ -3473,8 +3515,7 @@ static void __perf_event_sync_stat(struct perf_event *event,
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE)
- event->pmu->read(event);
+ perf_pmu_read(event);
perf_event_update_time(event);
@@ -3522,52 +3563,17 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-#define double_list_for_each_entry(pos1, pos2, head1, head2, member) \
- for (pos1 = list_first_entry(head1, typeof(*pos1), member), \
- pos2 = list_first_entry(head2, typeof(*pos2), member); \
- !list_entry_is_head(pos1, head1, member) && \
- !list_entry_is_head(pos2, head2, member); \
- pos1 = list_next_entry(pos1, member), \
- pos2 = list_next_entry(pos2, member))
-
-static void perf_event_swap_task_ctx_data(struct perf_event_context *prev_ctx,
- struct perf_event_context *next_ctx)
-{
- struct perf_event_pmu_context *prev_epc, *next_epc;
-
- if (!prev_ctx->nr_task_data)
- return;
-
- double_list_for_each_entry(prev_epc, next_epc,
- &prev_ctx->pmu_ctx_list, &next_ctx->pmu_ctx_list,
- pmu_ctx_entry) {
-
- if (WARN_ON_ONCE(prev_epc->pmu != next_epc->pmu))
- continue;
-
- /*
- * PMU specific parts of task perf context can require
- * additional synchronization. As an example of such
- * synchronization see implementation details of Intel
- * LBR call stack data profiling;
- */
- if (prev_epc->pmu->swap_task_ctx)
- prev_epc->pmu->swap_task_ctx(prev_epc, next_epc);
- else
- swap(prev_epc->task_ctx_data, next_epc->task_ctx_data);
- }
-}
-
-static void perf_ctx_sched_task_cb(struct perf_event_context *ctx, bool sched_in)
+static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
+ struct task_struct *task, bool sched_in)
{
struct perf_event_pmu_context *pmu_ctx;
struct perf_cpu_pmu_context *cpc;
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
- cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ cpc = this_cpc(pmu_ctx->pmu);
if (cpc->sched_cb_usage && pmu_ctx->pmu->sched_task)
- pmu_ctx->pmu->sched_task(pmu_ctx, sched_in);
+ pmu_ctx->pmu->sched_task(pmu_ctx, task, sched_in);
}
}
@@ -3630,17 +3636,16 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
WRITE_ONCE(ctx->task, next);
WRITE_ONCE(next_ctx->task, task);
- perf_ctx_sched_task_cb(ctx, false);
- perf_event_swap_task_ctx_data(ctx, next_ctx);
+ perf_ctx_sched_task_cb(ctx, task, false);
perf_ctx_enable(ctx, false);
/*
* RCU_INIT_POINTER here is safe because we've not
* modified the ctx and the above modification of
- * ctx->task and ctx->task_ctx_data are immaterial
- * since those values are always verified under
- * ctx->lock which we're now holding.
+ * ctx->task is immaterial since this value is
+ * always verified under ctx->lock which we're now
+ * holding.
*/
RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
RCU_INIT_POINTER(next->perf_event_ctxp, ctx);
@@ -3660,7 +3665,7 @@ unlock:
perf_ctx_disable(ctx, false);
inside_switch:
- perf_ctx_sched_task_cb(ctx, false);
+ perf_ctx_sched_task_cb(ctx, task, false);
task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false);
@@ -3673,7 +3678,7 @@ static DEFINE_PER_CPU(int, perf_sched_cb_usages);
void perf_sched_cb_dec(struct pmu *pmu)
{
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
this_cpu_dec(perf_sched_cb_usages);
barrier();
@@ -3685,7 +3690,7 @@ void perf_sched_cb_dec(struct pmu *pmu)
void perf_sched_cb_inc(struct pmu *pmu)
{
- struct perf_cpu_pmu_context *cpc = this_cpu_ptr(pmu->cpu_pmu_context);
+ struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
if (!cpc->sched_cb_usage++)
list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
@@ -3702,7 +3707,8 @@ void perf_sched_cb_inc(struct pmu *pmu)
* PEBS requires this to provide PID/TID information. This requires we flush
* all queued PEBS records before we context switch to a new task.
*/
-static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_in)
+static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc,
+ struct task_struct *task, bool sched_in)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct pmu *pmu;
@@ -3716,7 +3722,7 @@ static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc, bool sched_i
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
- pmu->sched_task(cpc->task_epc, sched_in);
+ pmu->sched_task(cpc->task_epc, task, sched_in);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
@@ -3734,7 +3740,7 @@ static void perf_pmu_sched_task(struct task_struct *prev,
return;
list_for_each_entry(cpc, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
- __perf_pmu_sched_task(cpc, sched_in);
+ __perf_pmu_sched_task(cpc, sched_in ? next : prev, sched_in);
}
static void perf_event_switch(struct task_struct *task,
@@ -3802,7 +3808,7 @@ static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
if (!pmu_ctx->ctx->task)
return;
- cpc = this_cpu_ptr(pmu_ctx->pmu->cpu_pmu_context);
+ cpc = this_cpc(pmu_ctx->pmu);
WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
cpc->task_epc = pmu_ctx;
}
@@ -3930,11 +3936,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+
+ if (*perf_event_fasync(event))
+ event->pending_kill = POLL_HUP;
+
+ perf_event_wakeup(event);
} else {
- struct perf_cpu_pmu_context *cpc;
+ struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);
event->pmu_ctx->rotate_necessary = 1;
- cpc = this_cpu_ptr(event->pmu_ctx->pmu->cpu_pmu_context);
perf_mux_hrtimer_restart(cpc);
group_update_userpage(event);
}
@@ -4029,7 +4039,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
perf_ctx_lock(cpuctx, ctx);
perf_ctx_disable(ctx, false);
- perf_ctx_sched_task_cb(ctx, true);
+ perf_ctx_sched_task_cb(ctx, task, true);
perf_ctx_enable(ctx, false);
perf_ctx_unlock(cpuctx, ctx);
@@ -4060,7 +4070,7 @@ static void perf_event_context_sched_in(struct task_struct *task)
perf_event_sched_in(cpuctx, ctx, NULL);
- perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
+ perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true);
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
perf_ctx_enable(&cpuctx->ctx, false);
@@ -4618,15 +4628,8 @@ static void __perf_event_read(void *info)
pmu->read(event);
- for_each_sibling_event(sub, event) {
- if (sub->state == PERF_EVENT_STATE_ACTIVE) {
- /*
- * Use sibling's PMU rather than @event's since
- * sibling could be on different (eg: software) PMU.
- */
- sub->pmu->read(sub);
- }
- }
+ for_each_sibling_event(sub, event)
+ perf_pmu_read(sub);
data->ret = pmu->commit_txn(pmu);
@@ -4883,7 +4886,7 @@ find_get_context(struct task_struct *task, struct perf_event *event)
if (!task) {
/* Must be root to operate on a CPU event: */
- err = perf_allow_cpu(&event->attr);
+ err = perf_allow_cpu();
if (err)
return ERR_PTR(err);
@@ -4951,7 +4954,6 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
struct perf_event *event)
{
struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc;
- void *task_ctx_data = NULL;
if (!ctx->task) {
/*
@@ -4961,11 +4963,14 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
*/
struct perf_cpu_pmu_context *cpc;
- cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
epc = &cpc->epc;
raw_spin_lock_irq(&ctx->lock);
if (!epc->ctx) {
- atomic_set(&epc->refcount, 1);
+ /*
+ * One extra reference for the pmu; see perf_pmu_free().
+ */
+ atomic_set(&epc->refcount, 2);
epc->embedded = 1;
list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
epc->ctx = ctx;
@@ -4981,14 +4986,6 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
if (!new)
return ERR_PTR(-ENOMEM);
- if (event->attach_state & PERF_ATTACH_TASK_DATA) {
- task_ctx_data = alloc_task_ctx_data(pmu);
- if (!task_ctx_data) {
- kfree(new);
- return ERR_PTR(-ENOMEM);
- }
- }
-
__perf_init_event_pmu_context(new, pmu);
/*
@@ -5023,14 +5020,7 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
epc->ctx = ctx;
found_epc:
- if (task_ctx_data && !epc->task_ctx_data) {
- epc->task_ctx_data = task_ctx_data;
- task_ctx_data = NULL;
- ctx->nr_task_data++;
- }
raw_spin_unlock_irq(&ctx->lock);
-
- free_task_ctx_data(pmu, task_ctx_data);
kfree(new);
return epc;
@@ -5041,11 +5031,18 @@ static void get_pmu_ctx(struct perf_event_pmu_context *epc)
WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
}
+static void free_cpc_rcu(struct rcu_head *head)
+{
+ struct perf_cpu_pmu_context *cpc =
+ container_of(head, typeof(*cpc), epc.rcu_head);
+
+ kfree(cpc);
+}
+
static void free_epc_rcu(struct rcu_head *head)
{
struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);
- kfree(epc->task_ctx_data);
kfree(epc);
}
@@ -5075,8 +5072,10 @@ static void put_pmu_ctx(struct perf_event_pmu_context *epc)
raw_spin_unlock_irqrestore(&ctx->lock, flags);
- if (epc->embedded)
+ if (epc->embedded) {
+ call_rcu(&epc->rcu_head, free_cpc_rcu);
return;
+ }
call_rcu(&epc->rcu_head, free_epc_rcu);
}
@@ -5152,6 +5151,225 @@ static void unaccount_freq_event(void)
atomic_dec(&nr_freq_events);
}
+
+static struct perf_ctx_data *
+alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
+{
+ struct perf_ctx_data *cd;
+
+ cd = kzalloc(sizeof(*cd), GFP_KERNEL);
+ if (!cd)
+ return NULL;
+
+ cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
+ if (!cd->data) {
+ kfree(cd);
+ return NULL;
+ }
+
+ cd->global = global;
+ cd->ctx_cache = ctx_cache;
+ refcount_set(&cd->refcount, 1);
+
+ return cd;
+}
+
+static void free_perf_ctx_data(struct perf_ctx_data *cd)
+{
+ kmem_cache_free(cd->ctx_cache, cd->data);
+ kfree(cd);
+}
+
+static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
+{
+ struct perf_ctx_data *cd;
+
+ cd = container_of(rcu_head, struct perf_ctx_data, rcu_head);
+ free_perf_ctx_data(cd);
+}
+
+static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
+{
+ call_rcu(&cd->rcu_head, __free_perf_ctx_data_rcu);
+}
+
+static int
+attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
+ bool global)
+{
+ struct perf_ctx_data *cd, *old = NULL;
+
+ cd = alloc_perf_ctx_data(ctx_cache, global);
+ if (!cd)
+ return -ENOMEM;
+
+ for (;;) {
+ if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
+ if (old)
+ perf_free_ctx_data_rcu(old);
+ return 0;
+ }
+
+ if (!old) {
+ /*
+ * After seeing a dead @old, we raced with
+ * removal and lost, try again to install @cd.
+ */
+ continue;
+ }
+
+ if (refcount_inc_not_zero(&old->refcount)) {
+ free_perf_ctx_data(cd); /* unused */
+ return 0;
+ }
+
+ /*
+ * @old is a dead object, refcount==0 is stable, try and
+ * replace it with @cd.
+ */
+ }
+ return 0;
+}
+
+static void __detach_global_ctx_data(void);
+DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem);
+static refcount_t global_ctx_data_ref;
+
+static int
+attach_global_ctx_data(struct kmem_cache *ctx_cache)
+{
+ struct task_struct *g, *p;
+ struct perf_ctx_data *cd;
+ int ret;
+
+ if (refcount_inc_not_zero(&global_ctx_data_ref))
+ return 0;
+
+ guard(percpu_write)(&global_ctx_data_rwsem);
+ if (refcount_inc_not_zero(&global_ctx_data_ref))
+ return 0;
+again:
+ /* Allocate everything */
+ scoped_guard (rcu) {
+ for_each_process_thread(g, p) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (cd && !cd->global) {
+ cd->global = 1;
+ if (!refcount_inc_not_zero(&cd->refcount))
+ cd = NULL;
+ }
+ if (!cd) {
+ get_task_struct(p);
+ goto alloc;
+ }
+ }
+ }
+
+ refcount_set(&global_ctx_data_ref, 1);
+
+ return 0;
+alloc:
+ ret = attach_task_ctx_data(p, ctx_cache, true);
+ put_task_struct(p);
+ if (ret) {
+ __detach_global_ctx_data();
+ return ret;
+ }
+ goto again;
+}
+
+static int
+attach_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+ struct kmem_cache *ctx_cache = event->pmu->task_ctx_cache;
+ int ret;
+
+ if (!ctx_cache)
+ return -ENOMEM;
+
+ if (task)
+ return attach_task_ctx_data(task, ctx_cache, false);
+
+ ret = attach_global_ctx_data(ctx_cache);
+ if (ret)
+ return ret;
+
+ event->attach_state |= PERF_ATTACH_GLOBAL_DATA;
+ return 0;
+}
+
+static void
+detach_task_ctx_data(struct task_struct *p)
+{
+ struct perf_ctx_data *cd;
+
+ scoped_guard (rcu) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (!cd || !refcount_dec_and_test(&cd->refcount))
+ return;
+ }
+
+ /*
+ * The old ctx_data may be lost because of the race.
+ * Nothing is required to do for the case.
+ * See attach_task_ctx_data().
+ */
+ if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
+ perf_free_ctx_data_rcu(cd);
+}
+
+static void __detach_global_ctx_data(void)
+{
+ struct task_struct *g, *p;
+ struct perf_ctx_data *cd;
+
+again:
+ scoped_guard (rcu) {
+ for_each_process_thread(g, p) {
+ cd = rcu_dereference(p->perf_ctx_data);
+ if (!cd || !cd->global)
+ continue;
+ cd->global = 0;
+ get_task_struct(p);
+ goto detach;
+ }
+ }
+ return;
+detach:
+ detach_task_ctx_data(p);
+ put_task_struct(p);
+ goto again;
+}
+
+static void detach_global_ctx_data(void)
+{
+ if (refcount_dec_not_one(&global_ctx_data_ref))
+ return;
+
+ guard(percpu_write)(&global_ctx_data_rwsem);
+ if (!refcount_dec_and_test(&global_ctx_data_ref))
+ return;
+
+ /* remove everything */
+ __detach_global_ctx_data();
+}
+
+static void detach_perf_ctx_data(struct perf_event *event)
+{
+ struct task_struct *task = event->hw.target;
+
+ event->attach_state &= ~PERF_ATTACH_TASK_DATA;
+
+ if (task)
+ return detach_task_ctx_data(task);
+
+ if (event->attach_state & PERF_ATTACH_GLOBAL_DATA) {
+ detach_global_ctx_data();
+ event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA;
+ }
+}
+
static void unaccount_event(struct perf_event *event)
{
bool dec = false;
@@ -5246,6 +5464,8 @@ static int exclusive_event_init(struct perf_event *event)
return -EBUSY;
}
+ event->attach_state |= PERF_ATTACH_EXCLUSIVE;
+
return 0;
}
@@ -5253,14 +5473,13 @@ static void exclusive_event_destroy(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
- if (!is_exclusive_pmu(pmu))
- return;
-
/* see comment in exclusive_event_init() */
if (event->attach_state & PERF_ATTACH_TASK)
atomic_dec(&pmu->exclusive_cnt);
else
atomic_inc(&pmu->exclusive_cnt);
+
+ event->attach_state &= ~PERF_ATTACH_EXCLUSIVE;
}
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
@@ -5292,8 +5511,7 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
-static void perf_addr_filters_splice(struct perf_event *event,
- struct list_head *head);
+static void perf_free_addr_filters(struct perf_event *event);
static void perf_pending_task_sync(struct perf_event *event)
{
@@ -5319,39 +5537,22 @@ static void perf_pending_task_sync(struct perf_event *event)
rcuwait_wait_event(&event->pending_work_wait, !event->pending_work, TASK_UNINTERRUPTIBLE);
}
-static void _free_event(struct perf_event *event)
+/* vs perf_event_alloc() error */
+static void __free_event(struct perf_event *event)
{
- irq_work_sync(&event->pending_irq);
- irq_work_sync(&event->pending_disable_irq);
- perf_pending_task_sync(event);
-
- unaccount_event(event);
+ if (event->attach_state & PERF_ATTACH_CALLCHAIN)
+ put_callchain_buffers();
- security_perf_event_free(event);
+ kfree(event->addr_filter_ranges);
- if (event->rb) {
- /*
- * Can happen when we close an event with re-directed output.
- *
- * Since we have a 0 refcount, perf_mmap_close() will skip
- * over us; possibly making our ring_buffer_put() the last.
- */
- mutex_lock(&event->mmap_mutex);
- ring_buffer_attach(event, NULL);
- mutex_unlock(&event->mmap_mutex);
- }
+ if (event->attach_state & PERF_ATTACH_EXCLUSIVE)
+ exclusive_event_destroy(event);
if (is_cgroup_event(event))
perf_detach_cgroup(event);
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-
- perf_event_free_bpf_prog(event);
- perf_addr_filters_splice(event, NULL);
- kfree(event->addr_filter_ranges);
+ if (event->attach_state & PERF_ATTACH_TASK_DATA)
+ detach_perf_ctx_data(event);
if (event->destroy)
event->destroy(event);
@@ -5363,22 +5564,60 @@ static void _free_event(struct perf_event *event)
if (event->hw.target)
put_task_struct(event->hw.target);
- if (event->pmu_ctx)
+ if (event->pmu_ctx) {
+ /*
+ * put_pmu_ctx() needs an event->ctx reference, because of
+ * epc->ctx.
+ */
+ WARN_ON_ONCE(!event->ctx);
+ WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
put_pmu_ctx(event->pmu_ctx);
+ }
/*
- * perf_event_free_task() relies on put_ctx() being 'last', in particular
- * all task references must be cleaned up.
+ * perf_event_free_task() relies on put_ctx() being 'last', in
+ * particular all task references must be cleaned up.
*/
if (event->ctx)
put_ctx(event->ctx);
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
+ if (event->pmu)
+ module_put(event->pmu->module);
call_rcu(&event->rcu_head, free_event_rcu);
}
+DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T))
+
+/* vs perf_event_alloc() success */
+static void _free_event(struct perf_event *event)
+{
+ irq_work_sync(&event->pending_irq);
+ irq_work_sync(&event->pending_disable_irq);
+ perf_pending_task_sync(event);
+
+ unaccount_event(event);
+
+ security_perf_event_free(event);
+
+ if (event->rb) {
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ ring_buffer_attach(event, NULL);
+ mutex_unlock(&event->mmap_mutex);
+ }
+
+ perf_event_free_bpf_prog(event);
+ perf_free_addr_filters(event);
+
+ __free_event(event);
+}
+
/*
* Used to free events which have a known refcount of 1, such as in error paths
* where the event isn't exposed yet and inherited events.
@@ -5847,6 +6086,10 @@ static __poll_t perf_poll(struct file *file, poll_table *wait)
if (is_event_hup(event))
return events;
+ if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
+ event->attr.pinned))
+ return events;
+
/*
* Pin the event->rb by taking event->mmap_mutex; otherwise
* perf_event_set_output() can swizzle our rb and make us miss wakeups.
@@ -6616,7 +6859,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
unsigned long vma_size;
unsigned long nr_pages;
long user_extra = 0, extra = 0;
- int ret = 0, flags = 0;
+ int ret, flags = 0;
/*
* Don't allow mmap() of inherited per-task counters. This would
@@ -6634,9 +6877,54 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
return ret;
vma_size = vma->vm_end - vma->vm_start;
+ nr_pages = vma_size / PAGE_SIZE;
+
+ if (nr_pages > INT_MAX)
+ return -ENOMEM;
+
+ if (vma_size != PAGE_SIZE * nr_pages)
+ return -EINVAL;
+
+ user_extra = nr_pages;
+
+ mutex_lock(&event->mmap_mutex);
+ ret = -EINVAL;
if (vma->vm_pgoff == 0) {
- nr_pages = (vma_size / PAGE_SIZE) - 1;
+ nr_pages -= 1;
+
+ /*
+ * If we have rb pages ensure they're a power-of-two number, so we
+ * can do bitmasks instead of modulo.
+ */
+ if (nr_pages != 0 && !is_power_of_2(nr_pages))
+ goto unlock;
+
+ WARN_ON_ONCE(event->ctx->parent_ctx);
+
+ if (event->rb) {
+ if (data_page_nr(event->rb) != nr_pages)
+ goto unlock;
+
+ if (atomic_inc_not_zero(&event->rb->mmap_count)) {
+ /*
+ * Success -- managed to mmap() the same buffer
+ * multiple times.
+ */
+ ret = 0;
+ /* We need the rb to map pages. */
+ rb = event->rb;
+ goto unlock;
+ }
+
+ /*
+ * Raced against perf_mmap_close()'s
+ * atomic_dec_and_mutex_lock() remove the
+ * event and continue as if !event->rb
+ */
+ ring_buffer_attach(event, NULL);
+ }
+
} else {
/*
* AUX area mapping: if rb->aux_nr_pages != 0, it's already
@@ -6645,16 +6933,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
*/
u64 aux_offset, aux_size;
- if (!event->rb)
- return -EINVAL;
-
- nr_pages = vma_size / PAGE_SIZE;
- if (nr_pages > INT_MAX)
- return -ENOMEM;
-
- mutex_lock(&event->mmap_mutex);
- ret = -EINVAL;
-
rb = event->rb;
if (!rb)
goto aux_unlock;
@@ -6695,48 +6973,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
}
atomic_set(&rb->aux_mmap_count, 1);
- user_extra = nr_pages;
-
- goto accounting;
- }
-
- /*
- * If we have rb pages ensure they're a power-of-two number, so we
- * can do bitmasks instead of modulo.
- */
- if (nr_pages != 0 && !is_power_of_2(nr_pages))
- return -EINVAL;
-
- if (vma_size != PAGE_SIZE * (1 + nr_pages))
- return -EINVAL;
-
- WARN_ON_ONCE(event->ctx->parent_ctx);
-again:
- mutex_lock(&event->mmap_mutex);
- if (event->rb) {
- if (data_page_nr(event->rb) != nr_pages) {
- ret = -EINVAL;
- goto unlock;
- }
-
- if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
- /*
- * Raced against perf_mmap_close(); remove the
- * event and try again.
- */
- ring_buffer_attach(event, NULL);
- mutex_unlock(&event->mmap_mutex);
- goto again;
- }
-
- /* We need the rb to map pages. */
- rb = event->rb;
- goto unlock;
}
- user_extra = nr_pages + 1;
-
-accounting:
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
/*
@@ -6804,6 +7042,8 @@ accounting:
rb->aux_mmap_locked = extra;
}
+ ret = 0;
+
unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
@@ -6828,7 +7068,7 @@ aux_unlock:
if (!ret)
ret = map_range(rb, vma);
- if (event->pmu->event_mapped)
+ if (!ret && event->pmu->event_mapped)
event->pmu->event_mapped(event, vma->vm_mm);
return ret;
@@ -7452,9 +7692,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
values[n++] = running;
- if ((leader != event) &&
- (leader->state == PERF_EVENT_STATE_ACTIVE))
- leader->pmu->read(leader);
+ if ((leader != event) && !handle->skip_read)
+ perf_pmu_read(leader);
values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
@@ -7467,9 +7706,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
for_each_sibling_event(sub, leader) {
n = 0;
- if ((sub != event) &&
- (sub->state == PERF_EVENT_STATE_ACTIVE))
- sub->pmu->read(sub);
+ if ((sub != event) && !handle->skip_read)
+ perf_pmu_read(sub);
values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
@@ -7528,6 +7766,9 @@ void perf_output_sample(struct perf_output_handle *handle,
{
u64 sample_type = data->type;
+ if (data->sample_flags & PERF_SAMPLE_READ)
+ handle->skip_read = 1;
+
perf_output_put(handle, *header);
if (sample_type & PERF_SAMPLE_IDENTIFIER)
@@ -8522,10 +8763,58 @@ static void perf_event_task(struct task_struct *task,
task_ctx);
}
+/*
+ * Allocate data for a new task when profiling system-wide
+ * events which require PMU specific data
+ */
+static void
+perf_event_alloc_task_data(struct task_struct *child,
+ struct task_struct *parent)
+{
+ struct kmem_cache *ctx_cache = NULL;
+ struct perf_ctx_data *cd;
+
+ if (!refcount_read(&global_ctx_data_ref))
+ return;
+
+ scoped_guard (rcu) {
+ cd = rcu_dereference(parent->perf_ctx_data);
+ if (cd)
+ ctx_cache = cd->ctx_cache;
+ }
+
+ if (!ctx_cache)
+ return;
+
+ guard(percpu_read)(&global_ctx_data_rwsem);
+ scoped_guard (rcu) {
+ cd = rcu_dereference(child->perf_ctx_data);
+ if (!cd) {
+ /*
+ * A system-wide event may be unaccount,
+ * when attaching the perf_ctx_data.
+ */
+ if (!refcount_read(&global_ctx_data_ref))
+ return;
+ goto attach;
+ }
+
+ if (!cd->global) {
+ cd->global = 1;
+ refcount_inc(&cd->refcount);
+ }
+ }
+
+ return;
+attach:
+ attach_task_ctx_data(child, ctx_cache, true);
+}
+
void perf_event_fork(struct task_struct *task)
{
perf_event_task(task, NULL, 1);
perf_event_namespaces(task);
+ perf_event_alloc_task_data(task, current);
}
/*
@@ -8589,7 +8878,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
unsigned int size;
memset(comm, 0, sizeof(comm));
- strscpy(comm, comm_event->task->comm, sizeof(comm));
+ strscpy(comm, comm_event->task->comm);
size = ALIGN(strlen(comm)+1, sizeof(u64));
comm_event->comm = comm;
@@ -9033,7 +9322,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
}
cpy_name:
- strscpy(tmp, name, sizeof(tmp));
+ strscpy(tmp, name);
name = tmp;
got_name:
/*
@@ -9457,7 +9746,7 @@ void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN)
goto err;
- strscpy(name, sym, KSYM_NAME_LEN);
+ strscpy(name, sym);
name_len = strlen(name) + 1;
while (!IS_ALIGNED(name_len, sizeof(u64)))
name[name_len++] = '\0';
@@ -10840,6 +11129,9 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
void perf_event_free_bpf_prog(struct perf_event *event)
{
+ if (!event->prog)
+ return;
+
if (!perf_event_is_tracing(event)) {
perf_event_free_bpf_handler(event);
return;
@@ -10938,6 +11230,17 @@ static void perf_addr_filters_splice(struct perf_event *event,
free_filters_list(&list);
}
+static void perf_free_addr_filters(struct perf_event *event)
+{
+ /*
+ * Used during free paths, there is no concurrency.
+ */
+ if (list_empty(&event->addr_filters.list))
+ return;
+
+ perf_addr_filters_splice(event, NULL);
+}
+
/*
* Scan through mm's vmas and see if one of them matches the
* @filter; if so, adjust filter's address range.
@@ -11376,8 +11679,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
if (!is_sampling_event(event))
return;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- hwc->hrtimer.function = perf_swevent_hrtimer;
+ hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
/*
* Since hrtimers have a fixed rate, we can do a static freq->period
@@ -11614,11 +11916,6 @@ static int perf_event_idx_default(struct perf_event *event)
return 0;
}
-static void free_pmu_context(struct pmu *pmu)
-{
- free_percpu(pmu->cpu_pmu_context);
-}
-
/*
* Let userspace know that this PMU supports address range filtering:
*/
@@ -11628,7 +11925,7 @@ static ssize_t nr_addr_filters_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
+ return sysfs_emit(page, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);
@@ -11639,7 +11936,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
{
struct pmu *pmu = dev_get_drvdata(dev);
- return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type);
+ return sysfs_emit(page, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);
@@ -11650,7 +11947,7 @@ perf_event_mux_interval_ms_show(struct device *dev,
{
struct pmu *pmu = dev_get_drvdata(dev);
- return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms);
+ return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms);
}
static DEFINE_MUTEX(mux_interval_mutex);
@@ -11681,7 +11978,7 @@ perf_event_mux_interval_ms_store(struct device *dev,
cpus_read_lock();
for_each_online_cpu(cpu) {
struct perf_cpu_pmu_context *cpc;
- cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
@@ -11824,6 +12121,7 @@ del_dev:
free_dev:
put_device(pmu->dev);
+ pmu->dev = NULL;
goto out;
}
@@ -11845,57 +12143,85 @@ static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new)
return true;
}
-int perf_pmu_register(struct pmu *pmu, const char *name, int type)
+static void perf_pmu_free(struct pmu *pmu)
{
- int cpu, ret, max = PERF_TYPE_MAX;
+ if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
+ if (pmu->nr_addr_filters)
+ device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
+ device_del(pmu->dev);
+ put_device(pmu->dev);
+ }
- mutex_lock(&pmus_lock);
- ret = -ENOMEM;
- pmu->pmu_disable_count = alloc_percpu(int);
- if (!pmu->pmu_disable_count)
- goto unlock;
+ if (pmu->cpu_pmu_context) {
+ int cpu;
- pmu->type = -1;
- if (WARN_ONCE(!name, "Can not register anonymous pmu.\n")) {
- ret = -EINVAL;
- goto free_pdc;
- }
+ for_each_possible_cpu(cpu) {
+ struct perf_cpu_pmu_context *cpc;
- if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
- ret = -EINVAL;
- goto free_pdc;
+ cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ if (!cpc)
+ continue;
+ if (cpc->epc.embedded) {
+ /* refcount managed */
+ put_pmu_ctx(&cpc->epc);
+ continue;
+ }
+ kfree(cpc);
+ }
+ free_percpu(pmu->cpu_pmu_context);
}
+}
+
+DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T))
+
+int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
+{
+ int cpu, max = PERF_TYPE_MAX;
+
+ struct pmu *pmu __free(pmu_unregister) = _pmu;
+ guard(mutex)(&pmus_lock);
+
+ if (WARN_ONCE(!name, "Can not register anonymous pmu.\n"))
+ return -EINVAL;
+
+ if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE,
+ "Can not register a pmu with an invalid scope.\n"))
+ return -EINVAL;
pmu->name = name;
if (type >= 0)
max = type;
- ret = idr_alloc(&pmu_idr, NULL, max, 0, GFP_KERNEL);
- if (ret < 0)
- goto free_pdc;
+ CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL);
+ if (pmu_type.id < 0)
+ return pmu_type.id;
- WARN_ON(type >= 0 && ret != type);
+ WARN_ON(type >= 0 && pmu_type.id != type);
- type = ret;
- pmu->type = type;
+ pmu->type = pmu_type.id;
atomic_set(&pmu->exclusive_cnt, 0);
if (pmu_bus_running && !pmu->dev) {
- ret = pmu_dev_alloc(pmu);
+ int ret = pmu_dev_alloc(pmu);
if (ret)
- goto free_idr;
+ return ret;
}
- ret = -ENOMEM;
- pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context);
+ pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *);
if (!pmu->cpu_pmu_context)
- goto free_dev;
+ return -ENOMEM;
for_each_possible_cpu(cpu) {
- struct perf_cpu_pmu_context *cpc;
+ struct perf_cpu_pmu_context *cpc =
+ kmalloc_node(sizeof(struct perf_cpu_pmu_context),
+ GFP_KERNEL | __GFP_ZERO,
+ cpu_to_node(cpu));
- cpc = per_cpu_ptr(pmu->cpu_pmu_context, cpu);
+ if (!cpc)
+ return -ENOMEM;
+
+ *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc;
__perf_init_event_pmu_context(&cpc->epc, pmu);
__perf_mux_hrtimer_init(cpc, cpu);
}
@@ -11932,39 +12258,21 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
* Now that the PMU is complete, make it visible to perf_try_init_event().
*/
if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu))
- goto free_context;
+ return -EINVAL;
list_add_rcu(&pmu->entry, &pmus);
- ret = 0;
-unlock:
- mutex_unlock(&pmus_lock);
-
- return ret;
-
-free_context:
- free_percpu(pmu->cpu_pmu_context);
-
-free_dev:
- if (pmu->dev && pmu->dev != PMU_NULL_DEV) {
- device_del(pmu->dev);
- put_device(pmu->dev);
- }
-
-free_idr:
- idr_remove(&pmu_idr, pmu->type);
-
-free_pdc:
- free_percpu(pmu->pmu_disable_count);
- goto unlock;
+ take_idr_id(pmu_type);
+ _pmu = no_free_ptr(pmu); // let it rip
+ return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_register);
void perf_pmu_unregister(struct pmu *pmu)
{
- mutex_lock(&pmus_lock);
- list_del_rcu(&pmu->entry);
- idr_remove(&pmu_idr, pmu->type);
- mutex_unlock(&pmus_lock);
+ scoped_guard (mutex, &pmus_lock) {
+ list_del_rcu(&pmu->entry);
+ idr_remove(&pmu_idr, pmu->type);
+ }
/*
* We dereference the pmu list under both SRCU and regular RCU, so
@@ -11973,14 +12281,7 @@ void perf_pmu_unregister(struct pmu *pmu)
synchronize_srcu(&pmus_srcu);
synchronize_rcu();
- free_percpu(pmu->pmu_disable_count);
- if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
- if (pmu->nr_addr_filters)
- device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
- device_del(pmu->dev);
- put_device(pmu->dev);
- }
- free_pmu_context(pmu);
+ perf_pmu_free(pmu);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);
@@ -12020,48 +12321,61 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
if (ctx)
perf_event_ctx_unlock(event->group_leader, ctx);
- if (!ret) {
- if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
- has_extended_regs(event))
- ret = -EOPNOTSUPP;
+ if (ret)
+ goto err_pmu;
- if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
- event_has_any_exclude_flag(event))
- ret = -EINVAL;
+ if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
+ has_extended_regs(event)) {
+ ret = -EOPNOTSUPP;
+ goto err_destroy;
+ }
- if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
- const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
- struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
- int cpu;
-
- if (pmu_cpumask && cpumask) {
- cpu = cpumask_any_and(pmu_cpumask, cpumask);
- if (cpu >= nr_cpu_ids)
- ret = -ENODEV;
- else
- event->event_caps |= PERF_EV_CAP_READ_SCOPE;
- } else {
- ret = -ENODEV;
- }
- }
+ if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
+ event_has_any_exclude_flag(event)) {
+ ret = -EINVAL;
+ goto err_destroy;
+ }
+
+ if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
+ const struct cpumask *cpumask;
+ struct cpumask *pmu_cpumask;
+ int cpu;
- if (ret && event->destroy)
- event->destroy(event);
+ cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
+ pmu_cpumask = perf_scope_cpumask(pmu->scope);
+
+ ret = -ENODEV;
+ if (!pmu_cpumask || !cpumask)
+ goto err_destroy;
+
+ cpu = cpumask_any_and(pmu_cpumask, cpumask);
+ if (cpu >= nr_cpu_ids)
+ goto err_destroy;
+
+ event->event_caps |= PERF_EV_CAP_READ_SCOPE;
}
- if (ret)
- module_put(pmu->module);
+ return 0;
+
+err_destroy:
+ if (event->destroy) {
+ event->destroy(event);
+ event->destroy = NULL;
+ }
+err_pmu:
+ event->pmu = NULL;
+ module_put(pmu->module);
return ret;
}
static struct pmu *perf_init_event(struct perf_event *event)
{
bool extended_type = false;
- int idx, type, ret;
struct pmu *pmu;
+ int type, ret;
- idx = srcu_read_lock(&pmus_srcu);
+ guard(srcu)(&pmus_srcu);
/*
* Save original type before calling pmu->event_init() since certain
@@ -12074,7 +12388,7 @@ static struct pmu *perf_init_event(struct perf_event *event)
pmu = event->parent->pmu;
ret = perf_try_init_event(pmu, event);
if (!ret)
- goto unlock;
+ return pmu;
}
/*
@@ -12093,13 +12407,12 @@ static struct pmu *perf_init_event(struct perf_event *event)
}
again:
- rcu_read_lock();
- pmu = idr_find(&pmu_idr, type);
- rcu_read_unlock();
+ scoped_guard (rcu)
+ pmu = idr_find(&pmu_idr, type);
if (pmu) {
if (event->attr.type != type && type != PERF_TYPE_RAW &&
!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
- goto fail;
+ return ERR_PTR(-ENOENT);
ret = perf_try_init_event(pmu, event);
if (ret == -ENOENT && event->attr.type != type && !extended_type) {
@@ -12108,27 +12421,21 @@ again:
}
if (ret)
- pmu = ERR_PTR(ret);
+ return ERR_PTR(ret);
- goto unlock;
+ return pmu;
}
list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
ret = perf_try_init_event(pmu, event);
if (!ret)
- goto unlock;
+ return pmu;
- if (ret != -ENOENT) {
- pmu = ERR_PTR(ret);
- goto unlock;
- }
+ if (ret != -ENOENT)
+ return ERR_PTR(ret);
}
-fail:
- pmu = ERR_PTR(-ENOENT);
-unlock:
- srcu_read_unlock(&pmus_srcu, idx);
- return pmu;
+ return ERR_PTR(-ENOENT);
}
static void attach_sb_event(struct perf_event *event)
@@ -12255,7 +12562,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
void *context, int cgroup_fd)
{
struct pmu *pmu;
- struct perf_event *event;
struct hw_perf_event *hwc;
long err = -EINVAL;
int node;
@@ -12270,8 +12576,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
- event = kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO,
- node);
+ struct perf_event *event __free(__free_event) =
+ kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node);
if (!event)
return ERR_PTR(-ENOMEM);
@@ -12378,15 +12684,25 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* See perf_output_read().
*/
if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
- goto err_ns;
+ return ERR_PTR(-EINVAL);
if (!has_branch_stack(event))
event->attr.branch_sample_type = 0;
pmu = perf_init_event(event);
- if (IS_ERR(pmu)) {
- err = PTR_ERR(pmu);
- goto err_ns;
+ if (IS_ERR(pmu))
+ return (void*)pmu;
+
+ /*
+ * The PERF_ATTACH_TASK_DATA is set in the event_init()->hw_config().
+ * The attach should be right after the perf_init_event().
+ * Otherwise, the __free_event() would mistakenly detach the non-exist
+ * perf_ctx_data because of the other errors between them.
+ */
+ if (event->attach_state & PERF_ATTACH_TASK_DATA) {
+ err = attach_perf_ctx_data(event);
+ if (err)
+ return ERR_PTR(err);
}
/*
@@ -12394,49 +12710,39 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
* events (they don't make sense as the cgroup will be different
* on other CPUs in the uncore mask).
*/
- if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1)) {
- err = -EINVAL;
- goto err_pmu;
- }
+ if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1))
+ return ERR_PTR(-EINVAL);
if (event->attr.aux_output &&
(!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
- event->attr.aux_pause || event->attr.aux_resume)) {
- err = -EOPNOTSUPP;
- goto err_pmu;
- }
+ event->attr.aux_pause || event->attr.aux_resume))
+ return ERR_PTR(-EOPNOTSUPP);
- if (event->attr.aux_pause && event->attr.aux_resume) {
- err = -EINVAL;
- goto err_pmu;
- }
+ if (event->attr.aux_pause && event->attr.aux_resume)
+ return ERR_PTR(-EINVAL);
if (event->attr.aux_start_paused) {
- if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) {
- err = -EOPNOTSUPP;
- goto err_pmu;
- }
+ if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
+ return ERR_PTR(-EOPNOTSUPP);
event->hw.aux_paused = 1;
}
if (cgroup_fd != -1) {
err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
if (err)
- goto err_pmu;
+ return ERR_PTR(err);
}
err = exclusive_event_init(event);
if (err)
- goto err_pmu;
+ return ERR_PTR(err);
if (has_addr_filter(event)) {
event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
sizeof(struct perf_addr_filter_range),
GFP_KERNEL);
- if (!event->addr_filter_ranges) {
- err = -ENOMEM;
- goto err_per_task;
- }
+ if (!event->addr_filter_ranges)
+ return ERR_PTR(-ENOMEM);
/*
* Clone the parent's vma offsets: they are valid until exec()
@@ -12460,42 +12766,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
err = get_callchain_buffers(attr->sample_max_stack);
if (err)
- goto err_addr_filters;
+ return ERR_PTR(err);
+ event->attach_state |= PERF_ATTACH_CALLCHAIN;
}
}
err = security_perf_event_alloc(event);
if (err)
- goto err_callchain_buffer;
+ return ERR_PTR(err);
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
- return event;
-
-err_callchain_buffer:
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-err_addr_filters:
- kfree(event->addr_filter_ranges);
-
-err_per_task:
- exclusive_event_destroy(event);
-
-err_pmu:
- if (is_cgroup_event(event))
- perf_detach_cgroup(event);
- if (event->destroy)
- event->destroy(event);
- module_put(pmu->module);
-err_ns:
- if (event->hw.target)
- put_task_struct(event->hw.target);
- call_rcu(&event->rcu_head, free_event_rcu);
-
- return ERR_PTR(err);
+ return_ptr(event);
}
static int perf_copy_attr(struct perf_event_attr __user *uattr,
@@ -12565,7 +12848,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
}
/* privileged levels capture (kernel, hv): check permissions */
if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
- ret = perf_allow_kernel(attr);
+ ret = perf_allow_kernel();
if (ret)
return ret;
}
@@ -12822,12 +13105,12 @@ SYSCALL_DEFINE5(perf_event_open,
return err;
/* Do we allow access to perf_event_open(2) ? */
- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN);
+ err = security_perf_event_open(PERF_SECURITY_OPEN);
if (err)
return err;
if (!attr.exclude_kernel) {
- err = perf_allow_kernel(&attr);
+ err = perf_allow_kernel();
if (err)
return err;
}
@@ -12847,7 +13130,7 @@ SYSCALL_DEFINE5(perf_event_open,
/* Only privileged users can get physical addresses */
if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
- err = perf_allow_kernel(&attr);
+ err = perf_allow_kernel();
if (err)
return err;
}
@@ -13569,6 +13852,12 @@ void perf_event_exit_task(struct task_struct *child)
* At this point we need to send EXIT events to cpu contexts.
*/
perf_event_task(child, NULL, 0);
+
+ /*
+ * Detach the perf_ctx_data for the system-wide event.
+ */
+ guard(percpu_read)(&global_ctx_data_rwsem);
+ detach_task_ctx_data(child);
}
static void perf_free_event(struct perf_event *event,
@@ -13680,12 +13969,12 @@ const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
return &event->attr;
}
-int perf_allow_kernel(struct perf_event_attr *attr)
+int perf_allow_kernel(void)
{
if (sysctl_perf_event_paranoid > 1 && !perfmon_capable())
return -EACCES;
- return security_perf_event_open(attr, PERF_SECURITY_KERNEL);
+ return security_perf_event_open(PERF_SECURITY_KERNEL);
}
EXPORT_SYMBOL_GPL(perf_allow_kernel);
@@ -13744,7 +14033,6 @@ inherit_event(struct perf_event *parent_event,
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
mutex_unlock(&parent_event->child_mutex);
- /* task_ctx_data is freed with child_ctx */
free_event(child_event);
return NULL;
}
@@ -14002,6 +14290,7 @@ int perf_event_init_task(struct task_struct *child, u64 clone_flags)
child->perf_event_ctxp = NULL;
mutex_init(&child->perf_event_mutex);
INIT_LIST_HEAD(&child->perf_event_list);
+ child->perf_ctx_data = NULL;
ret = perf_event_init_context(child, clone_flags);
if (ret) {
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index bc4a61029b6d..8ec2cb688903 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -950,9 +950,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp)
return -ENOENT;
/*
- * no branch sampling for breakpoint events
+ * Check if breakpoint type is supported before proceeding.
+ * Also, no branch sampling for breakpoint events.
*/
- if (has_branch_stack(bp))
+ if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp))
return -EOPNOTSUPP;
err = register_perf_hw_breakpoint(bp);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 180509132d4b..5130b119d0ae 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -19,7 +19,7 @@
static void perf_output_wakeup(struct perf_output_handle *handle)
{
- atomic_set(&handle->rb->poll, EPOLLIN);
+ atomic_set(&handle->rb->poll, EPOLLIN | EPOLLRDNORM);
handle->event->pending_wakeup = 1;
@@ -185,6 +185,7 @@ __perf_output_begin(struct perf_output_handle *handle,
handle->rb = rb;
handle->event = event;
+ handle->flags = 0;
have_lost = local_read(&rb->lost);
if (unlikely(have_lost)) {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b4ca8898fe17..70c84b9d7be3 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2169,8 +2169,8 @@ void uprobe_copy_process(struct task_struct *t, unsigned long flags)
*/
unsigned long uprobe_get_trampoline_vaddr(void)
{
+ unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR;
struct xol_area *area;
- unsigned long trampoline_vaddr = -1;
/* Pairs with xol_add_vma() smp_store_release() */
area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */
@@ -2311,9 +2311,8 @@ bool uprobe_deny_signal(void)
WARN_ON_ONCE(utask->state != UTASK_SSTEP);
if (task_sigpending(t)) {
- spin_lock_irq(&t->sighand->siglock);
+ utask->signal_denied = true;
clear_tsk_thread_flag(t, TIF_SIGPENDING);
- spin_unlock_irq(&t->sighand->siglock);
if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
utask->state = UTASK_SSTEP_TRAPPED;
@@ -2746,9 +2745,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
utask->state = UTASK_RUNNING;
xol_free_insn_slot(utask);
- spin_lock_irq(&current->sighand->siglock);
- recalc_sigpending(); /* see uprobe_deny_signal() */
- spin_unlock_irq(&current->sighand->siglock);
+ if (utask->signal_denied) {
+ set_thread_flag(TIF_SIGPENDING);
+ utask->signal_denied = false;
+ }
if (unlikely(err)) {
uprobe_warn(current, "execute the probed insn, sending SIGILL.");
diff --git a/kernel/exit.c b/kernel/exit.c
index 3485e5fc499e..c2e6c7b7779f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -69,6 +69,7 @@
#include <linux/sysfs.h>
#include <linux/user_events.h>
#include <linux/uaccess.h>
+#include <linux/pidfs.h>
#include <uapi/linux/wait.h>
@@ -122,14 +123,22 @@ static __init int kernel_exit_sysfs_init(void)
late_initcall(kernel_exit_sysfs_init);
#endif
-static void __unhash_process(struct task_struct *p, bool group_dead)
+/*
+ * For things release_task() would like to do *after* tasklist_lock is released.
+ */
+struct release_task_post {
+ struct pid *pids[PIDTYPE_MAX];
+};
+
+static void __unhash_process(struct release_task_post *post, struct task_struct *p,
+ bool group_dead)
{
nr_threads--;
- detach_pid(p, PIDTYPE_PID);
+ detach_pid(post->pids, p, PIDTYPE_PID);
if (group_dead) {
- detach_pid(p, PIDTYPE_TGID);
- detach_pid(p, PIDTYPE_PGID);
- detach_pid(p, PIDTYPE_SID);
+ detach_pid(post->pids, p, PIDTYPE_TGID);
+ detach_pid(post->pids, p, PIDTYPE_PGID);
+ detach_pid(post->pids, p, PIDTYPE_SID);
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
@@ -141,7 +150,7 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
/*
* This function expects the tasklist_lock write-locked.
*/
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
@@ -174,9 +183,6 @@ static void __exit_signal(struct task_struct *tsk)
sig->curr_target = next_thread(tsk);
}
- add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
- sizeof(unsigned long long));
-
/*
* Accumulate here the counters for all threads as they die. We could
* skip the group leader because it is the last user of signal_struct,
@@ -197,23 +203,15 @@ static void __exit_signal(struct task_struct *tsk)
task_io_accounting_add(&sig->ioac, &tsk->ioac);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
- __unhash_process(tsk, group_dead);
+ __unhash_process(post, tsk, group_dead);
write_sequnlock(&sig->stats_lock);
- /*
- * Do this under ->siglock, we can race with another thread
- * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
- */
- flush_sigqueue(&tsk->pending);
tsk->sighand = NULL;
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
- clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
- if (group_dead) {
- flush_sigqueue(&sig->shared_pending);
+ if (group_dead)
tty_kref_put(tty);
- }
}
static void delayed_put_task_struct(struct rcu_head *rhp)
@@ -239,22 +237,27 @@ void __weak release_thread(struct task_struct *dead_task)
void release_task(struct task_struct *p)
{
+ struct release_task_post post;
struct task_struct *leader;
struct pid *thread_pid;
int zap_leader;
repeat:
+ memset(&post, 0, sizeof(post));
+
/* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock();
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
rcu_read_unlock();
+ pidfs_exit(p);
cgroup_release(p);
+ thread_pid = get_pid(p->thread_pid);
+
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
- thread_pid = get_pid(p->thread_pid);
- __exit_signal(p);
+ __exit_signal(&post, p);
/*
* If we are the last non-leader member of the thread
@@ -278,7 +281,20 @@ repeat:
write_unlock_irq(&tasklist_lock);
proc_flush_pid(thread_pid);
put_pid(thread_pid);
+ add_device_randomness(&p->se.sum_exec_runtime,
+ sizeof(p->se.sum_exec_runtime));
+ free_pids(post.pids);
release_thread(p);
+ /*
+ * This task was already removed from the process/thread/pid lists
+ * and lock_task_sighand(p) can't succeed. Nobody else can touch
+ * ->pending or, if group dead, signal->shared_pending. We can call
+ * flush_sigqueue() lockless.
+ */
+ flush_sigqueue(&p->pending);
+ if (thread_group_leader(p))
+ flush_sigqueue(&p->signal->shared_pending);
+
put_task_struct_rcu_user(p);
p = leader;
@@ -741,10 +757,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
tsk->exit_state = EXIT_ZOMBIE;
/*
- * sub-thread or delay_group_leader(), wake up the
- * PIDFD_THREAD waiters.
+ * Ignore thread-group leaders that exited before all
+ * subthreads did.
*/
- if (!thread_group_empty(tsk))
+ if (!delay_group_leader(tsk))
do_notify_pidfd(tsk);
if (unlikely(tsk->ptrace)) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 735405a9c5f3..a61a4407ebdf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1891,8 +1891,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
#ifdef CONFIG_POSIX_TIMERS
INIT_HLIST_HEAD(&sig->posix_timers);
INIT_HLIST_HEAD(&sig->ignored_posix_timers);
- hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- sig->real_timer.function = it_real_fn;
+ hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
#endif
task_lock(current->group_leader);
@@ -2032,25 +2031,18 @@ static inline void rcu_copy_process(struct task_struct *p)
*/
static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
- int pidfd;
struct file *pidfd_file;
- pidfd = get_unused_fd_flags(O_CLOEXEC);
+ CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
if (pidfd < 0)
return pidfd;
pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
- if (IS_ERR(pidfd_file)) {
- put_unused_fd(pidfd);
+ if (IS_ERR(pidfd_file))
return PTR_ERR(pidfd_file);
- }
- /*
- * anon_inode_getfile() ignores everything outside of the
- * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually.
- */
- pidfd_file->f_flags |= (flags & PIDFD_THREAD);
+
*ret = pidfd_file;
- return pidfd;
+ return take_fd(pidfd);
}
/**
@@ -2432,8 +2424,11 @@ __latent_entropy struct task_struct *copy_process(
if (clone_flags & CLONE_PIDFD) {
int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
- /* Note that no task has been attached to @pid yet. */
- retval = __pidfd_prepare(pid, flags, &pidfile);
+ /*
+ * Note that no task has been attached to @pid yet indicate
+ * that via CLONE_PIDFD.
+ */
+ retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile);
if (retval < 0)
goto bad_fork_free_pid;
pidfd = retval;
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 3db8567f5a44..cca15859a50b 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -50,10 +50,10 @@
*/
static struct {
struct futex_hash_bucket *queues;
- unsigned long hashsize;
+ unsigned long hashmask;
} __futex_data __read_mostly __aligned(2*sizeof(long));
#define futex_queues (__futex_data.queues)
-#define futex_hashsize (__futex_data.hashsize)
+#define futex_hashmask (__futex_data.hashmask)
/*
@@ -119,7 +119,7 @@ struct futex_hash_bucket *futex_hash(union futex_key *key)
u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
key->both.offset);
- return &futex_queues[hash & (futex_hashsize - 1)];
+ return &futex_queues[hash & futex_hashmask];
}
@@ -1127,27 +1127,28 @@ void futex_exit_release(struct task_struct *tsk)
static int __init futex_init(void)
{
+ unsigned long hashsize, i;
unsigned int futex_shift;
- unsigned long i;
#ifdef CONFIG_BASE_SMALL
- futex_hashsize = 16;
+ hashsize = 16;
#else
- futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
+ hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif
futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
- futex_hashsize, 0, 0,
+ hashsize, 0, 0,
&futex_shift, NULL,
- futex_hashsize, futex_hashsize);
- futex_hashsize = 1UL << futex_shift;
+ hashsize, hashsize);
+ hashsize = 1UL << futex_shift;
- for (i = 0; i < futex_hashsize; i++) {
+ for (i = 0; i < hashsize; i++) {
atomic_set(&futex_queues[i].waiters, 0);
plist_head_init(&futex_queues[i].chain);
spin_lock_init(&futex_queues[i].lock);
}
+ futex_hashmask = hashsize - 1;
return 0;
}
core_initcall(futex_init);
diff --git a/kernel/iomem.c b/kernel/iomem.c
index dc2120776e1c..75e61c1c6bc0 100644
--- a/kernel/iomem.c
+++ b/kernel/iomem.c
@@ -6,7 +6,8 @@
#include <linux/ioremap.h>
#ifndef arch_memremap_wb
-static void *arch_memremap_wb(resource_size_t offset, unsigned long size)
+static void *arch_memremap_wb(resource_size_t offset, unsigned long size,
+ unsigned long flags)
{
#ifdef ioremap_cache
return (__force void *)ioremap_cache(offset, size);
@@ -91,7 +92,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
if (is_ram == REGION_INTERSECTS)
addr = try_ram_remap(offset, size, flags);
if (!addr)
- addr = arch_memremap_wb(offset, size);
+ addr = arch_memremap_wb(offset, size, flags);
}
/*
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c901436ebd9f..0ff987d3a799 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -232,6 +232,21 @@ __irq_startup_managed(struct irq_desc *desc, const struct cpumask *aff,
}
#endif
+static void irq_enable(struct irq_desc *desc)
+{
+ if (!irqd_irq_disabled(&desc->irq_data)) {
+ unmask_irq(desc);
+ } else {
+ irq_state_clr_disabled(desc);
+ if (desc->irq_data.chip->irq_enable) {
+ desc->irq_data.chip->irq_enable(&desc->irq_data);
+ irq_state_clr_masked(desc);
+ } else {
+ unmask_irq(desc);
+ }
+ }
+}
+
static int __irq_startup(struct irq_desc *desc)
{
struct irq_data *d = irq_desc_get_irq_data(desc);
@@ -332,21 +347,6 @@ void irq_shutdown_and_deactivate(struct irq_desc *desc)
irq_domain_deactivate_irq(&desc->irq_data);
}
-void irq_enable(struct irq_desc *desc)
-{
- if (!irqd_irq_disabled(&desc->irq_data)) {
- unmask_irq(desc);
- } else {
- irq_state_clr_disabled(desc);
- if (desc->irq_data.chip->irq_enable) {
- desc->irq_data.chip->irq_enable(&desc->irq_data);
- irq_state_clr_masked(desc);
- } else {
- unmask_irq(desc);
- }
- }
-}
-
static void __irq_disable(struct irq_desc *desc, bool mask)
{
if (irqd_irq_disabled(&desc->irq_data)) {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a979523640d0..b0290849c395 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -90,7 +90,6 @@ extern int irq_startup(struct irq_desc *desc, bool resend, bool force);
extern void irq_shutdown(struct irq_desc *desc);
extern void irq_shutdown_and_deactivate(struct irq_desc *desc);
-extern void irq_enable(struct irq_desc *desc);
extern void irq_disable(struct irq_desc *desc);
extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
@@ -98,18 +97,12 @@ extern void mask_irq(struct irq_desc *desc);
extern void unmask_irq(struct irq_desc *desc);
extern void unmask_threaded_irq(struct irq_desc *desc);
-extern unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask);
-
#ifdef CONFIG_SPARSE_IRQ
static inline void irq_mark_irq(unsigned int irq) { }
#else
extern void irq_mark_irq(unsigned int irq);
#endif
-extern int __irq_get_irqchip_state(struct irq_data *data,
- enum irqchip_irq_state which,
- bool *state);
-
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);
@@ -139,8 +132,6 @@ static inline void unregister_handler_proc(unsigned int irq,
extern bool irq_can_set_affinity_usr(unsigned int irq);
-extern void irq_set_thread_affinity(struct irq_desc *desc);
-
extern int irq_do_set_affinity(struct irq_data *data,
const struct cpumask *dest, bool force);
@@ -442,6 +433,7 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
return desc->pending_mask;
}
bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
+void irq_force_complete_move(struct irq_desc *desc);
#else /* CONFIG_GENERIC_PENDING_IRQ */
static inline bool irq_can_move_pcntxt(struct irq_data *data)
{
@@ -467,6 +459,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
{
return false;
}
+static inline void irq_force_complete_move(struct irq_desc *desc) { }
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 287830739783..4258cd6bd3b4 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -991,7 +991,7 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0;
}
-unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask)
+static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask)
{
unsigned int sum = 0;
int cpu;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index ec6d8e72d980..2861f89880af 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1589,9 +1589,8 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain,
}
}
-int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
- unsigned int irq_base,
- unsigned int nr_irqs, void *arg)
+static int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base,
+ unsigned int nr_irqs, void *arg)
{
if (!domain->ops->alloc) {
pr_debug("domain->ops->alloc() is NULL\n");
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f300bb6be3bd..753eef8e041c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -35,6 +35,8 @@ static int __init setup_forced_irqthreads(char *arg)
early_param("threadirqs", setup_forced_irqthreads);
#endif
+static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state);
+
static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
{
struct irq_data *irqd = irq_desc_get_irq_data(desc);
@@ -187,7 +189,7 @@ bool irq_can_set_affinity_usr(unsigned int irq)
* set_cpus_allowed_ptr() here as we hold desc->lock and this
* code can be called from hard interrupt context.
*/
-void irq_set_thread_affinity(struct irq_desc *desc)
+static void irq_set_thread_affinity(struct irq_desc *desc)
{
struct irqaction *action;
@@ -2789,8 +2791,7 @@ out:
irq_put_desc_unlock(desc, flags);
}
-int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which,
- bool *state)
+static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state)
{
struct irq_chip *chip;
int err = -EINVAL;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index eb150afd671f..147cabb4c077 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -35,6 +35,16 @@ bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear)
return true;
}
+void irq_force_complete_move(struct irq_desc *desc)
+{
+ for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = d->parent_data) {
+ if (d->chip && d->chip->irq_force_complete_move) {
+ d->chip->irq_force_complete_move(d);
+ return;
+ }
+ }
+}
+
void irq_move_masked_irq(struct irq_data *idata)
{
struct irq_desc *desc = irq_data_to_desc(idata);
@@ -117,3 +127,13 @@ void __irq_move_irq(struct irq_data *idata)
if (!masked)
idata->chip->irq_unmask(idata);
}
+
+bool irq_can_move_in_process_context(struct irq_data *data)
+{
+ /*
+ * Get the top level irq_data in the hierarchy, which is optimized
+ * away when CONFIG_IRQ_DOMAIN_HIERARCHY is disabled.
+ */
+ data = irq_desc_get_irq_data(irq_data_to_desc(data));
+ return irq_can_move_pcntxt(data);
+}
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 396a067a8a56..1951a08f0421 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -15,6 +15,7 @@
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/slab.h>
+#include <linux/seq_file.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/xarray.h>
@@ -269,16 +270,11 @@ fail:
return ret;
}
-void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
-{
- *msg = entry->msg;
-}
-
void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
{
struct msi_desc *entry = irq_get_msi_desc(irq);
- __get_cached_msi_msg(entry, msg);
+ *msg = entry->msg;
}
EXPORT_SYMBOL_GPL(get_cached_msi_msg);
@@ -342,26 +338,30 @@ int msi_setup_device_data(struct device *dev)
}
/**
- * msi_lock_descs - Lock the MSI descriptor storage of a device
+ * __msi_lock_descs - Lock the MSI descriptor storage of a device
* @dev: Device to operate on
+ *
+ * Internal function for guard(msi_descs_lock). Don't use in code.
*/
-void msi_lock_descs(struct device *dev)
+void __msi_lock_descs(struct device *dev)
{
mutex_lock(&dev->msi.data->mutex);
}
-EXPORT_SYMBOL_GPL(msi_lock_descs);
+EXPORT_SYMBOL_GPL(__msi_lock_descs);
/**
- * msi_unlock_descs - Unlock the MSI descriptor storage of a device
+ * __msi_unlock_descs - Unlock the MSI descriptor storage of a device
* @dev: Device to operate on
+ *
+ * Internal function for guard(msi_descs_lock). Don't use in code.
*/
-void msi_unlock_descs(struct device *dev)
+void __msi_unlock_descs(struct device *dev)
{
/* Invalidate the index which was cached by the iterator */
dev->msi.data->__iter_idx = MSI_XA_MAX_INDEX;
mutex_unlock(&dev->msi.data->mutex);
}
-EXPORT_SYMBOL_GPL(msi_unlock_descs);
+EXPORT_SYMBOL_GPL(__msi_unlock_descs);
static struct msi_desc *msi_find_desc(struct msi_device_data *md, unsigned int domid,
enum msi_desc_filter filter)
@@ -447,7 +447,6 @@ EXPORT_SYMBOL_GPL(msi_next_desc);
unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigned int index)
{
struct msi_desc *desc;
- unsigned int ret = 0;
bool pcimsi = false;
struct xarray *xa;
@@ -461,7 +460,7 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne
if (dev_is_pci(dev) && domid == MSI_DEFAULT_DOMAIN)
pcimsi = to_pci_dev(dev)->msi_enabled;
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
xa = &dev->msi.data->__domains[domid].store;
desc = xa_load(xa, pcimsi ? 0 : index);
if (desc && desc->irq) {
@@ -470,16 +469,12 @@ unsigned int msi_domain_get_virq(struct device *dev, unsigned int domid, unsigne
* PCI-MSIX and platform MSI use a descriptor per
* interrupt.
*/
- if (pcimsi) {
- if (index < desc->nvec_used)
- ret = desc->irq + index;
- } else {
- ret = desc->irq;
- }
+ if (!pcimsi)
+ return desc->irq;
+ if (index < desc->nvec_used)
+ return desc->irq + index;
}
-
- msi_unlock_descs(dev);
- return ret;
+ return 0;
}
EXPORT_SYMBOL_GPL(msi_domain_get_virq);
@@ -756,12 +751,30 @@ static int msi_domain_translate(struct irq_domain *domain, struct irq_fwspec *fw
return info->ops->msi_translate(domain, fwspec, hwirq, type);
}
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+static void msi_domain_debug_show(struct seq_file *m, struct irq_domain *d,
+ struct irq_data *irqd, int ind)
+{
+ struct msi_desc *desc = irq_data_get_msi_desc(irqd);
+
+ if (!desc)
+ return;
+
+ seq_printf(m, "\n%*saddress_hi: 0x%08x", ind + 1, "", desc->msg.address_hi);
+ seq_printf(m, "\n%*saddress_lo: 0x%08x", ind + 1, "", desc->msg.address_lo);
+ seq_printf(m, "\n%*smsg_data: 0x%08x\n", ind + 1, "", desc->msg.data);
+}
+#endif
+
static const struct irq_domain_ops msi_domain_ops = {
.alloc = msi_domain_alloc,
.free = msi_domain_free,
.activate = msi_domain_activate,
.deactivate = msi_domain_deactivate,
.translate = msi_domain_translate,
+#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
+ .debug_show = msi_domain_debug_show,
+#endif
};
static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
@@ -979,9 +992,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
void *chip_data)
{
struct irq_domain *domain, *parent = dev->msi.domain;
- struct fwnode_handle *fwnode, *fwnalloced = NULL;
- struct msi_domain_template *bundle;
const struct msi_parent_ops *pops;
+ struct fwnode_handle *fwnode;
if (!irq_domain_is_msi_parent(parent))
return false;
@@ -989,7 +1001,8 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
if (domid >= MSI_MAX_DEVICE_IRQDOMAINS)
return false;
- bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL);
+ struct msi_domain_template *bundle __free(kfree) =
+ bundle = kmemdup(template, sizeof(*bundle), GFP_KERNEL);
if (!bundle)
return false;
@@ -1012,41 +1025,36 @@ bool msi_create_device_irq_domain(struct device *dev, unsigned int domid,
* node as they are not guaranteed to have a fwnode. They are never
* looked up and always handled in the context of the device.
*/
- if (bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE)
- fwnode = dev->fwnode;
+ struct fwnode_handle *fwnode_alloced __free(irq_domain_free_fwnode) = NULL;
+
+ if (!(bundle->info.flags & MSI_FLAG_USE_DEV_FWNODE))
+ fwnode = fwnode_alloced = irq_domain_alloc_named_fwnode(bundle->name);
else
- fwnode = fwnalloced = irq_domain_alloc_named_fwnode(bundle->name);
+ fwnode = dev->fwnode;
if (!fwnode)
- goto free_bundle;
+ return false;
if (msi_setup_device_data(dev))
- goto free_fwnode;
-
- msi_lock_descs(dev);
+ return false;
+ guard(msi_descs_lock)(dev);
if (WARN_ON_ONCE(msi_get_device_domain(dev, domid)))
- goto fail;
+ return false;
if (!pops->init_dev_msi_info(dev, parent, parent, &bundle->info))
- goto fail;
+ return false;
domain = __msi_create_irq_domain(fwnode, &bundle->info, IRQ_DOMAIN_FLAG_MSI_DEVICE, parent);
if (!domain)
- goto fail;
+ return false;
+ /* @bundle and @fwnode_alloced are now in use. Prevent cleanup */
+ retain_ptr(bundle);
+ retain_ptr(fwnode_alloced);
domain->dev = dev;
dev->msi.data->__domains[domid].domain = domain;
- msi_unlock_descs(dev);
return true;
-
-fail:
- msi_unlock_descs(dev);
-free_fwnode:
- irq_domain_free_fwnode(fwnalloced);
-free_bundle:
- kfree(bundle);
- return false;
}
/**
@@ -1060,12 +1068,10 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
struct msi_domain_info *info;
struct irq_domain *domain;
- msi_lock_descs(dev);
-
+ guard(msi_descs_lock)(dev);
domain = msi_get_device_domain(dev, domid);
-
if (!domain || !irq_domain_is_msi_device(domain))
- goto unlock;
+ return;
dev->msi.data->__domains[domid].domain = NULL;
info = domain->host_data;
@@ -1074,9 +1080,6 @@ void msi_remove_device_irq_domain(struct device *dev, unsigned int domid)
irq_domain_remove(domain);
irq_domain_free_fwnode(fwnode);
kfree(container_of(info, struct msi_domain_template, info));
-
-unlock:
- msi_unlock_descs(dev);
}
/**
@@ -1092,16 +1095,14 @@ bool msi_match_device_irq_domain(struct device *dev, unsigned int domid,
{
struct msi_domain_info *info;
struct irq_domain *domain;
- bool ret = false;
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
domain = msi_get_device_domain(dev, domid);
if (domain && irq_domain_is_msi_device(domain)) {
info = domain->host_data;
- ret = info->bus_token == bus_token;
+ return info->bus_token == bus_token;
}
- msi_unlock_descs(dev);
- return ret;
+ return false;
}
static int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
@@ -1143,7 +1144,7 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
return false;
- if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask)
+ if (info->flags & MSI_FLAG_NO_MASK)
return false;
/*
@@ -1333,21 +1334,17 @@ static int msi_domain_alloc_locked(struct device *dev, struct msi_ctrl *ctrl)
}
/**
- * msi_domain_alloc_irqs_range_locked - Allocate interrupts from a MSI interrupt domain
+ * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain
* @dev: Pointer to device struct of the device for which the interrupts
* are allocated
* @domid: Id of the interrupt domain to operate on
* @first: First index to allocate (inclusive)
* @last: Last index to allocate (inclusive)
*
- * Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
- * pair. Use this for MSI irqdomains which implement their own descriptor
- * allocation/free.
- *
* Return: %0 on success or an error code.
*/
-int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid,
- unsigned int first, unsigned int last)
+int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
{
struct msi_ctrl ctrl = {
.domid = domid,
@@ -1356,29 +1353,9 @@ int msi_domain_alloc_irqs_range_locked(struct device *dev, unsigned int domid,
.nirqs = last + 1 - first,
};
+ guard(msi_descs_lock)(dev);
return msi_domain_alloc_locked(dev, &ctrl);
}
-
-/**
- * msi_domain_alloc_irqs_range - Allocate interrupts from a MSI interrupt domain
- * @dev: Pointer to device struct of the device for which the interrupts
- * are allocated
- * @domid: Id of the interrupt domain to operate on
- * @first: First index to allocate (inclusive)
- * @last: Last index to allocate (inclusive)
- *
- * Return: %0 on success or an error code.
- */
-int msi_domain_alloc_irqs_range(struct device *dev, unsigned int domid,
- unsigned int first, unsigned int last)
-{
- int ret;
-
- msi_lock_descs(dev);
- ret = msi_domain_alloc_irqs_range_locked(dev, domid, first, last);
- msi_unlock_descs(dev);
- return ret;
-}
EXPORT_SYMBOL_GPL(msi_domain_alloc_irqs_range);
/**
@@ -1481,12 +1458,8 @@ struct msi_map msi_domain_alloc_irq_at(struct device *dev, unsigned int domid, u
const struct irq_affinity_desc *affdesc,
union msi_instance_cookie *icookie)
{
- struct msi_map map;
-
- msi_lock_descs(dev);
- map = __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie);
- msi_unlock_descs(dev);
- return map;
+ guard(msi_descs_lock)(dev);
+ return __msi_domain_alloc_irq_at(dev, domid, index, affdesc, icookie);
}
/**
@@ -1523,13 +1496,11 @@ int msi_device_domain_alloc_wired(struct irq_domain *domain, unsigned int hwirq,
icookie.value = ((u64)type << 32) | hwirq;
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
if (WARN_ON_ONCE(msi_get_device_domain(dev, domid) != domain))
map.index = -EINVAL;
else
map = __msi_domain_alloc_irq_at(dev, domid, MSI_ANY_INDEX, NULL, &icookie);
- msi_unlock_descs(dev);
-
return map.index >= 0 ? map.virq : map.index;
}
@@ -1599,8 +1570,8 @@ static void msi_domain_free_locked(struct device *dev, struct msi_ctrl *ctrl)
* @first: First index to free (inclusive)
* @last: Last index to free (inclusive)
*/
-void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
- unsigned int first, unsigned int last)
+static void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
+ unsigned int first, unsigned int last)
{
struct msi_ctrl ctrl = {
.domid = domid,
@@ -1622,9 +1593,8 @@ void msi_domain_free_irqs_range_locked(struct device *dev, unsigned int domid,
void msi_domain_free_irqs_range(struct device *dev, unsigned int domid,
unsigned int first, unsigned int last)
{
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
msi_domain_free_irqs_range_locked(dev, domid, first, last);
- msi_unlock_descs(dev);
}
EXPORT_SYMBOL_GPL(msi_domain_free_irqs_all);
@@ -1654,9 +1624,8 @@ void msi_domain_free_irqs_all_locked(struct device *dev, unsigned int domid)
*/
void msi_domain_free_irqs_all(struct device *dev, unsigned int domid)
{
- msi_lock_descs(dev);
+ guard(msi_descs_lock)(dev);
msi_domain_free_irqs_all_locked(dev, domid);
- msi_unlock_descs(dev);
}
/**
@@ -1675,12 +1644,11 @@ void msi_device_domain_free_wired(struct irq_domain *domain, unsigned int virq)
if (WARN_ON_ONCE(!dev || !desc || domain->bus_token != DOMAIN_BUS_WIRED_TO_MSI))
return;
- msi_lock_descs(dev);
- if (!WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain)) {
- msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index,
- desc->msi_index);
- }
- msi_unlock_descs(dev);
+ guard(msi_descs_lock)(dev);
+ if (WARN_ON_ONCE(msi_get_device_domain(dev, MSI_DEFAULT_DOMAIN) != domain))
+ return;
+ msi_domain_free_irqs_range_locked(dev, MSI_DEFAULT_DOMAIN, desc->msi_index,
+ desc->msi_index);
}
/**
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index a9a0ca605d4a..4198f30aac3c 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -148,16 +148,8 @@ static unsigned int get_symbol_offset(unsigned long pos)
unsigned long kallsyms_sym_address(int idx)
{
- /* values are unsigned offsets if --absolute-percpu is not in effect */
- if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU))
- return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
-
- /* ...otherwise, positive offsets are absolute values */
- if (kallsyms_offsets[idx] >= 0)
- return kallsyms_offsets[idx];
-
- /* ...and negative offsets are relative to kallsyms_relative_base - 1 */
- return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
+ /* values are unsigned offsets */
+ return kallsyms_relative_base + (u32)kallsyms_offsets[idx];
}
static unsigned int get_symbol_seq(int index)
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 2c596851f8a9..7c1a65bd5f8d 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -145,7 +145,7 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
*/
task1 = find_task_by_vpid(pid1);
task2 = find_task_by_vpid(pid2);
- if (!task1 || !task2)
+ if (unlikely(!task1 || !task2))
goto err_no_task;
get_task_struct(task1);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 0db4093d17b8..a114949eeed5 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -5,7 +5,8 @@ KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
-# Avoid recursion lockdep -> sanitizer -> ... -> lockdep.
+# Avoid recursion lockdep -> sanitizer -> ... -> lockdep & improve performance.
+KASAN_SANITIZE_lockdep.o := n
KCSAN_SANITIZE_lockdep.o := n
ifdef CONFIG_FUNCTION_TRACER
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h
index 97fb6f3f840a..9ef9850aeebe 100644
--- a/kernel/locking/lock_events_list.h
+++ b/kernel/locking/lock_events_list.h
@@ -67,3 +67,31 @@ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */
LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */
LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */
LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */
+
+/*
+ * Locking events for rtlock_slowlock()
+ */
+LOCK_EVENT(rtlock_slowlock) /* # of rtlock_slowlock() calls */
+LOCK_EVENT(rtlock_slow_acq1) /* # of locks acquired after wait_lock */
+LOCK_EVENT(rtlock_slow_acq2) /* # of locks acquired in for loop */
+LOCK_EVENT(rtlock_slow_sleep) /* # of sleeps */
+LOCK_EVENT(rtlock_slow_wake) /* # of wakeup's */
+
+/*
+ * Locking events for rt_mutex_slowlock()
+ */
+LOCK_EVENT(rtmutex_slowlock) /* # of rt_mutex_slowlock() calls */
+LOCK_EVENT(rtmutex_slow_block) /* # of rt_mutex_slowlock_block() calls */
+LOCK_EVENT(rtmutex_slow_acq1) /* # of locks acquired after wait_lock */
+LOCK_EVENT(rtmutex_slow_acq2) /* # of locks acquired at the end */
+LOCK_EVENT(rtmutex_slow_acq3) /* # of locks acquired in *block() */
+LOCK_EVENT(rtmutex_slow_sleep) /* # of sleeps */
+LOCK_EVENT(rtmutex_slow_wake) /* # of wakeup's */
+LOCK_EVENT(rtmutex_deadlock) /* # of rt_mutex_handle_deadlock()'s */
+
+/*
+ * Locking events for lockdep
+ */
+LOCK_EVENT(lockdep_acquire)
+LOCK_EVENT(lockdep_lock)
+LOCK_EVENT(lockdep_nocheck)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4470680f0226..b15757e63626 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -57,10 +57,12 @@
#include <linux/lockdep.h>
#include <linux/context_tracking.h>
#include <linux/console.h>
+#include <linux/kasan.h>
#include <asm/sections.h>
#include "lockdep_internals.h"
+#include "lock_events.h"
#include <trace/events/lock.h>
@@ -170,6 +172,7 @@ static struct task_struct *lockdep_selftest_task_struct;
static int graph_lock(void)
{
lockdep_lock();
+ lockevent_inc(lockdep_lock);
/*
* Make sure that if another CPU detected a bug while
* walking the graph we dont change it (while the other
@@ -5091,8 +5094,12 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (unlikely(lock->key == &__lockdep_no_track__))
return 0;
- if (!prove_locking || lock->key == &__lockdep_no_validate__)
+ lockevent_inc(lockdep_acquire);
+
+ if (!prove_locking || lock->key == &__lockdep_no_validate__) {
check = 0;
+ lockevent_inc(lockdep_nocheck);
+ }
if (subclass < NR_LOCKDEP_CACHING_CLASSES)
class = lock->class_cache[subclass];
@@ -5824,6 +5831,14 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (!debug_locks)
return;
+ /*
+ * As KASAN instrumentation is disabled and lock_acquire() is usually
+ * the first lockdep call when a task tries to acquire a lock, add
+ * kasan_check_byte() here to check for use-after-free and other
+ * memory errors.
+ */
+ kasan_check_byte(lock);
+
if (unlikely(!lockdep_enabled())) {
/* XXX allow trylock from NMI ?!? */
if (lockdep_nmi() && !trylock) {
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index b36f23de48f1..19b636f60a24 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -143,6 +143,8 @@ static __always_inline bool __mutex_trylock_fast(struct mutex *lock)
unsigned long curr = (unsigned long)current;
unsigned long zero = 0UL;
+ MUTEX_WARN_ON(lock->magic != lock);
+
if (atomic_long_try_cmpxchg_acquire(&lock->owner, &zero, curr))
return true;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 4a8df1800cbb..c80902eacd79 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -27,6 +27,7 @@
#include <trace/events/lock.h>
#include "rtmutex_common.h"
+#include "lock_events.h"
#ifndef WW_RT
# define build_ww_mutex() (false)
@@ -1612,10 +1613,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
struct task_struct *owner;
int ret = 0;
+ lockevent_inc(rtmutex_slow_block);
for (;;) {
/* Try to acquire the lock: */
- if (try_to_take_rt_mutex(lock, current, waiter))
+ if (try_to_take_rt_mutex(lock, current, waiter)) {
+ lockevent_inc(rtmutex_slow_acq3);
break;
+ }
if (timeout && !timeout->task) {
ret = -ETIMEDOUT;
@@ -1638,8 +1642,10 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
owner = NULL;
raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
- if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
+ if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) {
+ lockevent_inc(rtmutex_slow_sleep);
rt_mutex_schedule();
+ }
raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
@@ -1694,6 +1700,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
int ret;
lockdep_assert_held(&lock->wait_lock);
+ lockevent_inc(rtmutex_slowlock);
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
@@ -1701,6 +1708,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
__ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
ww_mutex_lock_acquired(ww, ww_ctx);
}
+ lockevent_inc(rtmutex_slow_acq1);
return 0;
}
@@ -1719,10 +1727,12 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
__ww_mutex_check_waiters(rtm, ww_ctx, wake_q);
ww_mutex_lock_acquired(ww, ww_ctx);
}
+ lockevent_inc(rtmutex_slow_acq2);
} else {
__set_current_state(TASK_RUNNING);
remove_waiter(lock, waiter);
rt_mutex_handle_deadlock(ret, chwalk, lock, waiter);
+ lockevent_inc(rtmutex_deadlock);
}
/*
@@ -1751,6 +1761,7 @@ static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock,
&waiter, wake_q);
debug_rt_mutex_free_waiter(&waiter);
+ lockevent_cond_inc(rtmutex_slow_wake, !wake_q_empty(wake_q));
return ret;
}
@@ -1823,9 +1834,12 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
struct task_struct *owner;
lockdep_assert_held(&lock->wait_lock);
+ lockevent_inc(rtlock_slowlock);
- if (try_to_take_rt_mutex(lock, current, NULL))
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
+ lockevent_inc(rtlock_slow_acq1);
return;
+ }
rt_mutex_init_rtlock_waiter(&waiter);
@@ -1838,8 +1852,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
for (;;) {
/* Try to acquire the lock again */
- if (try_to_take_rt_mutex(lock, current, &waiter))
+ if (try_to_take_rt_mutex(lock, current, &waiter)) {
+ lockevent_inc(rtlock_slow_acq2);
break;
+ }
if (&waiter == rt_mutex_top_waiter(lock))
owner = rt_mutex_owner(lock);
@@ -1847,8 +1863,10 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
owner = NULL;
raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
- if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
+ if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) {
+ lockevent_inc(rtlock_slow_sleep);
schedule_rtlock();
+ }
raw_spin_lock_irq(&lock->wait_lock);
set_current_state(TASK_RTLOCK_WAIT);
@@ -1865,6 +1883,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
debug_rt_mutex_free_waiter(&waiter);
trace_contention_end(lock, 0);
+ lockevent_cond_inc(rtlock_slow_wake, !wake_q_empty(wake_q));
}
static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 1fb9ad289a6f..a256cc919ad7 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1221,18 +1221,6 @@ void __weak module_arch_freeing_init(struct module *mod)
{
}
-void *__module_writable_address(struct module *mod, void *loc)
-{
- for_class_mod_mem_type(type, text) {
- struct module_memory *mem = &mod->mem[type];
-
- if (loc >= mem->base && loc < mem->base + mem->size)
- return loc + (mem->rw_copy - mem->base);
- }
-
- return loc;
-}
-
static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
{
unsigned int size = PAGE_ALIGN(mod->mem[type].size);
@@ -1250,21 +1238,15 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
if (!ptr)
return -ENOMEM;
- mod->mem[type].base = ptr;
-
if (execmem_is_rox(execmem_type)) {
- ptr = vzalloc(size);
+ int err = execmem_make_temp_rw(ptr, size);
- if (!ptr) {
- execmem_free(mod->mem[type].base);
+ if (err) {
+ execmem_free(ptr);
return -ENOMEM;
}
- mod->mem[type].rw_copy = ptr;
mod->mem[type].is_rox = true;
- } else {
- mod->mem[type].rw_copy = mod->mem[type].base;
- memset(mod->mem[type].base, 0, size);
}
/*
@@ -1278,18 +1260,29 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type)
* *do* eventually get freed, but let's just keep things simple
* and avoid *any* false positives.
*/
- kmemleak_not_leak(ptr);
+ if (!mod->mem[type].is_rox)
+ kmemleak_not_leak(ptr);
+
+ memset(ptr, 0, size);
+ mod->mem[type].base = ptr;
return 0;
}
+static void module_memory_restore_rox(struct module *mod)
+{
+ for_class_mod_mem_type(type, text) {
+ struct module_memory *mem = &mod->mem[type];
+
+ if (mem->is_rox)
+ execmem_restore_rox(mem->base, mem->size);
+ }
+}
+
static void module_memory_free(struct module *mod, enum mod_mem_type type)
{
struct module_memory *mem = &mod->mem[type];
- if (mem->is_rox)
- vfree(mem->rw_copy);
-
execmem_free(mem->base);
}
@@ -2642,7 +2635,6 @@ static int move_module(struct module *mod, struct load_info *info)
for_each_mod_mem_type(type) {
if (!mod->mem[type].size) {
mod->mem[type].base = NULL;
- mod->mem[type].rw_copy = NULL;
continue;
}
@@ -2659,7 +2651,6 @@ static int move_module(struct module *mod, struct load_info *info)
void *dest;
Elf_Shdr *shdr = &info->sechdrs[i];
const char *sname;
- unsigned long addr;
if (!(shdr->sh_flags & SHF_ALLOC))
continue;
@@ -2680,14 +2671,12 @@ static int move_module(struct module *mod, struct load_info *info)
ret = PTR_ERR(dest);
goto out_err;
}
- addr = (unsigned long)dest;
codetag_section_found = true;
} else {
enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT;
unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK;
- addr = (unsigned long)mod->mem[type].base + offset;
- dest = mod->mem[type].rw_copy + offset;
+ dest = mod->mem[type].base + offset;
}
if (shdr->sh_type != SHT_NOBITS) {
@@ -2710,13 +2699,14 @@ static int move_module(struct module *mod, struct load_info *info)
* users of info can keep taking advantage and using the newly
* minted official memory area.
*/
- shdr->sh_addr = addr;
+ shdr->sh_addr = (unsigned long)dest;
pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr,
(long)shdr->sh_size, info->secstrings + shdr->sh_name);
}
return 0;
out_err:
+ module_memory_restore_rox(mod);
for (t--; t >= 0; t--)
module_memory_free(mod, t);
if (codetag_section_found)
@@ -2863,17 +2853,8 @@ int __weak module_finalize(const Elf_Ehdr *hdr,
return 0;
}
-int __weak module_post_finalize(const Elf_Ehdr *hdr,
- const Elf_Shdr *sechdrs,
- struct module *me)
-{
- return 0;
-}
-
static int post_relocation(struct module *mod, const struct load_info *info)
{
- int ret;
-
/* Sort exception table now relocations are done. */
sort_extable(mod->extable, mod->extable + mod->num_exentries);
@@ -2885,24 +2866,7 @@ static int post_relocation(struct module *mod, const struct load_info *info)
add_kallsyms(mod, info);
/* Arch-specific module finalizing. */
- ret = module_finalize(info->hdr, info->sechdrs, mod);
- if (ret)
- return ret;
-
- for_each_mod_mem_type(type) {
- struct module_memory *mem = &mod->mem[type];
-
- if (mem->is_rox) {
- if (!execmem_update_copy(mem->base, mem->rw_copy,
- mem->size))
- return -ENOMEM;
-
- vfree(mem->rw_copy);
- mem->rw_copy = NULL;
- }
- }
-
- return module_post_finalize(info->hdr, info->sechdrs, mod);
+ return module_finalize(info->hdr, info->sechdrs, mod);
}
/* Call module constructors. */
@@ -3499,6 +3463,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
mod->mem[type].size);
}
+ module_memory_restore_rox(mod);
module_deallocate(mod, info);
free_copy:
/*
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index 74834ba15615..03f4142cfbf4 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
+#include <linux/execmem.h>
#include "internal.h"
static int module_set_memory(const struct module *mod, enum mod_mem_type type,
@@ -32,12 +33,12 @@ static int module_set_memory(const struct module *mod, enum mod_mem_type type,
int module_enable_text_rox(const struct module *mod)
{
for_class_mod_mem_type(type, text) {
+ const struct module_memory *mem = &mod->mem[type];
int ret;
- if (mod->mem[type].is_rox)
- continue;
-
- if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+ if (mem->is_rox)
+ ret = execmem_restore_rox(mem->base, mem->size);
+ else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
ret = module_set_memory(mod, type, set_memory_rox);
else
ret = module_set_memory(mod, type, set_memory_x);
diff --git a/kernel/padata.c b/kernel/padata.c
index 418987056340..b3d4eacc4f5d 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -290,7 +290,7 @@ static struct padata_priv *padata_find_next(struct parallel_data *pd,
if (remove_object) {
list_del_init(&padata->list);
++pd->processed;
- pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1, false);
+ pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu);
}
spin_unlock(&reorder->lock);
diff --git a/kernel/pid.c b/kernel/pid.c
index 924084713be8..4ac2ce46817f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -88,20 +88,6 @@ struct pid_namespace init_pid_ns = {
};
EXPORT_SYMBOL_GPL(init_pid_ns);
-/*
- * Note: disable interrupts while the pidmap_lock is held as an
- * interrupt might come in and do read_lock(&tasklist_lock).
- *
- * If we don't disable interrupts there is a nasty deadlock between
- * detach_pid()->free_pid() and another cpu that does
- * spin_lock(&pidmap_lock) followed by an interrupt routine that does
- * read_lock(&tasklist_lock);
- *
- * After we clean up the tasklist_lock and know there are no
- * irq handlers that take it we can leave the interrupts enabled.
- * For now it is easier to be safe than to prove it can't happen.
- */
-
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
@@ -128,11 +114,11 @@ static void delayed_put_pid(struct rcu_head *rhp)
void free_pid(struct pid *pid)
{
- /* We can be called with write_lock_irq(&tasklist_lock) held */
int i;
- unsigned long flags;
- spin_lock_irqsave(&pidmap_lock, flags);
+ lockdep_assert_not_held(&tasklist_lock);
+
+ spin_lock(&pidmap_lock);
for (i = 0; i <= pid->level; i++) {
struct upid *upid = pid->numbers + i;
struct pid_namespace *ns = upid->ns;
@@ -155,11 +141,23 @@ void free_pid(struct pid *pid)
idr_remove(&ns->idr, upid->nr);
}
pidfs_remove_pid(pid);
- spin_unlock_irqrestore(&pidmap_lock, flags);
+ spin_unlock(&pidmap_lock);
call_rcu(&pid->rcu, delayed_put_pid);
}
+void free_pids(struct pid **pids)
+{
+ int tmp;
+
+ /*
+ * This can batch pidmap_lock.
+ */
+ for (tmp = PIDTYPE_MAX; --tmp >= 0; )
+ if (pids[tmp])
+ free_pid(pids[tmp]);
+}
+
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size)
{
@@ -211,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
}
idr_preload(GFP_KERNEL);
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
@@ -238,7 +236,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
}
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
idr_preload_end();
if (nr < 0) {
@@ -272,7 +270,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
upid = pid->numbers + ns->level;
idr_preload(GFP_KERNEL);
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
pidfs_add_pid(pid);
@@ -281,18 +279,18 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
}
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
idr_preload_end();
return pid;
out_unlock:
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
idr_preload_end();
put_pid_ns(ns);
out_free:
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
while (++i <= ns->level) {
upid = pid->numbers + i;
idr_remove(&upid->ns->idr, upid->nr);
@@ -302,7 +300,7 @@ out_free:
if (ns->pid_allocated == PIDNS_ADDING)
idr_set_cursor(&ns->idr, 0);
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
kmem_cache_free(ns->pid_cachep, pid);
return ERR_PTR(retval);
@@ -310,9 +308,9 @@ out_free:
void disable_pid_allocation(struct pid_namespace *ns)
{
- spin_lock_irq(&pidmap_lock);
+ spin_lock(&pidmap_lock);
ns->pid_allocated &= ~PIDNS_ADDING;
- spin_unlock_irq(&pidmap_lock);
+ spin_unlock(&pidmap_lock);
}
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
@@ -339,17 +337,23 @@ static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
*/
void attach_pid(struct task_struct *task, enum pid_type type)
{
- struct pid *pid = *task_pid_ptr(task, type);
+ struct pid *pid;
+
+ lockdep_assert_held_write(&tasklist_lock);
+
+ pid = *task_pid_ptr(task, type);
hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]);
}
-static void __change_pid(struct task_struct *task, enum pid_type type,
- struct pid *new)
+static void __change_pid(struct pid **pids, struct task_struct *task,
+ enum pid_type type, struct pid *new)
{
- struct pid **pid_ptr = task_pid_ptr(task, type);
- struct pid *pid;
+ struct pid **pid_ptr, *pid;
int tmp;
+ lockdep_assert_held_write(&tasklist_lock);
+
+ pid_ptr = task_pid_ptr(task, type);
pid = *pid_ptr;
hlist_del_rcu(&task->pid_links[type]);
@@ -364,18 +368,19 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
if (pid_has_task(pid, tmp))
return;
- free_pid(pid);
+ WARN_ON(pids[type]);
+ pids[type] = pid;
}
-void detach_pid(struct task_struct *task, enum pid_type type)
+void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
{
- __change_pid(task, type, NULL);
+ __change_pid(pids, task, type, NULL);
}
-void change_pid(struct task_struct *task, enum pid_type type,
+void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
struct pid *pid)
{
- __change_pid(task, type, pid);
+ __change_pid(pids, task, type, pid);
attach_pid(task, type);
}
@@ -386,6 +391,8 @@ void exchange_tids(struct task_struct *left, struct task_struct *right)
struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID];
struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID];
+ lockdep_assert_held_write(&tasklist_lock);
+
/* Swap the single entry tid lists */
hlists_swap_heads_rcu(head1, head2);
@@ -403,6 +410,7 @@ void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type type)
{
WARN_ON_ONCE(type == PIDTYPE_PID);
+ lockdep_assert_held_write(&tasklist_lock);
hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}
@@ -564,15 +572,29 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
*/
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
- unsigned int f_flags;
+ unsigned int f_flags = 0;
struct pid *pid;
struct task_struct *task;
+ enum pid_type type;
- pid = pidfd_get_pid(pidfd, &f_flags);
- if (IS_ERR(pid))
- return ERR_CAST(pid);
+ switch (pidfd) {
+ case PIDFD_SELF_THREAD:
+ type = PIDTYPE_PID;
+ pid = get_task_pid(current, type);
+ break;
+ case PIDFD_SELF_THREAD_GROUP:
+ type = PIDTYPE_TGID;
+ pid = get_task_pid(current, type);
+ break;
+ default:
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+ type = PIDTYPE_TGID;
+ break;
+ }
- task = get_pid_task(pid, PIDTYPE_TGID);
+ task = get_pid_task(pid, type);
put_pid(pid);
if (!task)
return ERR_PTR(-ESRCH);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ca947ed32e3d..54a623680019 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -380,8 +380,7 @@ config CPU_PM
config ENERGY_MODEL
bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)"
- depends on SMP
- depends on CPU_FREQ
+ depends on CPU_FREQ || PM_DEVFREQ
help
Several subsystems (thermal and/or the task scheduler for example)
can leverage information about the energy consumed by devices to
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 3874f0e97651..d9b7e2b38c7a 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -161,22 +161,10 @@ static void em_debug_create_pd(struct device *dev) {}
static void em_debug_remove_pd(struct device *dev) {}
#endif
-static void em_destroy_table_rcu(struct rcu_head *rp)
-{
- struct em_perf_table __rcu *table;
-
- table = container_of(rp, struct em_perf_table, rcu);
- kfree(table);
-}
-
static void em_release_table_kref(struct kref *kref)
{
- struct em_perf_table __rcu *table;
-
/* It was the last owner of this table so we can free */
- table = container_of(kref, struct em_perf_table, kref);
-
- call_rcu(&table->rcu, em_destroy_table_rcu);
+ kfree_rcu(container_of(kref, struct em_perf_table, kref), rcu);
}
/**
@@ -185,7 +173,7 @@ static void em_release_table_kref(struct kref *kref)
*
* No return values.
*/
-void em_table_free(struct em_perf_table __rcu *table)
+void em_table_free(struct em_perf_table *table)
{
kref_put(&table->kref, em_release_table_kref);
}
@@ -198,9 +186,9 @@ void em_table_free(struct em_perf_table __rcu *table)
* has a user.
* Returns allocated table or NULL.
*/
-struct em_perf_table __rcu *em_table_alloc(struct em_perf_domain *pd)
+struct em_perf_table *em_table_alloc(struct em_perf_domain *pd)
{
- struct em_perf_table __rcu *table;
+ struct em_perf_table *table;
int table_size;
table_size = sizeof(struct em_perf_state) * pd->nr_perf_states;
@@ -239,7 +227,7 @@ static void em_init_performance(struct device *dev, struct em_perf_domain *pd,
}
static int em_compute_costs(struct device *dev, struct em_perf_state *table,
- struct em_data_callback *cb, int nr_states,
+ const struct em_data_callback *cb, int nr_states,
unsigned long flags)
{
unsigned long prev_cost = ULONG_MAX;
@@ -308,9 +296,9 @@ int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
* Return 0 on success or an error code on failure.
*/
int em_dev_update_perf_domain(struct device *dev,
- struct em_perf_table __rcu *new_table)
+ struct em_perf_table *new_table)
{
- struct em_perf_table __rcu *old_table;
+ struct em_perf_table *old_table;
struct em_perf_domain *pd;
if (!dev)
@@ -327,7 +315,8 @@ int em_dev_update_perf_domain(struct device *dev,
kref_get(&new_table->kref);
- old_table = pd->em_table;
+ old_table = rcu_dereference_protected(pd->em_table,
+ lockdep_is_held(&em_pd_mutex));
rcu_assign_pointer(pd->em_table, new_table);
em_cpufreq_update_efficiencies(dev, new_table->state);
@@ -341,7 +330,7 @@ EXPORT_SYMBOL_GPL(em_dev_update_perf_domain);
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
struct em_perf_state *table,
- struct em_data_callback *cb,
+ const struct em_data_callback *cb,
unsigned long flags)
{
unsigned long power, freq, prev_freq = 0;
@@ -396,10 +385,11 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
}
static int em_create_pd(struct device *dev, int nr_states,
- struct em_data_callback *cb, cpumask_t *cpus,
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus,
unsigned long flags)
{
- struct em_perf_table __rcu *em_table;
+ struct em_perf_table *em_table;
struct em_perf_domain *pd;
struct device *cpu_dev;
int cpu, ret, num_cpus;
@@ -556,9 +546,10 @@ EXPORT_SYMBOL_GPL(em_cpu_get);
* Return 0 on success
*/
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
- struct em_data_callback *cb, cpumask_t *cpus,
- bool microwatts)
+ const struct em_data_callback *cb,
+ const cpumask_t *cpus, bool microwatts)
{
+ struct em_perf_table *em_table;
unsigned long cap, prev_cap = 0;
unsigned long flags = 0;
int cpu, ret;
@@ -631,7 +622,9 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
dev->em_pd->min_perf_state = 0;
dev->em_pd->max_perf_state = nr_states - 1;
- em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
+ em_table = rcu_dereference_protected(dev->em_pd->em_table,
+ lockdep_is_held(&em_pd_mutex));
+ em_cpufreq_update_efficiencies(dev, em_table->state);
em_debug_create_pd(dev);
dev_info(dev, "EM: created perf domain\n");
@@ -668,7 +661,8 @@ void em_dev_unregister_perf_domain(struct device *dev)
mutex_lock(&em_pd_mutex);
em_debug_remove_pd(dev);
- em_table_free(dev->em_pd->em_table);
+ em_table_free(rcu_dereference_protected(dev->em_pd->em_table,
+ lockdep_is_held(&em_pd_mutex)));
kfree(dev->em_pd);
dev->em_pd = NULL;
@@ -676,9 +670,9 @@ void em_dev_unregister_perf_domain(struct device *dev)
}
EXPORT_SYMBOL_GPL(em_dev_unregister_perf_domain);
-static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd)
+static struct em_perf_table *em_table_dup(struct em_perf_domain *pd)
{
- struct em_perf_table __rcu *em_table;
+ struct em_perf_table *em_table;
struct em_perf_state *ps, *new_ps;
int ps_size;
@@ -700,7 +694,7 @@ static struct em_perf_table __rcu *em_table_dup(struct em_perf_domain *pd)
}
static int em_recalc_and_update(struct device *dev, struct em_perf_domain *pd,
- struct em_perf_table __rcu *em_table)
+ struct em_perf_table *em_table)
{
int ret;
@@ -728,10 +722,9 @@ free_em_table:
* are correctly calculated.
*/
static void em_adjust_new_capacity(struct device *dev,
- struct em_perf_domain *pd,
- u64 max_cap)
+ struct em_perf_domain *pd)
{
- struct em_perf_table __rcu *em_table;
+ struct em_perf_table *em_table;
em_table = em_table_dup(pd);
if (!em_table) {
@@ -775,7 +768,8 @@ static void em_check_capacity_update(void)
}
cpufreq_cpu_put(policy);
- pd = em_cpu_get(cpu);
+ dev = get_cpu_device(cpu);
+ pd = em_pd_get(dev);
if (!pd || em_is_artificial(pd))
continue;
@@ -799,8 +793,7 @@ static void em_check_capacity_update(void)
pr_debug("updating cpu%d cpu_cap=%lu old capacity=%lu\n",
cpu, cpu_capacity, em_max_perf);
- dev = get_cpu_device(cpu);
- em_adjust_new_capacity(dev, pd, cpu_capacity);
+ em_adjust_new_capacity(dev, pd);
}
free_cpumask_var(cpu_done_mask);
@@ -822,7 +815,7 @@ static void em_update_workfn(struct work_struct *work)
*/
int em_dev_update_chip_binning(struct device *dev)
{
- struct em_perf_table __rcu *em_table;
+ struct em_perf_table *em_table;
struct em_perf_domain *pd;
int i, ret;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 10a01af63a80..b129ed1d25a8 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1446,10 +1446,10 @@ static const char * const comp_alg_enabled[] = {
static int hibernate_compressor_param_set(const char *compressor,
const struct kernel_param *kp)
{
- unsigned int sleep_flags;
int index, ret;
- sleep_flags = lock_system_sleep();
+ if (!mutex_trylock(&system_transition_mutex))
+ return -EBUSY;
index = sysfs_match_string(comp_alg_enabled, compressor);
if (index >= 0) {
@@ -1461,7 +1461,7 @@ static int hibernate_compressor_param_set(const char *compressor,
ret = index;
}
- unlock_system_sleep(sleep_flags);
+ mutex_unlock(&system_transition_mutex);
if (ret)
pr_debug("Cannot set specified compressor %s\n",
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index c9fb559a6399..4e6e24e8b854 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -2270,9 +2270,9 @@ int snapshot_read_next(struct snapshot_handle *handle)
*/
void *kaddr;
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
copy_page(buffer, kaddr);
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
handle->buffer = buffer;
} else {
handle->buffer = page_address(page);
@@ -2561,9 +2561,9 @@ static void copy_last_highmem_page(void)
if (last_highmem_page) {
void *dst;
- dst = kmap_atomic(last_highmem_page);
+ dst = kmap_local_page(last_highmem_page);
copy_page(dst, buffer);
- kunmap_atomic(dst);
+ kunmap_local(dst);
last_highmem_page = NULL;
}
}
@@ -2881,13 +2881,13 @@ static inline void swap_two_pages_data(struct page *p1, struct page *p2,
{
void *kaddr1, *kaddr2;
- kaddr1 = kmap_atomic(p1);
- kaddr2 = kmap_atomic(p2);
+ kaddr1 = kmap_local_page(p1);
+ kaddr2 = kmap_local_page(p2);
copy_page(buf, kaddr1);
copy_page(kaddr1, kaddr2);
copy_page(kaddr2, buf);
- kunmap_atomic(kaddr2);
- kunmap_atomic(kaddr1);
+ kunmap_local(kaddr2);
+ kunmap_local(kaddr1);
}
/**
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 09f8397bae15..6fae1e0a331c 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -91,6 +91,16 @@ static void s2idle_enter(void)
{
trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_TO_IDLE, true);
+ /*
+ * The correctness of the code below depends on the number of online
+ * CPUs being stable, but CPUs cannot be taken offline or put online
+ * while it is running.
+ *
+ * The s2idle_lock must be acquired before the pending wakeup check to
+ * prevent pm_system_wakeup() from running as a whole between that check
+ * and the subsequent s2idle_state update in which case a wakeup event
+ * would get lost.
+ */
raw_spin_lock_irq(&s2idle_lock);
if (pm_wakeup_pending())
goto out;
@@ -98,8 +108,6 @@ static void s2idle_enter(void)
s2idle_state = S2IDLE_STATE_ENTER;
raw_spin_unlock_irq(&s2idle_lock);
- cpus_read_lock();
-
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
/* Make the current CPU wait so it can enter the idle loop too. */
@@ -112,8 +120,6 @@ static void s2idle_enter(void)
*/
wake_up_all_idle_cpus();
- cpus_read_unlock();
-
raw_spin_lock_irq(&s2idle_lock);
out:
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 07668433644b..057db78876cd 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2461,7 +2461,6 @@ asmlinkage __visible int _printk(const char *fmt, ...)
}
EXPORT_SYMBOL(_printk);
-static bool pr_flush(int timeout_ms, bool reset_on_progress);
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);
#else /* CONFIG_PRINTK */
@@ -2474,7 +2473,6 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
static u64 syslog_seq;
-static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
#endif /* CONFIG_PRINTK */
@@ -4466,7 +4464,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
* Context: Process context. May sleep while acquiring console lock.
* Return: true if all usable printers are caught up.
*/
-static bool pr_flush(int timeout_ms, bool reset_on_progress)
+bool pr_flush(int timeout_ms, bool reset_on_progress)
{
return __pr_flush(NULL, timeout_ms, reset_on_progress);
}
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index b9b6bc55185d..aa42de4d2768 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -18,7 +18,7 @@ config TREE_RCU
config PREEMPT_RCU
bool
- default y if PREEMPTION
+ default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC)
select TREE_RCU
help
This option selects the RCU implementation that is
@@ -65,6 +65,17 @@ config TREE_SRCU
help
This option selects the full-fledged version of SRCU.
+config FORCE_NEED_SRCU_NMI_SAFE
+ bool "Force selection of NEED_SRCU_NMI_SAFE"
+ depends on !TINY_SRCU
+ select NEED_SRCU_NMI_SAFE
+ default n
+ help
+ This option forces selection of the NEED_SRCU_NMI_SAFE
+ Kconfig option, allowing testing of srcu_read_lock_nmisafe()
+ and srcu_read_unlock_nmisafe() on architectures (like x86)
+ that select the ARCH_HAS_NMI_SAFE_THIS_CPU_OPS Kconfig option.
+
config NEED_SRCU_NMI_SAFE
def_bool HAVE_NMI && !ARCH_HAS_NMI_SAFE_THIS_CPU_OPS && !TINY_SRCU
@@ -91,7 +102,7 @@ config NEED_TASKS_RCU
config TASKS_RCU
bool
- default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO)
+ default NEED_TASKS_RCU && PREEMPTION
select IRQ_WORK
config FORCE_TASKS_RUDE_RCU
@@ -323,21 +334,27 @@ config RCU_LAZY
depends on RCU_NOCB_CPU
default n
help
- To save power, batch RCU callbacks and flush after delay, memory
- pressure, or callback list growing too big.
+ To save power, batch RCU callbacks and delay starting the
+ corresponding grace period for multiple seconds. The grace
+ period will be started after this delay, in case of memory
+ pressure, or if the corresponding CPU's callback list grows
+ too large.
- Requires rcu_nocbs=all to be set.
+ These delays happen only on rcu_nocbs CPUs, that is, CPUs
+ whose callbacks have been offloaded.
- Use rcutree.enable_rcu_lazy=0 to turn it off at boot time.
+ Use the rcutree.enable_rcu_lazy=0 kernel-boot parameter to
+ globally disable these delays.
config RCU_LAZY_DEFAULT_OFF
bool "Turn RCU lazy invocation off by default"
depends on RCU_LAZY
default n
help
- Allows building the kernel with CONFIG_RCU_LAZY=y yet keep it default
- off. Boot time param rcutree.enable_rcu_lazy=1 can be used to switch
- it back on.
+ Build the kernel with CONFIG_RCU_LAZY=y, but cause the kernel
+ to boot with these energy-efficiency delays disabled. Use the
+ rcutree.enable_rcu_lazy=0 kernel-boot parameter to override
+ the this option at boot time, thus re-enabling these delays.
config RCU_DOUBLE_CHECK_CB_TIME
bool "RCU callback-batch backup time check"
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 6af90510a1ca..12e4c64ebae1 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -54,7 +54,7 @@ config RCU_TORTURE_TEST
Say N if you are unsure.
config RCU_TORTURE_TEST_CHK_RDR_STATE
- tristate "Check rcutorture reader state"
+ bool "Check rcutorture reader state"
depends on RCU_TORTURE_TEST
default n
help
@@ -70,7 +70,7 @@ config RCU_TORTURE_TEST_CHK_RDR_STATE
Say N if you are unsure.
config RCU_TORTURE_TEST_LOG_CPU
- tristate "Log CPU for rcutorture failures"
+ bool "Log CPU for rcutorture failures"
depends on RCU_TORTURE_TEST
default n
help
@@ -84,6 +84,20 @@ config RCU_TORTURE_TEST_LOG_CPU
Say Y here if you want CPU IDs logged.
Say N if you are unsure.
+config RCU_TORTURE_TEST_LOG_GP
+ bool "Log grace-period numbers for rcutorture failures"
+ depends on RCU_TORTURE_TEST
+ default n
+ help
+ This option causes rcutorture to decorate each entry of its
+ log of failure/close-call rcutorture reader segments with the
+ corresponding grace-period sequence numbers. This information
+ can be useful, but it does incur additional overhead, overhead
+ that can make both failures and close calls less probable.
+
+ Say Y here if you want grace-period sequence numbers logged.
+ Say N if you are unsure.
+
config RCU_REF_SCALE_TEST
tristate "Scalability tests for read-side synchronization (RCU and others)"
depends on DEBUG_KERNEL
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index feb3ac1dc5d5..eed2951a4962 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -162,7 +162,7 @@ static inline bool rcu_seq_done_exact(unsigned long *sp, unsigned long s)
{
unsigned long cur_s = READ_ONCE(*sp);
- return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (2 * RCU_SEQ_STATE_MASK + 1));
+ return ULONG_CMP_GE(cur_s, s) || ULONG_CMP_LT(cur_s, s - (3 * RCU_SEQ_STATE_MASK + 1));
}
/*
@@ -590,6 +590,8 @@ void do_trace_rcu_torture_read(const char *rcutorturename,
#endif
static inline void rcu_gp_set_torture_wait(int duration) { }
#endif
+unsigned long long rcutorture_gather_gp_seqs(void);
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len);
#ifdef CONFIG_TINY_SRCU
@@ -611,8 +613,6 @@ void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags,
static inline bool rcu_watching_zero_in_eqs(int cpu, int *vp) { return false; }
static inline unsigned long rcu_get_gp_seq(void) { return 0; }
static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
-static inline unsigned long
-srcu_batches_completed(struct srcu_struct *sp) { return 0; }
static inline void rcu_force_quiescent_state(void) { }
static inline bool rcu_check_boost_fail(unsigned long gp_state, int *cpup) { return true; }
static inline void show_rcu_gp_kthreads(void) { }
@@ -624,7 +624,6 @@ static inline void rcu_gp_slow_unregister(atomic_t *rgssp) { }
bool rcu_watching_zero_in_eqs(int cpu, int *vp);
unsigned long rcu_get_gp_seq(void);
unsigned long rcu_exp_batches_completed(void);
-unsigned long srcu_batches_completed(struct srcu_struct *sp);
bool rcu_check_boost_fail(unsigned long gp_state, int *cpup);
void show_rcu_gp_kthreads(void);
int rcu_get_gp_kthreads_prio(void);
@@ -636,6 +635,12 @@ void rcu_gp_slow_register(atomic_t *rgssp);
void rcu_gp_slow_unregister(atomic_t *rgssp);
#endif /* #else #ifdef CONFIG_TINY_RCU */
+#ifdef CONFIG_TINY_SRCU
+static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) { return 0; }
+#else // #ifdef CONFIG_TINY_SRCU
+unsigned long srcu_batches_completed(struct srcu_struct *sp);
+#endif // #else // #ifdef CONFIG_TINY_SRCU
+
#ifdef CONFIG_RCU_NOCB_CPU
void rcu_bind_current_to_nocb(void);
#else
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index d26fb1d33ed9..65095664f5c5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -135,6 +135,7 @@ torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
torture_param(int, test_boost_duration, 4, "Duration of each boost test, seconds.");
+torture_param(int, test_boost_holdoff, 0, "Holdoff time from rcutorture start, seconds.");
torture_param(int, test_boost_interval, 7, "Interval between boost tests, seconds.");
torture_param(int, test_nmis, 0, "End-test NMI tests, 0 to disable.");
torture_param(bool, test_no_idle_hz, true, "Test support for tickless idle CPUs");
@@ -147,6 +148,7 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)");
static int nrealnocbers;
static int nrealreaders;
+static int nrealfakewriters;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
@@ -272,6 +274,9 @@ struct rt_read_seg {
bool rt_preempted;
int rt_cpu;
int rt_end_cpu;
+ unsigned long long rt_gp_seq;
+ unsigned long long rt_gp_seq_end;
+ u64 rt_ts;
};
static int err_segs_recorded;
static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS];
@@ -406,6 +411,8 @@ struct rcu_torture_ops {
void (*gp_slow_register)(atomic_t *rgssp);
void (*gp_slow_unregister)(atomic_t *rgssp);
bool (*reader_blocked)(void);
+ unsigned long long (*gather_gp_seqs)(void);
+ void (*format_gp_seqs)(unsigned long long seqs, char *cp, size_t len);
long cbflood_max;
int irq_capable;
int can_boost;
@@ -610,6 +617,8 @@ static struct rcu_torture_ops rcu_ops = {
.reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)
? has_rcu_reader_blocked
: NULL,
+ .gather_gp_seqs = rcutorture_gather_gp_seqs,
+ .format_gp_seqs = rcutorture_format_gp_seqs,
.irq_capable = 1,
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
@@ -655,6 +664,8 @@ static struct rcu_torture_ops rcu_busted_ops = {
.sync = synchronize_rcu_busted,
.exp_sync = synchronize_rcu_busted,
.call = call_rcu_busted,
+ .gather_gp_seqs = rcutorture_gather_gp_seqs,
+ .format_gp_seqs = rcutorture_format_gp_seqs,
.irq_capable = 1,
.extendables = RCUTORTURE_MAX_EXTEND,
.name = "busted"
@@ -677,8 +688,11 @@ static void srcu_get_gp_data(int *flags, unsigned long *gp_seq)
static int srcu_torture_read_lock(void)
{
int idx;
+ struct srcu_ctr __percpu *scp;
int ret = 0;
+ WARN_ON_ONCE(reader_flavor & ~SRCU_READ_FLAVOR_ALL);
+
if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) {
idx = srcu_read_lock(srcu_ctlp);
WARN_ON_ONCE(idx & ~0x1);
@@ -694,6 +708,12 @@ static int srcu_torture_read_lock(void)
WARN_ON_ONCE(idx & ~0x1);
ret += idx << 2;
}
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ idx = __srcu_ptr_to_ctr(srcu_ctlp, scp);
+ WARN_ON_ONCE(idx & ~0x1);
+ ret += idx << 3;
+ }
return ret;
}
@@ -719,6 +739,8 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
static void srcu_torture_read_unlock(int idx)
{
WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
+ if (reader_flavor & SRCU_READ_FLAVOR_FAST)
+ srcu_read_unlock_fast(srcu_ctlp, __srcu_ctr_to_ptr(srcu_ctlp, (idx & 0x8) >> 3));
if (reader_flavor & SRCU_READ_FLAVOR_LITE)
srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2);
if (reader_flavor & SRCU_READ_FLAVOR_NMI)
@@ -791,6 +813,7 @@ static struct rcu_torture_ops srcu_ops = {
.readunlock = srcu_torture_read_unlock,
.readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
+ .gp_diff = rcu_seq_diff,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
@@ -834,6 +857,7 @@ static struct rcu_torture_ops srcud_ops = {
.readunlock = srcu_torture_read_unlock,
.readlock_held = torture_srcu_read_lock_held,
.get_gp_seq = srcu_torture_completed,
+ .gp_diff = rcu_seq_diff,
.deferred_free = srcu_torture_deferred_free,
.sync = srcu_torture_synchronize,
.exp_sync = srcu_torture_synchronize_expedited,
@@ -1148,8 +1172,19 @@ static int rcu_torture_boost(void *arg)
unsigned long gp_state;
unsigned long gp_state_time;
unsigned long oldstarttime;
+ unsigned long booststarttime = get_torture_init_jiffies() + test_boost_holdoff * HZ;
- VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+ if (test_boost_holdoff <= 0 || time_after(jiffies, booststarttime)) {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started");
+ } else {
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started holdoff period");
+ while (time_before(jiffies, booststarttime)) {
+ schedule_timeout_idle(HZ);
+ if (kthread_should_stop())
+ goto cleanup;
+ }
+ VERBOSE_TOROUT_STRING("rcu_torture_boost finished holdoff period");
+ }
/* Set real-time priority. */
sched_set_fifo_low(current);
@@ -1225,6 +1260,7 @@ checkwait: if (stutter_wait("rcu_torture_boost"))
sched_set_fifo_low(current);
} while (!torture_must_stop());
+cleanup:
/* Clean up and exit. */
while (!kthread_should_stop()) {
torture_shutdown_absorb("rcu_torture_boost");
@@ -1728,7 +1764,7 @@ rcu_torture_fakewriter(void *arg)
do {
torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand);
if (cur_ops->cb_barrier != NULL &&
- torture_random(&rand) % (nfakewriters * 8) == 0) {
+ torture_random(&rand) % (nrealfakewriters * 8) == 0) {
cur_ops->cb_barrier();
} else {
switch (synctype[torture_random(&rand) % nsynctypes]) {
@@ -1873,6 +1909,8 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count()
static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq)
{
+ int mask;
+
if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE))
return;
@@ -1899,11 +1937,27 @@ static void rcutorture_one_extend_check(char *s, int curstate, int new, int old,
WARN_ONCE(cur_ops->extendables &&
!(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
- WARN_ONCE(cur_ops->extendables &&
- !(curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) &&
+
+ /*
+ * non-preemptible RCU in a preemptible kernel uses preempt_disable()
+ * as rcu_read_lock().
+ */
+ mask = RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
+ mask |= RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+
+ WARN_ONCE(cur_ops->extendables && !(curstate & mask) &&
(preempt_count() & PREEMPT_MASK), ROEC_ARGS);
- WARN_ONCE(cur_ops->readlock_nesting &&
- !(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) &&
+
+ /*
+ * non-preemptible RCU in a preemptible kernel uses "preempt_count() &
+ * PREEMPT_MASK" as ->readlock_nesting().
+ */
+ mask = RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2;
+ if (!IS_ENABLED(CONFIG_PREEMPT_RCU))
+ mask |= RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED;
+
+ WARN_ONCE(cur_ops->readlock_nesting && !(curstate & mask) &&
cur_ops->readlock_nesting() > 0, ROEC_ARGS);
}
@@ -1965,6 +2019,13 @@ static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
rtrsp[-1].rt_preempted = cur_ops->reader_blocked();
}
}
+ // Sample grace-period sequence number, as good a place as any.
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) && cur_ops->gather_gp_seqs) {
+ rtrsp->rt_gp_seq = cur_ops->gather_gp_seqs();
+ rtrsp->rt_ts = ktime_get_mono_fast_ns();
+ if (!first)
+ rtrsp[-1].rt_gp_seq_end = rtrsp->rt_gp_seq;
+ }
/*
* Next, remove old protection, in decreasing order of strength
@@ -2512,7 +2573,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"shuffle_interval=%d stutter=%d irqreader=%d "
"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
"test_boost=%d/%d test_boost_interval=%d "
- "test_boost_duration=%d shutdown_secs=%d "
+ "test_boost_duration=%d test_boost_holdoff=%d shutdown_secs=%d "
"stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
"stall_cpu_block=%d stall_cpu_repeat=%d "
"n_barrier_cbs=%d "
@@ -2522,11 +2583,11 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"nocbs_nthreads=%d nocbs_toggle=%d "
"test_nmis=%d "
"preempt_duration=%d preempt_interval=%d\n",
- torture_type, tag, nrealreaders, nfakewriters,
+ torture_type, tag, nrealreaders, nrealfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
test_boost, cur_ops->can_boost,
- test_boost_interval, test_boost_duration, shutdown_secs,
+ test_boost_interval, test_boost_duration, test_boost_holdoff, shutdown_secs,
stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
stall_cpu_block, stall_cpu_repeat,
n_barrier_cbs,
@@ -3553,6 +3614,7 @@ rcu_torture_cleanup(void)
int flags = 0;
unsigned long gp_seq = 0;
int i;
+ int j;
if (torture_cleanup_begin()) {
if (cur_ops->cb_barrier != NULL) {
@@ -3597,7 +3659,7 @@ rcu_torture_cleanup(void)
rcu_torture_reader_mbchk = NULL;
if (fakewriter_tasks) {
- for (i = 0; i < nfakewriters; i++)
+ for (i = 0; i < nrealfakewriters; i++)
torture_stop_kthread(rcu_torture_fakewriter,
fakewriter_tasks[i]);
kfree(fakewriter_tasks);
@@ -3635,7 +3697,11 @@ rcu_torture_cleanup(void)
pr_alert("\t: No segments recorded!!!\n");
firsttime = 1;
for (i = 0; i < rt_read_nsegs; i++) {
- pr_alert("\t%d: %#4x", i, err_segs[i].rt_readstate);
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP))
+ pr_alert("\t%lluus ", div64_u64(err_segs[i].rt_ts, 1000ULL));
+ else
+ pr_alert("\t");
+ pr_cont("%d: %#4x", i, err_segs[i].rt_readstate);
if (err_segs[i].rt_delay_jiffies != 0) {
pr_cont("%s%ldjiffies", firsttime ? "" : "+",
err_segs[i].rt_delay_jiffies);
@@ -3648,6 +3714,27 @@ rcu_torture_cleanup(void)
else
pr_cont(" ...");
}
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_GP) &&
+ cur_ops->gather_gp_seqs && cur_ops->format_gp_seqs) {
+ char buf1[20+1];
+ char buf2[20+1];
+ char sepchar = '-';
+
+ cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq,
+ buf1, ARRAY_SIZE(buf1));
+ cur_ops->format_gp_seqs(err_segs[i].rt_gp_seq_end,
+ buf2, ARRAY_SIZE(buf2));
+ if (err_segs[i].rt_gp_seq == err_segs[i].rt_gp_seq_end) {
+ if (buf2[0]) {
+ for (j = 0; buf2[j]; j++)
+ buf2[j] = '.';
+ if (j)
+ buf2[j - 1] = ' ';
+ }
+ sepchar = ' ';
+ }
+ pr_cont(" %s%c%s", buf1, sepchar, buf2);
+ }
if (err_segs[i].rt_delay_ms != 0) {
pr_cont(" %s%ldms", firsttime ? "" : "+",
err_segs[i].rt_delay_ms);
@@ -3994,6 +4081,14 @@ rcu_torture_init(void)
rcu_torture_init_srcu_lockdep();
+ if (nfakewriters >= 0) {
+ nrealfakewriters = nfakewriters;
+ } else {
+ nrealfakewriters = num_online_cpus() - 2 - nfakewriters;
+ if (nrealfakewriters <= 0)
+ nrealfakewriters = 1;
+ }
+
if (nreaders >= 0) {
nrealreaders = nreaders;
} else {
@@ -4050,8 +4145,9 @@ rcu_torture_init(void)
writer_task);
if (torture_init_error(firsterr))
goto unwind;
- if (nfakewriters > 0) {
- fakewriter_tasks = kcalloc(nfakewriters,
+
+ if (nrealfakewriters > 0) {
+ fakewriter_tasks = kcalloc(nrealfakewriters,
sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
@@ -4060,7 +4156,7 @@ rcu_torture_init(void)
goto unwind;
}
}
- for (i = 0; i < nfakewriters; i++) {
+ for (i = 0; i < nrealfakewriters; i++) {
firsterr = torture_create_kthread(rcu_torture_fakewriter,
NULL, fakewriter_tasks[i]);
if (torture_init_error(firsterr))
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 1b47376acdc4..f11a7c2af778 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -216,6 +216,36 @@ static const struct ref_scale_ops srcu_ops = {
.name = "srcu"
};
+static void srcu_fast_ref_scale_read_section(const int nloops)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ srcu_read_unlock_fast(srcu_ctlp, scp);
+ }
+}
+
+static void srcu_fast_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+ int i;
+ struct srcu_ctr __percpu *scp;
+
+ for (i = nloops; i >= 0; i--) {
+ scp = srcu_read_lock_fast(srcu_ctlp);
+ un_delay(udl, ndl);
+ srcu_read_unlock_fast(srcu_ctlp, scp);
+ }
+}
+
+static const struct ref_scale_ops srcu_fast_ops = {
+ .init = rcu_sync_scale_init,
+ .readsection = srcu_fast_ref_scale_read_section,
+ .delaysection = srcu_fast_ref_scale_delay_section,
+ .name = "srcu-fast"
+};
+
static void srcu_lite_ref_scale_read_section(const int nloops)
{
int i;
@@ -1163,7 +1193,7 @@ ref_scale_init(void)
long i;
int firsterr = 0;
static const struct ref_scale_ops *scale_ops[] = {
- &rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS
+ &rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS
&refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops,
&acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops,
&typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 4dcbf8aa80ff..6e9fe2ce1075 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -20,7 +20,11 @@
#include "rcu_segcblist.h"
#include "rcu.h"
+#ifndef CONFIG_TREE_RCU
int rcu_scheduler_active __read_mostly;
+#else // #ifndef CONFIG_TREE_RCU
+extern int rcu_scheduler_active;
+#endif // #else // #ifndef CONFIG_TREE_RCU
static LIST_HEAD(srcu_boot_list);
static bool srcu_init_done;
@@ -98,7 +102,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
int newval;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
preempt_enable();
@@ -120,7 +124,7 @@ void srcu_drive_gp(struct work_struct *wp)
struct srcu_struct *ssp;
ssp = container_of(wp, struct srcu_struct, srcu_work);
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) {
preempt_enable();
return; /* Already running or nothing to do. */
@@ -138,7 +142,7 @@ void srcu_drive_gp(struct work_struct *wp)
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
preempt_enable();
swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
preempt_enable();
@@ -159,7 +163,7 @@ void srcu_drive_gp(struct work_struct *wp)
* at interrupt level, but the ->srcu_gp_running checks will
* straighten that out.
*/
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
WRITE_ONCE(ssp->srcu_gp_running, false);
idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max));
preempt_enable();
@@ -172,7 +176,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
{
unsigned long cookie;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
cookie = get_state_synchronize_srcu(ssp);
if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) {
preempt_enable();
@@ -199,7 +203,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rhp->func = func;
rhp->next = NULL;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
local_irq_save(flags);
*ssp->srcu_cb_tail = rhp;
ssp->srcu_cb_tail = &rhp->next;
@@ -261,7 +265,7 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
{
unsigned long ret;
- preempt_disable(); // Needed for PREEMPT_AUTO
+ preempt_disable(); // Needed for PREEMPT_LAZY
ret = get_state_synchronize_srcu(ssp);
srcu_gp_start_if_needed(ssp);
preempt_enable();
@@ -282,11 +286,13 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
+#ifndef CONFIG_TREE_RCU
/* Lockdep diagnostics. */
void __init rcu_scheduler_starting(void)
{
rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
}
+#endif // #ifndef CONFIG_TREE_RCU
/*
* Queue work for srcu_struct structures with early boot callbacks.
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index b83c74c4dcc0..d2a694944553 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -116,8 +116,9 @@ do { \
/*
* Initialize SRCU per-CPU data. Note that statically allocated
* srcu_struct structures might already have srcu_read_lock() and
- * srcu_read_unlock() running against them. So if the is_static parameter
- * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
+ * srcu_read_unlock() running against them. So if the is_static
+ * parameter is set, don't initialize ->srcu_ctrs[].srcu_locks and
+ * ->srcu_ctrs[].srcu_unlocks.
*/
static void init_srcu_struct_data(struct srcu_struct *ssp)
{
@@ -128,8 +129,6 @@ static void init_srcu_struct_data(struct srcu_struct *ssp)
* Initialize the per-CPU srcu_data array, which feeds into the
* leaves of the srcu_node tree.
*/
- BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) !=
- ARRAY_SIZE(sdp->srcu_unlock_count));
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
@@ -247,15 +246,16 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
ssp->srcu_sup->node = NULL;
mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
- ssp->srcu_idx = 0;
ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL;
ssp->srcu_sup->srcu_barrier_seq = 0;
mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
ssp->srcu_sup->sda_is_static = is_static;
- if (!is_static)
+ if (!is_static) {
ssp->sda = alloc_percpu(struct srcu_data);
+ ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0];
+ }
if (!ssp->sda)
goto err_free_sup;
init_srcu_struct_data(ssp);
@@ -429,10 +429,10 @@ static bool srcu_gp_is_expedited(struct srcu_struct *ssp)
}
/*
- * Computes approximate total of the readers' ->srcu_lock_count[] values
- * for the rank of per-CPU counters specified by idx, and returns true if
- * the caller did the proper barrier (gp), and if the count of the locks
- * matches that of the unlocks passed in.
+ * Computes approximate total of the readers' ->srcu_ctrs[].srcu_locks
+ * values for the rank of per-CPU counters specified by idx, and returns
+ * true if the caller did the proper barrier (gp), and if the count of
+ * the locks matches that of the unlocks passed in.
*/
static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks)
{
@@ -443,20 +443,20 @@ static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, uns
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&sdp->srcu_lock_count[idx]);
+ sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks);
if (IS_ENABLED(CONFIG_PROVE_RCU))
mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
}
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
"Mixed reader flavors for srcu_struct at %ps.\n", ssp);
- if (mask & SRCU_READ_FLAVOR_LITE && !gp)
+ if (mask & SRCU_READ_FLAVOR_SLOWGP && !gp)
return false;
return sum == unlocks;
}
/*
- * Returns approximate total of the readers' ->srcu_unlock_count[] values
- * for the rank of per-CPU counters specified by idx.
+ * Returns approximate total of the readers' ->srcu_ctrs[].srcu_unlocks
+ * values for the rank of per-CPU counters specified by idx.
*/
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm)
{
@@ -467,7 +467,7 @@ static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, u
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&sdp->srcu_unlock_count[idx]);
+ sum += atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks);
mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
}
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
@@ -487,7 +487,7 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
unsigned long unlocks;
unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm);
- did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE);
+ did_gp = !!(rdm & SRCU_READ_FLAVOR_SLOWGP);
/*
* Make sure that a lock is always counted if the corresponding
@@ -509,48 +509,49 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
* If the locks are the same as the unlocks, then there must have
* been no readers on this index at some point in this function.
* But there might be more readers, as a task might have read
- * the current ->srcu_idx but not yet have incremented its CPU's
- * ->srcu_lock_count[idx] counter. In fact, it is possible
+ * the current ->srcu_ctrp but not yet have incremented its CPU's
+ * ->srcu_ctrs[idx].srcu_locks counter. In fact, it is possible
* that most of the tasks have been preempted between fetching
- * ->srcu_idx and incrementing ->srcu_lock_count[idx]. And there
- * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks
- * in a system whose address space was fully populated with memory.
- * Call this quantity Nt.
+ * ->srcu_ctrp and incrementing ->srcu_ctrs[idx].srcu_locks. And
+ * there could be almost (ULONG_MAX / sizeof(struct task_struct))
+ * tasks in a system whose address space was fully populated
+ * with memory. Call this quantity Nt.
*
- * So suppose that the updater is preempted at this point in the
- * code for a long time. That now-preempted updater has already
- * flipped ->srcu_idx (possibly during the preceding grace period),
- * done an smp_mb() (again, possibly during the preceding grace
- * period), and summed up the ->srcu_unlock_count[idx] counters.
- * How many times can a given one of the aforementioned Nt tasks
- * increment the old ->srcu_idx value's ->srcu_lock_count[idx]
- * counter, in the absence of nesting?
+ * So suppose that the updater is preempted at this
+ * point in the code for a long time. That now-preempted
+ * updater has already flipped ->srcu_ctrp (possibly during
+ * the preceding grace period), done an smp_mb() (again,
+ * possibly during the preceding grace period), and summed up
+ * the ->srcu_ctrs[idx].srcu_unlocks counters. How many times
+ * can a given one of the aforementioned Nt tasks increment the
+ * old ->srcu_ctrp value's ->srcu_ctrs[idx].srcu_locks counter,
+ * in the absence of nesting?
*
* It can clearly do so once, given that it has already fetched
- * the old value of ->srcu_idx and is just about to use that value
- * to index its increment of ->srcu_lock_count[idx]. But as soon as
- * it leaves that SRCU read-side critical section, it will increment
- * ->srcu_unlock_count[idx], which must follow the updater's above
- * read from that same value. Thus, as soon the reading task does
- * an smp_mb() and a later fetch from ->srcu_idx, that task will be
- * guaranteed to get the new index. Except that the increment of
- * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the
- * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock()
- * is before the smp_mb(). Thus, that task might not see the new
- * value of ->srcu_idx until the -second- __srcu_read_lock(),
- * which in turn means that this task might well increment
- * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice,
- * not just once.
+ * the old value of ->srcu_ctrp and is just about to use that
+ * value to index its increment of ->srcu_ctrs[idx].srcu_locks.
+ * But as soon as it leaves that SRCU read-side critical section,
+ * it will increment ->srcu_ctrs[idx].srcu_unlocks, which must
+ * follow the updater's above read from that same value. Thus,
+ as soon the reading task does an smp_mb() and a later fetch from
+ * ->srcu_ctrp, that task will be guaranteed to get the new index.
+ * Except that the increment of ->srcu_ctrs[idx].srcu_unlocks
+ * in __srcu_read_unlock() is after the smp_mb(), and the fetch
+ * from ->srcu_ctrp in __srcu_read_lock() is before the smp_mb().
+ * Thus, that task might not see the new value of ->srcu_ctrp until
+ * the -second- __srcu_read_lock(), which in turn means that this
+ * task might well increment ->srcu_ctrs[idx].srcu_locks for the
+ * old value of ->srcu_ctrp twice, not just once.
*
* However, it is important to note that a given smp_mb() takes
* effect not just for the task executing it, but also for any
* later task running on that same CPU.
*
- * That is, there can be almost Nt + Nc further increments of
- * ->srcu_lock_count[idx] for the old index, where Nc is the number
- * of CPUs. But this is OK because the size of the task_struct
- * structure limits the value of Nt and current systems limit Nc
- * to a few thousand.
+ * That is, there can be almost Nt + Nc further increments
+ * of ->srcu_ctrs[idx].srcu_locks for the old index, where Nc
+ * is the number of CPUs. But this is OK because the size of
+ * the task_struct structure limits the value of Nt and current
+ * systems limit Nc to a few thousand.
*
* OK, but what about nesting? This does impose a limit on
* nesting of half of the size of the task_struct structure
@@ -581,10 +582,10 @@ static bool srcu_readers_active(struct srcu_struct *ssp)
for_each_possible_cpu(cpu) {
struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);
- sum += atomic_long_read(&sdp->srcu_lock_count[0]);
- sum += atomic_long_read(&sdp->srcu_lock_count[1]);
- sum -= atomic_long_read(&sdp->srcu_unlock_count[0]);
- sum -= atomic_long_read(&sdp->srcu_unlock_count[1]);
+ sum += atomic_long_read(&sdp->srcu_ctrs[0].srcu_locks);
+ sum += atomic_long_read(&sdp->srcu_ctrs[1].srcu_locks);
+ sum -= atomic_long_read(&sdp->srcu_ctrs[0].srcu_unlocks);
+ sum -= atomic_long_read(&sdp->srcu_ctrs[1].srcu_unlocks);
}
return sum;
}
@@ -647,6 +648,7 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
unsigned long jbase = SRCU_INTERVAL;
struct srcu_usage *sup = ssp->srcu_sup;
+ lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
if (srcu_gp_is_expedited(ssp))
jbase = 0;
if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
@@ -674,9 +676,13 @@ static unsigned long srcu_get_delay(struct srcu_struct *ssp)
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
int cpu;
+ unsigned long delay;
struct srcu_usage *sup = ssp->srcu_sup;
- if (WARN_ON(!srcu_get_delay(ssp)))
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
+ delay = srcu_get_delay(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
+ if (WARN_ON(!delay))
return; /* Just leak it! */
if (WARN_ON(srcu_readers_active(ssp)))
return; /* Just leak it! */
@@ -743,12 +749,11 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor);
*/
int __srcu_read_lock(struct srcu_struct *ssp)
{
- int idx;
+ struct srcu_ctr __percpu *scp = READ_ONCE(ssp->srcu_ctrp);
- idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
+ this_cpu_inc(scp->srcu_locks.counter);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
- return idx;
+ return __srcu_ptr_to_ctr(ssp, scp);
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);
@@ -760,7 +765,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
smp_mb(); /* C */ /* Avoid leaking the critical section. */
- this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
+ this_cpu_inc(__srcu_ctr_to_ptr(ssp, idx)->srcu_unlocks.counter);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -773,13 +778,12 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
*/
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
{
- int idx;
- struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
+ struct srcu_ctr __percpu *scpp = READ_ONCE(ssp->srcu_ctrp);
+ struct srcu_ctr *scp = raw_cpu_ptr(scpp);
- idx = READ_ONCE(ssp->srcu_idx) & 0x1;
- atomic_long_inc(&sdp->srcu_lock_count[idx]);
+ atomic_long_inc(&scp->srcu_locks);
smp_mb__after_atomic(); /* B */ /* Avoid leaking the critical section. */
- return idx;
+ return __srcu_ptr_to_ctr(ssp, scpp);
}
EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
@@ -790,10 +794,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);
*/
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
{
- struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);
-
smp_mb__before_atomic(); /* C */ /* Avoid leaking the critical section. */
- atomic_long_inc(&sdp->srcu_unlock_count[idx]);
+ atomic_long_inc(&raw_cpu_ptr(__srcu_ctr_to_ptr(ssp, idx))->srcu_unlocks);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);
@@ -1096,13 +1098,15 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
/*
* Wait until all readers counted by array index idx complete, but
* loop an additional time if there is an expedited grace period pending.
- * The caller must ensure that ->srcu_idx is not changed while checking.
+ * The caller must ensure that ->srcu_ctrp is not changed while checking.
*/
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
unsigned long curdelay;
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
curdelay = !srcu_get_delay(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
for (;;) {
if (srcu_readers_active_idx_check(ssp, idx))
@@ -1114,30 +1118,30 @@ static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
}
/*
- * Increment the ->srcu_idx counter so that future SRCU readers will
+ * Increment the ->srcu_ctrp counter so that future SRCU readers will
* use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
* us to wait for pre-existing readers in a starvation-free manner.
*/
static void srcu_flip(struct srcu_struct *ssp)
{
/*
- * Because the flip of ->srcu_idx is executed only if the
+ * Because the flip of ->srcu_ctrp is executed only if the
* preceding call to srcu_readers_active_idx_check() found that
- * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched
- * and because that summing uses atomic_long_read(), there is
- * ordering due to a control dependency between that summing and
- * the WRITE_ONCE() in this call to srcu_flip(). This ordering
- * ensures that if this updater saw a given reader's increment from
- * __srcu_read_lock(), that reader was using a value of ->srcu_idx
- * from before the previous call to srcu_flip(), which should be
- * quite rare. This ordering thus helps forward progress because
- * the grace period could otherwise be delayed by additional
- * calls to __srcu_read_lock() using that old (soon to be new)
- * value of ->srcu_idx.
+ * the ->srcu_ctrs[].srcu_unlocks and ->srcu_ctrs[].srcu_locks sums
+ * matched and because that summing uses atomic_long_read(),
+ * there is ordering due to a control dependency between that
+ * summing and the WRITE_ONCE() in this call to srcu_flip().
+ * This ordering ensures that if this updater saw a given reader's
+ * increment from __srcu_read_lock(), that reader was using a value
+ * of ->srcu_ctrp from before the previous call to srcu_flip(),
+ * which should be quite rare. This ordering thus helps forward
+ * progress because the grace period could otherwise be delayed
+ * by additional calls to __srcu_read_lock() using that old (soon
+ * to be new) value of ->srcu_ctrp.
*
* This sum-equality check and ordering also ensures that if
* a given call to __srcu_read_lock() uses the new value of
- * ->srcu_idx, this updater's earlier scans cannot have seen
+ * ->srcu_ctrp, this updater's earlier scans cannot have seen
* that reader's increments, which is all to the good, because
* this grace period need not wait on that reader. After all,
* if those earlier scans had seen that reader, there would have
@@ -1152,7 +1156,8 @@ static void srcu_flip(struct srcu_struct *ssp)
*/
smp_mb(); /* E */ /* Pairs with B and C. */
- WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter.
+ WRITE_ONCE(ssp->srcu_ctrp,
+ &ssp->sda->srcu_ctrs[!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0])]);
/*
* Ensure that if the updater misses an __srcu_read_unlock()
@@ -1198,7 +1203,7 @@ static bool srcu_should_expedite(struct srcu_struct *ssp)
check_init_srcu_struct(ssp);
/* If _lite() readers, don't do unsolicited expediting. */
- if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)
+ if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_SLOWGP)
return false;
/* If the local srcu_data structure has callbacks, not idle. */
sdp = raw_cpu_ptr(ssp->sda);
@@ -1398,8 +1403,12 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
* read-side critical sections are delimited by srcu_read_lock() and
* srcu_read_unlock(), and may be nested.
*
- * The callback will be invoked from process context, but must nevertheless
- * be fast and must not block.
+ * The callback will be invoked from process context, but with bh
+ * disabled. The callback function must therefore be fast and must
+ * not block.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
*/
void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
rcu_callback_t func)
@@ -1465,8 +1474,9 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
*
* Wait for the count to drain to zero of both indexes. To avoid the
* possible starvation of synchronize_srcu(), it waits for the count of
- * the index=((->srcu_idx & 1) ^ 1) to drain to zero at first,
- * and then flip the srcu_idx and wait for the count of the other index.
+ * the index=!(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]) to drain to zero
+ * at first, and then flip the ->srcu_ctrp and wait for the count of the
+ * other index.
*
* Can block; must be called from process context.
*
@@ -1675,7 +1685,7 @@ EXPORT_SYMBOL_GPL(srcu_barrier);
*/
unsigned long srcu_batches_completed(struct srcu_struct *ssp)
{
- return READ_ONCE(ssp->srcu_idx);
+ return READ_ONCE(ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);
@@ -1692,7 +1702,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
/*
* Because readers might be delayed for an extended period after
- * fetching ->srcu_idx for their index, at any point in time there
+ * fetching ->srcu_ctrp for their index, at any point in time there
* might well be readers using both idx=0 and idx=1. We therefore
* need to wait for readers to clear from both index values before
* invoking a callback.
@@ -1720,7 +1730,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
}
if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
- idx = 1 ^ (ssp->srcu_idx & 1);
+ idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]);
if (!try_check_zero(ssp, idx, 1)) {
mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
@@ -1738,7 +1748,7 @@ static void srcu_advance_state(struct srcu_struct *ssp)
* SRCU read-side critical sections are normally short,
* so check at least twice in quick succession after a flip.
*/
- idx = 1 ^ (ssp->srcu_idx & 1);
+ idx = !(ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0]);
if (!try_check_zero(ssp, idx, 2)) {
mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
return; /* readers present, retry later. */
@@ -1849,7 +1859,9 @@ static void process_srcu(struct work_struct *work)
ssp = sup->srcu_ssp;
srcu_advance_state(ssp);
+ spin_lock_irq_rcu_node(ssp->srcu_sup);
curdelay = srcu_get_delay(ssp);
+ spin_unlock_irq_rcu_node(ssp->srcu_sup);
if (curdelay) {
WRITE_ONCE(sup->reschedule_count, 0);
} else {
@@ -1896,7 +1908,7 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
int ss_state_idx = ss_state;
- idx = ssp->srcu_idx & 0x1;
+ idx = ssp->srcu_ctrp - &ssp->sda->srcu_ctrs[0];
if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
@@ -1914,8 +1926,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
struct srcu_data *sdp;
sdp = per_cpu_ptr(ssp->sda, cpu);
- u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx]));
- u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx]));
+ u0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_unlocks));
+ u1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_unlocks));
/*
* Make sure that a lock is always counted if the corresponding
@@ -1923,8 +1935,8 @@ void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
*/
smp_rmb();
- l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx]));
- l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx]));
+ l0 = data_race(atomic_long_read(&sdp->srcu_ctrs[!idx].srcu_locks));
+ l1 = data_race(atomic_long_read(&sdp->srcu_ctrs[idx].srcu_locks));
c0 = l0 - u0;
c1 = l1 - u1;
@@ -2001,6 +2013,7 @@ static int srcu_module_coming(struct module *mod)
ssp->sda = alloc_percpu(struct srcu_data);
if (WARN_ON_ONCE(!ssp->sda))
return -ENOMEM;
+ ssp->srcu_ctrp = &ssp->sda->srcu_ctrs[0];
}
return 0;
}
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 59314da5eb60..466668eb4fad 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -2256,7 +2256,7 @@ void __init tasks_cblist_init_generic(void)
#endif
}
-void __init rcu_init_tasks_generic(void)
+static int __init rcu_init_tasks_generic(void)
{
#ifdef CONFIG_TASKS_RCU
rcu_spawn_tasks_kthread();
@@ -2272,7 +2272,10 @@ void __init rcu_init_tasks_generic(void)
// Run the self-tests.
rcu_tasks_initiate_self_tests();
+
+ return 0;
}
+core_initcall(rcu_init_tasks_generic);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
static inline void rcu_tasks_bootup_oddness(void) {}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 4b3f31911465..c1ebfd51768b 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -85,15 +85,8 @@ void rcu_sched_clock_irq(int user)
static inline bool rcu_reclaim_tiny(struct rcu_head *head)
{
rcu_callback_t f;
- unsigned long offset = (unsigned long)head->func;
rcu_lock_acquire(&rcu_callback_map);
- if (__is_kvfree_rcu_offset(offset)) {
- trace_rcu_invoke_kvfree_callback("", head, offset);
- kvfree((void *)head - offset);
- rcu_lock_release(&rcu_callback_map);
- return true;
- }
trace_rcu_invoke_callback("", head);
f = head->func;
@@ -159,10 +152,6 @@ void synchronize_rcu(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
-static void tiny_rcu_leak_callback(struct rcu_head *rhp)
-{
-}
-
/*
* Post an RCU callback to be invoked after the end of an RCU grace
* period. But since we have but one CPU, that would be after any
@@ -178,9 +167,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func);
mem_dump_obj(head);
}
-
- if (!__is_kvfree_rcu_offset((unsigned long)head->func))
- WRITE_ONCE(head->func, tiny_rcu_leak_callback);
return;
}
@@ -246,15 +232,18 @@ bool poll_state_synchronize_rcu(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
-#ifdef CONFIG_KASAN_GENERIC
-void kvfree_call_rcu(struct rcu_head *head, void *ptr)
+#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST)
+unsigned long long rcutorture_gather_gp_seqs(void)
{
- if (head)
- kasan_record_aux_stack(ptr);
+ return READ_ONCE(rcu_ctrlblk.gp_seq) & 0xffffULL;
+}
+EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs);
- __kvfree_call_rcu(head, ptr);
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len)
+{
+ snprintf(cp, len, "g%04llx", seqs & 0xffffULL);
}
-EXPORT_SYMBOL_GPL(kvfree_call_rcu);
+EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs);
#endif
void __init rcu_init(void)
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 475f31deed14..659f83e71048 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -538,6 +538,26 @@ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq)
}
EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
+/* Gather grace-period sequence numbers for rcutorture diagnostics. */
+unsigned long long rcutorture_gather_gp_seqs(void)
+{
+ return ((READ_ONCE(rcu_state.gp_seq) & 0xffffULL) << 40) |
+ ((READ_ONCE(rcu_state.expedited_sequence) & 0xffffffULL) << 16) |
+ (READ_ONCE(rcu_state.gp_seq_polled) & 0xffffULL);
+}
+EXPORT_SYMBOL_GPL(rcutorture_gather_gp_seqs);
+
+/* Format grace-period sequence numbers for rcutorture diagnostics. */
+void rcutorture_format_gp_seqs(unsigned long long seqs, char *cp, size_t len)
+{
+ unsigned int egp = (seqs >> 16) & 0xffffffULL;
+ unsigned int ggp = (seqs >> 40) & 0xffffULL;
+ unsigned int pgp = seqs & 0xffffULL;
+
+ snprintf(cp, len, "g%04x:e%06x:p%04x", ggp, egp, pgp);
+}
+EXPORT_SYMBOL_GPL(rcutorture_format_gp_seqs);
+
#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
/*
* An empty function that will trigger a reschedule on
@@ -1254,7 +1274,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Handle the ends of any preceding grace periods first. */
if (rcu_seq_completed_gp(rdp->gp_seq, rnp->gp_seq) ||
- unlikely(READ_ONCE(rdp->gpwrap))) {
+ unlikely(rdp->gpwrap)) {
if (!offloaded)
ret = rcu_advance_cbs(rnp, rdp); /* Advance CBs. */
rdp->core_needs_qs = false;
@@ -1268,7 +1288,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
/* Now handle the beginnings of any new-to-this-CPU grace periods. */
if (rcu_seq_new_gp(rdp->gp_seq, rnp->gp_seq) ||
- unlikely(READ_ONCE(rdp->gpwrap))) {
+ unlikely(rdp->gpwrap)) {
/*
* If the current grace period is waiting for this CPU,
* set up to detect a quiescent state, otherwise don't
@@ -1283,7 +1303,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp)
rdp->gp_seq = rnp->gp_seq; /* Remember new grace-period state. */
if (ULONG_CMP_LT(rdp->gp_seq_needed, rnp->gp_seq_needed) || rdp->gpwrap)
WRITE_ONCE(rdp->gp_seq_needed, rnp->gp_seq_needed);
- if (IS_ENABLED(CONFIG_PROVE_RCU) && READ_ONCE(rdp->gpwrap))
+ if (IS_ENABLED(CONFIG_PROVE_RCU) && rdp->gpwrap)
WRITE_ONCE(rdp->last_sched_clock, jiffies);
WRITE_ONCE(rdp->gpwrap, false);
rcu_gpnum_ovf(rnp, rdp);
@@ -1612,12 +1632,10 @@ static void rcu_sr_normal_complete(struct llist_node *node)
{
struct rcu_synchronize *rs = container_of(
(struct rcu_head *) node, struct rcu_synchronize, head);
- unsigned long oldstate = (unsigned long) rs->head.func;
WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
- !poll_state_synchronize_rcu(oldstate),
- "A full grace period is not passed yet: %lu",
- rcu_seq_diff(get_state_synchronize_rcu(), oldstate));
+ !poll_state_synchronize_rcu_full(&rs->oldstate),
+ "A full grace period is not passed yet!\n");
/* Finally. */
complete(&rs->completion);
@@ -1801,10 +1819,14 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Advance to a new grace period and initialize state. */
record_gp_stall_check_time();
+ /*
+ * A new wait segment must be started before gp_seq advanced, so
+ * that previous gp waiters won't observe the new gp_seq.
+ */
+ start_new_poll = rcu_sr_normal_gp_init();
/* Record GP times before starting GP, hence rcu_seq_start(). */
rcu_seq_start(&rcu_state.gp_seq);
ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
- start_new_poll = rcu_sr_normal_gp_init();
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
raw_spin_unlock_irq_rcu_node(rnp);
@@ -2931,13 +2953,8 @@ static int __init rcu_spawn_core_kthreads(void)
static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func)
{
rcu_segcblist_enqueue(&rdp->cblist, head);
- if (__is_kvfree_rcu_offset((unsigned long)func))
- trace_rcu_kvfree_callback(rcu_state.name, head,
- (unsigned long)func,
- rcu_segcblist_n_cbs(&rdp->cblist));
- else
- trace_rcu_callback(rcu_state.name, head,
- rcu_segcblist_n_cbs(&rdp->cblist));
+ trace_rcu_callback(rcu_state.name, head,
+ rcu_segcblist_n_cbs(&rdp->cblist));
trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued"));
}
@@ -3107,7 +3124,7 @@ module_param(enable_rcu_lazy, bool, 0444);
* critical sections have completed.
*
* Use this API instead of call_rcu() if you don't want the callback to be
- * invoked after very long periods of time, which can happen on systems without
+ * delayed for very long periods of time, which can happen on systems without
* memory pressure and on systems which are lightly loaded or mostly idle.
* This function will cause callbacks to be invoked sooner than later at the
* expense of extra power. Other than that, this function is identical to, and
@@ -3138,6 +3155,12 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
* might well execute concurrently with RCU read-side critical sections
* that started after call_rcu() was invoked.
*
+ * It is perfectly legal to repost an RCU callback, potentially with
+ * a different callback function, from within its callback function.
+ * The specified function will be invoked after another full grace period
+ * has elapsed. This use case is similar in form to the common practice
+ * of reposting a timer from within its own handler.
+ *
* RCU read-side critical sections are delimited by rcu_read_lock()
* and rcu_read_unlock(), and may be nested. In addition, but only in
* v5.0 and later, regions of code across which interrupts, preemption,
@@ -3166,6 +3189,13 @@ EXPORT_SYMBOL_GPL(call_rcu_hurry);
*
* Implementation of these memory-ordering guarantees is described here:
* Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst.
+ *
+ * Specific to call_rcu() (as opposed to the other call_rcu*() functions),
+ * in kernels built with CONFIG_RCU_LAZY=y, call_rcu() might delay for many
+ * seconds before starting the grace period needed by the corresponding
+ * callback. This delay can significantly improve energy-efficiency
+ * on low-utilization battery-powered devices. To avoid this delay,
+ * in latency-sensitive kernel code, use call_rcu_hurry().
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
@@ -3214,7 +3244,7 @@ static void synchronize_rcu_normal(void)
* snapshot before adding a request.
*/
if (IS_ENABLED(CONFIG_PROVE_RCU))
- rs.head.func = (void *) get_state_synchronize_rcu();
+ get_state_synchronize_rcu_full(&rs.oldstate);
rcu_sr_normal_add_req(&rs);
@@ -3357,14 +3387,17 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
*/
void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp)
{
- struct rcu_node *rnp = rcu_get_root();
-
/*
* Any prior manipulation of RCU-protected data must happen
* before the loads from ->gp_seq and ->expedited_sequence.
*/
smp_mb(); /* ^^^ */
- rgosp->rgos_norm = rcu_seq_snap(&rnp->gp_seq);
+
+ // Yes, rcu_state.gp_seq, not rnp_root->gp_seq, the latter's use
+ // in poll_state_synchronize_rcu_full() notwithstanding. Use of
+ // the latter here would result in too-short grace periods due to
+ // interactions with newly onlined CPUs.
+ rgosp->rgos_norm = rcu_seq_snap(&rcu_state.gp_seq);
rgosp->rgos_exp = rcu_seq_snap(&rcu_state.expedited_sequence);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_rcu_full);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 77efed89c79e..8d4895c854c5 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -230,17 +230,19 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
* specified leaf rcu_node structure, which is acquired by the caller.
*/
static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags,
- unsigned long mask, bool wake)
+ unsigned long mask_in, bool wake)
__releases(rnp->lock)
{
int cpu;
+ unsigned long mask;
struct rcu_data *rdp;
raw_lockdep_assert_held_rcu_node(rnp);
- if (!(rnp->expmask & mask)) {
+ if (!(rnp->expmask & mask_in)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
+ mask = mask_in & rnp->expmask;
WRITE_ONCE(rnp->expmask, rnp->expmask & ~mask);
for_each_leaf_node_cpu_mask(rnp, cpu, mask) {
rdp = per_cpu_ptr(&rcu_data, cpu);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 2605dd234a13..5ff3bc56ff51 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -1557,8 +1557,11 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
/* Dump out nocb kthread state for the specified rcu_data structure. */
static void show_rcu_nocb_state(struct rcu_data *rdp)
{
- char bufw[20];
- char bufr[20];
+ char bufd[22];
+ char bufw[45];
+ char bufr[45];
+ char bufn[22];
+ char bufb[22];
struct rcu_data *nocb_next_rdp;
struct rcu_segcblist *rsclp = &rdp->cblist;
bool waslocked;
@@ -1572,9 +1575,13 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
typeof(*rdp),
nocb_entry_rdp);
- sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
- sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
- pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ sprintf(bufd, "%ld", rsclp->seglen[RCU_DONE_TAIL]);
+ sprintf(bufw, "%ld(%ld)", rsclp->seglen[RCU_WAIT_TAIL], rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld(%ld)", rsclp->seglen[RCU_NEXT_READY_TAIL],
+ rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ sprintf(bufn, "%ld", rsclp->seglen[RCU_NEXT_TAIL]);
+ sprintf(bufb, "%ld", rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%s%c%s%c%s%c%s%c%s q%ld %c CPU %d%s\n",
rdp->cpu, rdp->nocb_gp_rdp->cpu,
nocb_next_rdp ? nocb_next_rdp->cpu : -1,
"kK"[!!rdp->nocb_cb_kthread],
@@ -1586,12 +1593,15 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
jiffies - rdp->nocb_nobypass_last,
rdp->nocb_nobypass_count,
".D"[rcu_segcblist_ready_cbs(rsclp)],
+ rcu_segcblist_segempty(rsclp, RCU_DONE_TAIL) ? "" : bufd,
".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL) ? "" : bufn,
".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ !rcu_cblist_n_cbs(&rdp->nocb_bypass) ? "" : bufb,
rcu_segcblist_n_cbs(&rdp->cblist),
rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_cb_kthread) : -1,
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3600152b858e..3c0bbbbb686f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -833,8 +833,17 @@ void rcu_read_unlock_strict(void)
{
struct rcu_data *rdp;
- if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
+ if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread)
return;
+
+ /*
+ * rcu_report_qs_rdp() can only be invoked with a stable rdp and
+ * from the local CPU.
+ *
+ * The in_atomic_preempt_off() check ensures that we come here holding
+ * the last preempt_count (which will get dropped once we return to
+ * __rcu_read_unlock().
+ */
rdp = this_cpu_ptr(&rcu_data);
rdp->cpu_no_qs.b.norm = false;
rcu_report_qs_rdp(rdp);
@@ -975,13 +984,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
*/
static void rcu_flavor_sched_clock_irq(int user)
{
- if (user || rcu_is_cpu_rrupt_from_idle()) {
+ if (user || rcu_is_cpu_rrupt_from_idle() ||
+ (IS_ENABLED(CONFIG_PREEMPT_COUNT) &&
+ (preempt_count() == HARDIRQ_OFFSET))) {
/*
* Get here if this CPU took its interrupt from user
- * mode or from the idle loop, and if this is not a
- * nested interrupt. In this case, the CPU is in
- * a quiescent state, so note it.
+ * mode, from the idle loop without this being a nested
+ * interrupt, or while not holding the task preempt count
+ * (with PREEMPT_COUNT=y). In this case, the CPU is in a
+ * quiescent state, so note it.
*
* No memory barrier is required here because rcu_qs()
* references only CPU-local variables that other CPUs
diff --git a/kernel/reboot.c b/kernel/reboot.c
index b5a8569e5d81..41ab9e1ba357 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -704,6 +704,7 @@ void kernel_power_off(void)
migrate_to_reboot_cpu();
syscore_shutdown();
pr_emerg("Power down\n");
+ pr_flush(1000, true);
kmsg_dump(KMSG_DUMP_SHUTDOWN);
machine_power_off();
}
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2cb16091ec0a..b7a1ec327e81 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -78,24 +78,24 @@ efault:
return -EFAULT;
}
-static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
- u32 node_id, u32 mm_cid)
-{
- rseq_kernel_fields(t)->cpu_id_start = cpu_id;
- rseq_kernel_fields(t)->cpu_id = cpu_id;
- rseq_kernel_fields(t)->node_id = node_id;
- rseq_kernel_fields(t)->mm_cid = mm_cid;
-}
+/*
+ * Update an rseq field and its in-kernel copy in lock-step to keep a coherent
+ * state.
+ */
+#define rseq_unsafe_put_user(t, value, field, error_label) \
+ do { \
+ unsafe_put_user(value, &t->rseq->field, error_label); \
+ rseq_kernel_fields(t)->field = value; \
+ } while (0)
+
#else
static int rseq_validate_ro_fields(struct task_struct *t)
{
return 0;
}
-static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
- u32 node_id, u32 mm_cid)
-{
-}
+#define rseq_unsafe_put_user(t, value, field, error_label) \
+ unsafe_put_user(value, &t->rseq->field, error_label)
#endif
/*
@@ -173,17 +173,18 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
WARN_ON_ONCE((int) mm_cid < 0);
if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
- unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
- unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
- unsafe_put_user(node_id, &rseq->node_id, efault_end);
- unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end);
+
+ rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
+ rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
+ rseq_unsafe_put_user(t, node_id, node_id, efault_end);
+ rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
+
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally updated only if
* t->rseq_len != ORIG_RSEQ_SIZE.
*/
user_write_access_end();
- rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid);
trace_rseq_update(t);
return 0;
@@ -195,6 +196,7 @@ efault:
static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
{
+ struct rseq __user *rseq = t->rseq;
u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
mm_cid = 0;
@@ -202,40 +204,61 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
* Validate read-only rseq fields.
*/
if (rseq_validate_ro_fields(t))
- return -EFAULT;
- /*
- * Reset cpu_id_start to its initial state (0).
- */
- if (put_user(cpu_id_start, &t->rseq->cpu_id_start))
- return -EFAULT;
- /*
- * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming
- * in after unregistration can figure out that rseq needs to be
- * registered again.
- */
- if (put_user(cpu_id, &t->rseq->cpu_id))
- return -EFAULT;
- /*
- * Reset node_id to its initial state (0).
- */
- if (put_user(node_id, &t->rseq->node_id))
- return -EFAULT;
+ goto efault;
+
+ if (!user_write_access_begin(rseq, t->rseq_len))
+ goto efault;
+
/*
- * Reset mm_cid to its initial state (0).
+ * Reset all fields to their initial state.
+ *
+ * All fields have an initial state of 0 except cpu_id which is set to
+ * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after
+ * unregistration can figure out that rseq needs to be registered
+ * again.
*/
- if (put_user(mm_cid, &t->rseq->mm_cid))
- return -EFAULT;
-
- rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid);
+ rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end);
+ rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
+ rseq_unsafe_put_user(t, node_id, node_id, efault_end);
+ rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally reset only if
* t->rseq_len != ORIG_RSEQ_SIZE.
*/
+ user_write_access_end();
+ return 0;
+
+efault_end:
+ user_write_access_end();
+efault:
+ return -EFAULT;
+}
+
+/*
+ * Get the user-space pointer value stored in the 'rseq_cs' field.
+ */
+static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs)
+{
+ if (!rseq_cs)
+ return -EFAULT;
+
+#ifdef CONFIG_64BIT
+ if (get_user(*rseq_cs, &rseq->rseq_cs))
+ return -EFAULT;
+#else
+ if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs)))
+ return -EFAULT;
+#endif
+
return 0;
}
+/*
+ * If the rseq_cs field of 'struct rseq' contains a valid pointer to
+ * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
+ */
static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
{
struct rseq_cs __user *urseq_cs;
@@ -244,17 +267,16 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
u32 sig;
int ret;
-#ifdef CONFIG_64BIT
- if (get_user(ptr, &t->rseq->rseq_cs))
- return -EFAULT;
-#else
- if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr)))
- return -EFAULT;
-#endif
+ ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr);
+ if (ret)
+ return ret;
+
+ /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
if (!ptr) {
memset(rseq_cs, 0, sizeof(*rseq_cs));
return 0;
}
+ /* Check that the pointer value fits in the user-space process space. */
if (ptr >= TASK_SIZE)
return -EINVAL;
urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
@@ -330,7 +352,7 @@ static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
return !!event_mask;
}
-static int clear_rseq_cs(struct task_struct *t)
+static int clear_rseq_cs(struct rseq __user *rseq)
{
/*
* The rseq_cs field is set to NULL on preemption or signal
@@ -341,9 +363,9 @@ static int clear_rseq_cs(struct task_struct *t)
* Set rseq_cs to NULL.
*/
#ifdef CONFIG_64BIT
- return put_user(0UL, &t->rseq->rseq_cs);
+ return put_user(0UL, &rseq->rseq_cs);
#else
- if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs)))
+ if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs)))
return -EFAULT;
return 0;
#endif
@@ -375,11 +397,11 @@ static int rseq_ip_fixup(struct pt_regs *regs)
* Clear the rseq_cs pointer and return.
*/
if (!in_rseq_cs(ip, &rseq_cs))
- return clear_rseq_cs(t);
+ return clear_rseq_cs(t->rseq);
ret = rseq_need_restart(t, rseq_cs.flags);
if (ret <= 0)
return ret;
- ret = clear_rseq_cs(t);
+ ret = clear_rseq_cs(t->rseq);
if (ret)
return ret;
trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
@@ -453,6 +475,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
int, flags, u32, sig)
{
int ret;
+ u64 rseq_cs;
if (flags & RSEQ_FLAG_UNREGISTER) {
if (flags & ~RSEQ_FLAG_UNREGISTER)
@@ -507,6 +530,19 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
+
+ /*
+ * If the rseq_cs pointer is non-NULL on registration, clear it to
+ * avoid a potential segfault on return to user-space. The proper thing
+ * to do would have been to fail the registration but this would break
+ * older libcs that reuse the rseq area for new threads without
+ * clearing the fields.
+ */
+ if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs))
+ return -EFAULT;
+ if (rseq_cs && clear_rseq_cs(rseq))
+ return -EFAULT;
+
#ifdef CONFIG_DEBUG_RSEQ
/*
* Initialize the in-kernel rseq fields copy for validation of
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 976092b7bd45..8ae86371ddcd 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -22,6 +22,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
+# Branch profiling isn't noinstr-safe
+ifdef CONFIG_TRACE_BRANCH_PROFILING
+CFLAGS_build_policy.o += -DDISABLE_BRANCH_PROFILING
+CFLAGS_build_utility.o += -DDISABLE_BRANCH_PROFILING
+endif
#
# Build efficiency:
#
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index fae1f5c921eb..72d97aa8b726 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -61,6 +61,7 @@
#ifdef CONFIG_SCHED_CLASS_EXT
# include "ext.c"
+# include "ext_idle.c"
#endif
#include "syscalls.c"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index 80a3df49ab47..bf9d8db94b70 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -68,9 +68,7 @@
# include "cpufreq_schedutil.c"
#endif
-#ifdef CONFIG_SCHED_DEBUG
-# include "debug.c"
-#endif
+#include "debug.c"
#ifdef CONFIG_SCHEDSTATS
# include "stats.c"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 67189907214d..87540217fc09 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -91,7 +91,6 @@
#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
-#include "stats.h"
#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"
@@ -119,7 +118,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#ifdef CONFIG_SCHED_DEBUG
/*
* Debugging: various feature bits
*
@@ -129,7 +127,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
*/
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
-const_debug unsigned int sysctl_sched_features =
+__read_mostly unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
@@ -143,13 +141,12 @@ const_debug unsigned int sysctl_sched_features =
*/
__read_mostly int sysctl_resched_latency_warn_ms = 100;
__read_mostly int sysctl_resched_latency_warn_once = 1;
-#endif /* CONFIG_SCHED_DEBUG */
/*
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
-const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
+__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
__read_mostly int scheduler_running;
@@ -800,11 +797,10 @@ void update_rq_clock(struct rq *rq)
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
-#ifdef CONFIG_SCHED_DEBUG
if (sched_feat(WARN_DOUBLE_CLOCK))
- SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
+ WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
-#endif
+
clock = sched_clock_cpu(cpu_of(rq));
scx_rq_clock_update(rq, clock);
@@ -916,8 +912,7 @@ static void hrtick_rq_init(struct rq *rq)
#ifdef CONFIG_SMP
INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
- hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- rq->hrtick_timer.function = hrtick;
+ hrtimer_setup(&rq->hrtick_timer, hrtick, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
}
#else /* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
@@ -1720,7 +1715,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
bucket = &uc_rq->bucket[uc_se->bucket_id];
- SCHED_WARN_ON(!bucket->tasks);
+ WARN_ON_ONCE(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
@@ -1740,7 +1735,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
* Defensive programming: this should never happen. If it happens,
* e.g. due to future modification, warn and fix up the expected value.
*/
- SCHED_WARN_ON(bucket->value > rq_clamp);
+ WARN_ON_ONCE(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
uclamp_rq_set(rq, clamp_id, bkt_clamp);
@@ -1757,7 +1752,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
* The condition is constructed such that a NOP is generated when
* sched_uclamp_used is disabled.
*/
- if (!static_branch_unlikely(&sched_uclamp_used))
+ if (!uclamp_is_used())
return;
if (unlikely(!p->sched_class->uclamp_enabled))
@@ -1784,7 +1779,7 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
* The condition is constructed such that a NOP is generated when
* sched_uclamp_used is disabled.
*/
- if (!static_branch_unlikely(&sched_uclamp_used))
+ if (!uclamp_is_used())
return;
if (unlikely(!p->sched_class->uclamp_enabled))
@@ -1942,12 +1937,12 @@ static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write,
}
if (update_root_tg) {
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
uclamp_update_root_tg();
}
if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
uclamp_sync_util_min_rt_default();
}
@@ -2122,7 +2117,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
+ WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
ASSERT_EXCLUSIVE_WRITER(p->on_rq);
@@ -2727,7 +2722,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
* XXX do further audits, this smells like something putrid.
*/
if (ctx->flags & SCA_MIGRATE_DISABLE)
- SCHED_WARN_ON(!p->on_cpu);
+ WARN_ON_ONCE(!p->on_cpu);
else
lockdep_assert_held(&p->pi_lock);
@@ -3292,7 +3287,6 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
-#ifdef CONFIG_SCHED_DEBUG
unsigned int state = READ_ONCE(p->__state);
/*
@@ -3330,7 +3324,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(!cpu_online(new_cpu));
WARN_ON_ONCE(is_migration_disabled(p));
-#endif
trace_sched_migrate_task(p, new_cpu);
@@ -3922,13 +3915,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
- /*
- * The BPF scheduler may depend on select_task_rq() being invoked during
- * wakeups. In addition, @p may end up executing on a different CPU
- * regardless of what happens in the wakeup path making the ttwu_queue
- * optimization less meaningful. Skip if on SCX.
- */
- if (task_on_scx(p))
+ /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
+ if (!scx_allow_ttwu_queue(p))
return false;
/*
@@ -4196,7 +4184,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
- SCHED_WARN_ON(p->se.sched_delayed);
+ WARN_ON_ONCE(p->se.sched_delayed);
if (!ttwu_state_match(p, state, &success))
goto out;
@@ -4490,7 +4478,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_LIST_HEAD(&p->se.group_node);
/* A delayed task cannot be in clone(). */
- SCHED_WARN_ON(p->se.sched_delayed);
+ WARN_ON_ONCE(p->se.sched_delayed);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
@@ -5578,7 +5566,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
-#ifdef CONFIG_SCHED_DEBUG
static u64 cpu_resched_latency(struct rq *rq)
{
int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
@@ -5623,9 +5610,6 @@ static int __init setup_resched_latency_warn_ms(char *str)
return 1;
}
__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
-#else
-static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
-#endif /* CONFIG_SCHED_DEBUG */
/*
* This function gets called by the timer code, with HZ frequency.
@@ -5746,7 +5730,7 @@ static void sched_tick_remote(struct work_struct *work)
* we are always sure that there is no proxy (only a
* single task is running).
*/
- SCHED_WARN_ON(rq->curr != rq->donor);
+ WARN_ON_ONCE(rq->curr != rq->donor);
update_rq_clock(rq);
if (!is_idle_task(curr)) {
@@ -5966,7 +5950,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
- SCHED_WARN_ON(ct_state() == CT_STATE_USER);
+ WARN_ON_ONCE(ct_state() == CT_STATE_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -6719,9 +6703,7 @@ static void __sched notrace __schedule(int sched_mode)
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
-#ifdef CONFIG_SCHED_DEBUG
rq->last_seen_need_resched_ns = 0;
-#endif
if (likely(prev != next)) {
rq->nr_switches++;
@@ -6812,7 +6794,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
* deadlock if the callback attempts to acquire a lock which is
* already acquired.
*/
- SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
+ WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT);
/*
* If we are going to sleep and we have plugged IO queued,
@@ -7095,7 +7077,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
- WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
+ WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
@@ -7290,7 +7272,7 @@ int __sched __cond_resched(void)
return 1;
}
/*
- * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
+ * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick
* whether the current CPU is in an RCU read-side critical section,
* so the tick can report quiescent states even for CPUs looping
* in kernel context. In contrast, in non-preemptible kernels,
@@ -7299,6 +7281,8 @@ int __sched __cond_resched(void)
* RCU quiescent state. Therefore, the following code causes
* cond_resched() to report a quiescent state, but only when RCU
* is in urgent need of one.
+ * A third case, preemptible, but non-PREEMPT_RCU provides for
+ * urgently needed quiescent states via rcu_flavor_sched_clock_irq().
*/
#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
@@ -7647,10 +7631,57 @@ PREEMPT_MODEL_ACCESSOR(lazy);
#else /* !CONFIG_PREEMPT_DYNAMIC: */
+#define preempt_dynamic_mode -1
+
static inline void preempt_dynamic_init(void) { }
#endif /* CONFIG_PREEMPT_DYNAMIC */
+const char *preempt_modes[] = {
+ "none", "voluntary", "full", "lazy", NULL,
+};
+
+const char *preempt_model_str(void)
+{
+ bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) ||
+ IS_ENABLED(CONFIG_PREEMPT_LAZY));
+ static char buf[128];
+
+ if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) {
+ struct seq_buf s;
+
+ seq_buf_init(&s, buf, sizeof(buf));
+ seq_buf_puts(&s, "PREEMPT");
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ seq_buf_printf(&s, "%sRT%s",
+ brace ? "_{" : "_",
+ brace ? "," : "");
+
+ if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) {
+ seq_buf_printf(&s, "(%s)%s",
+ preempt_dynamic_mode > 0 ?
+ preempt_modes[preempt_dynamic_mode] : "undef",
+ brace ? "}" : "");
+ return seq_buf_str(&s);
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+ seq_buf_printf(&s, "LAZY%s",
+ brace ? "}" : "");
+ return seq_buf_str(&s);
+ }
+
+ return seq_buf_str(&s);
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD))
+ return "VOLUNTARY";
+
+ return "NONE";
+}
+
int io_schedule_prepare(void)
{
int old_iowait = current->in_iowait;
@@ -7765,10 +7796,9 @@ void show_state_filter(unsigned int state_filter)
sched_show_task(p);
}
-#ifdef CONFIG_SCHED_DEBUG
if (!state_filter)
sysrq_sched_debug_show();
-#endif
+
rcu_read_unlock();
/*
* Only show locks if all tasks are dumped:
@@ -8183,7 +8213,7 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- partition_sched_domains(1, NULL, NULL);
+ cpuset_reset_sched_domains();
if (--num_cpus_frozen)
return;
/*
@@ -8202,7 +8232,7 @@ static void cpuset_cpu_inactive(unsigned int cpu)
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
- partition_sched_domains(1, NULL, NULL);
+ cpuset_reset_sched_domains();
}
}
@@ -8424,9 +8454,9 @@ void __init sched_init_smp(void)
* CPU masks are stable and all blatant races in the below code cannot
* happen.
*/
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
sched_init_domains(cpu_active_mask);
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
@@ -9016,7 +9046,7 @@ void sched_release_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static struct task_group *sched_get_task_group(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -9028,13 +9058,7 @@ static struct task_group *sched_get_task_group(struct task_struct *tsk)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
-
- return tg;
-}
-
-static void sched_change_group(struct task_struct *tsk, struct task_group *group)
-{
- tsk->sched_task_group = group;
+ tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
@@ -9055,20 +9079,11 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- struct task_group *group;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
rq = rq_guard.rq;
- /*
- * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
- * group changes.
- */
- group = sched_get_task_group(tsk);
- if (group == tsk->sched_task_group)
- return;
-
update_rq_clock(rq);
running = task_current_donor(rq, tsk);
@@ -9079,7 +9094,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk, group);
+ sched_change_group(tsk);
if (!for_autogroup)
scx_cgroup_move_task(tsk);
@@ -9203,7 +9218,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
unsigned int clamps;
lockdep_assert_held(&uclamp_mutex);
- SCHED_WARN_ON(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held());
css_for_each_descendant_pre(css, top_css) {
uc_parent = css_tg(css)->parent
@@ -9295,7 +9310,7 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
if (req.ret)
return req.ret;
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
guard(mutex)(&uclamp_mutex);
guard(rcu)();
@@ -10538,7 +10553,7 @@ static void task_mm_cid_work(struct callback_head *work)
struct mm_struct *mm;
int weight, cpu;
- SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
+ WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
work->next = work; /* Prevent double-add */
if (t->flags & PF_EXITING)
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 1ef98a93eb1d..c4606ca89210 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -65,7 +65,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
* a cookie until after we've removed it, we must have core scheduling
* enabled here.
*/
- SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
+ WARN_ON_ONCE((p->core_cookie || cookie) && !sched_core_enabled(rq));
if (sched_core_enqueued(p))
sched_core_dequeue(rq, p, DEQUEUE_SAVE);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ff4df16b5186..ad45a8fea245 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -166,14 +166,14 @@ static inline unsigned long dl_bw_capacity(int i)
}
}
-static inline bool dl_bw_visited(int cpu, u64 gen)
+bool dl_bw_visited(int cpu, u64 cookie)
{
struct root_domain *rd = cpu_rq(cpu)->rd;
- if (rd->visit_gen == gen)
+ if (rd->visit_cookie == cookie)
return true;
- rd->visit_gen = gen;
+ rd->visit_cookie = cookie;
return false;
}
@@ -207,7 +207,7 @@ static inline unsigned long dl_bw_capacity(int i)
return SCHED_CAPACITY_SCALE;
}
-static inline bool dl_bw_visited(int cpu, u64 gen)
+bool dl_bw_visited(int cpu, u64 cookie)
{
return false;
}
@@ -249,8 +249,8 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw += dl_bw;
- SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
- SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+ WARN_ON_ONCE(dl_rq->running_bw < old); /* overflow */
+ WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw);
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
}
@@ -262,7 +262,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw -= dl_bw;
- SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
+ WARN_ON_ONCE(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
dl_rq->running_bw = 0;
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
@@ -276,7 +276,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw += dl_bw;
- SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
+ WARN_ON_ONCE(dl_rq->this_bw < old); /* overflow */
}
static inline
@@ -286,10 +286,10 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw -= dl_bw;
- SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
+ WARN_ON_ONCE(dl_rq->this_bw > old); /* underflow */
if (dl_rq->this_bw > old)
dl_rq->this_bw = 0;
- SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+ WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw);
}
static inline
@@ -1382,8 +1382,7 @@ static void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- timer->function = dl_task_timer;
+ hrtimer_setup(timer, dl_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
}
/*
@@ -1839,8 +1838,7 @@ static void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->inactive_timer;
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- timer->function = inactive_task_timer;
+ hrtimer_setup(timer, inactive_task_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
}
#define __node_2_dle(node) \
@@ -2956,7 +2954,7 @@ void dl_add_task_root_domain(struct task_struct *p)
struct dl_bw *dl_b;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- if (!dl_task(p)) {
+ if (!dl_task(p) || dl_entity_is_special(&p->dl)) {
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return;
}
@@ -2981,18 +2979,22 @@ void dl_clear_root_domain(struct root_domain *rd)
rd->dl_bw.total_bw = 0;
/*
- * dl_server bandwidth is only restored when CPUs are attached to root
- * domains (after domains are created or CPUs moved back to the
- * default root doamin).
+ * dl_servers are not tasks. Since dl_add_task_root_domain ignores
+ * them, we need to account for them here explicitly.
*/
for_each_cpu(i, rd->span) {
struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
if (dl_server(dl_se) && cpu_active(i))
- rd->dl_bw.total_bw += dl_se->dl_bw;
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i));
}
}
+void dl_clear_root_domain_cpu(int cpu)
+{
+ dl_clear_root_domain(cpu_rq(cpu)->rd);
+}
+
#endif /* CONFIG_SMP */
static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@ -3171,15 +3173,18 @@ DEFINE_SCHED_CLASS(dl) = {
#endif
};
-/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
-static u64 dl_generation;
+/*
+ * Used for dl_bw check and update, used under sched_rt_handler()::mutex and
+ * sched_domains_mutex.
+ */
+u64 dl_cookie;
int sched_dl_global_validate(void)
{
u64 runtime = global_rt_runtime();
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
- u64 gen = ++dl_generation;
+ u64 cookie = ++dl_cookie;
struct dl_bw *dl_b;
int cpu, cpus, ret = 0;
unsigned long flags;
@@ -3192,7 +3197,7 @@ int sched_dl_global_validate(void)
for_each_online_cpu(cpu) {
rcu_read_lock_sched();
- if (dl_bw_visited(cpu, gen))
+ if (dl_bw_visited(cpu, cookie))
goto next;
dl_b = dl_bw_of(cpu);
@@ -3229,7 +3234,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
void sched_dl_do_global(void)
{
u64 new_bw = -1;
- u64 gen = ++dl_generation;
+ u64 cookie = ++dl_cookie;
struct dl_bw *dl_b;
int cpu;
unsigned long flags;
@@ -3240,7 +3245,7 @@ void sched_dl_do_global(void)
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
- if (dl_bw_visited(cpu, gen)) {
+ if (dl_bw_visited(cpu, cookie)) {
rcu_read_unlock_sched();
continue;
}
@@ -3567,9 +3572,7 @@ void dl_bw_free(int cpu, u64 dl_bw)
}
#endif
-#ifdef CONFIG_SCHED_DEBUG
void print_dl_stats(struct seq_file *m, int cpu)
{
print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
}
-#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ef047add7f9e..56ae54e0ce6a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -244,11 +244,13 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
static int sched_dynamic_show(struct seq_file *m, void *v)
{
- static const char * preempt_modes[] = {
- "none", "voluntary", "full", "lazy",
- };
- int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
+ int j;
+
+ /* Count entries in NULL terminated preempt_modes */
+ for (j = 0; preempt_modes[j]; j++)
+ ;
+ j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
for (; i < j; i++) {
if (preempt_dynamic_mode == i)
@@ -292,7 +294,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
bool orig;
cpus_read_lock();
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
orig = sched_debug_verbose;
result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
@@ -304,7 +306,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
sd_dentry = NULL;
}
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
cpus_read_unlock();
return result;
@@ -515,9 +517,9 @@ static __init int sched_init_debug(void)
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
update_sched_domain_debugfs();
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
#endif
#ifdef CONFIG_NUMA_BALANCING
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7b9dfee858e7..21575d39c376 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6,6 +6,9 @@
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
+#include <linux/btf_ids.h>
+#include "ext_idle.h"
+
#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
enum scx_consts {
@@ -93,7 +96,7 @@ enum scx_ops_flags {
/*
* Keep built-in idle tracking even if ops.update_idle() is implemented.
*/
- SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
/*
* By default, if there are no other task to run on the CPU, ext core
@@ -101,7 +104,7 @@ enum scx_ops_flags {
* flag is specified, such tasks are passed to ops.enqueue() with
* %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
*/
- SCX_OPS_ENQ_LAST = 1LLU << 1,
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
/*
* An exiting task may schedule after PF_EXITING is set. In such cases,
@@ -114,13 +117,13 @@ enum scx_ops_flags {
* depend on pid lookups and wants to handle these tasks directly, the
* following flag can be used.
*/
- SCX_OPS_ENQ_EXITING = 1LLU << 2,
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
/*
* If set, only tasks with policy set to SCHED_EXT are attached to
* sched_ext. If clear, SCHED_NORMAL tasks are also included.
*/
- SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
/*
* A migration disabled task can only execute on its current CPU. By
@@ -133,7 +136,29 @@ enum scx_ops_flags {
* current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
* and thus may disagree with cpumask_weight(p->cpus_ptr).
*/
- SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
+ * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+ * ops.enqueue() on the ops.select_cpu() selected or the wakee's
+ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+ * transfers. When this optimization is enabled, ops.select_cpu() is
+ * skipped in some cases (when racing against the wakee switching out).
+ * As the BPF scheduler may depend on ops.select_cpu() being invoked
+ * during wakeups, queued wakeup is disabled by default.
+ *
+ * If this ops flag is set, queued wakeup optimization is enabled and
+ * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+ * wakee's CPU without preceding ops.select_cpu() even for tasks which
+ * may be executed on multiple CPUs.
+ */
+ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
+
+ /*
+ * If set, enable per-node idle cpumasks. If clear, use a single global
+ * flat idle cpumask.
+ */
+ SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
/*
* CPU cgroup support flags
@@ -144,7 +169,9 @@ enum scx_ops_flags {
SCX_OPS_ENQ_LAST |
SCX_OPS_ENQ_EXITING |
SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
SCX_OPS_HAS_CGROUP_WEIGHT,
};
@@ -779,6 +806,7 @@ enum scx_deq_flags {
enum scx_pick_idle_cpu_flags {
SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+ SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
};
enum scx_kick_flags {
@@ -894,16 +922,11 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
static struct sched_ext_ops scx_ops;
static bool scx_warned_zero_slice;
+DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
-static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
-
-#ifdef CONFIG_SMP
-static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
-static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
-#endif
static struct static_key_false scx_has_op[SCX_OPI_END] =
{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
@@ -938,21 +961,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
static struct delayed_work scx_watchdog_work;
-/* idle tracking */
-#ifdef CONFIG_SMP
-#ifdef CONFIG_CPUMASK_OFFSTACK
-#define CL_ALIGNED_IF_ONSTACK
-#else
-#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
-#endif
-
-static struct {
- cpumask_var_t cpu;
- cpumask_var_t smt;
-} idle_masks CL_ALIGNED_IF_ONSTACK;
-
-#endif /* CONFIG_SMP */
-
/* for %SCX_KICK_WAIT */
static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
@@ -1473,6 +1481,117 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return p;
}
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * The total number of tasks enqueued (or pick_task-ed) with a
+ * default time slice (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_ENQ_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+/*
+ * The event counter is organized by a per-CPU variable to minimize the
+ * accounting overhead without synchronization. A system-wide view on the
+ * event counter is constructed when requested by scx_bpf_get_event_stat().
+ */
+static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
+
+/**
+ * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of the event occured
+ *
+ * This can be used when preemption is not disabled.
+ */
+#define scx_add_event(name, cnt) do { \
+ this_cpu_add(event_stats_cpu.name, cnt); \
+ trace_sched_ext_event(#name, cnt); \
+} while(0)
+
+/**
+ * __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of the event occured
+ *
+ * This should be used only when preemption is disabled.
+ */
+#define __scx_add_event(name, cnt) do { \
+ __this_cpu_add(event_stats_cpu.name, cnt); \
+ trace_sched_ext_event(#name, cnt); \
+} while(0)
+
+/**
+ * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
+ * @dst_e: destination event stats
+ * @src_e: source event stats
+ * @kind: a kind of event to be aggregated
+ */
+#define scx_agg_event(dst_e, src_e, kind) do { \
+ (dst_e)->kind += READ_ONCE((src_e)->kind); \
+} while(0)
+
+/**
+ * scx_dump_event - Dump an event 'kind' in 'events' to 's'
+ * @s: output seq_buf
+ * @events: event stats
+ * @kind: a kind of event to dump
+ */
+#define scx_dump_event(s, events, kind) do { \
+ dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \
+} while (0)
+
+
+static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
+
static enum scx_ops_enable_state scx_ops_enable_state(void)
{
return atomic_read(&scx_ops_enable_state_var);
@@ -2018,21 +2137,27 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
if (!scx_rq_online(rq))
goto local;
- if (scx_rq_bypassing(rq))
+ if (scx_rq_bypassing(rq)) {
+ __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
goto global;
+ }
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
goto direct;
/* see %SCX_OPS_ENQ_EXITING */
if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
- unlikely(p->flags & PF_EXITING))
+ unlikely(p->flags & PF_EXITING)) {
+ __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
goto local;
+ }
/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
- is_migration_disabled(p))
+ is_migration_disabled(p)) {
+ __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
goto local;
+ }
if (!SCX_HAS_OP(enqueue))
goto global;
@@ -2072,6 +2197,7 @@ local:
*/
touch_core_sched(rq, p);
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
local_norefill:
dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
return;
@@ -2079,6 +2205,7 @@ local_norefill:
global:
touch_core_sched(rq, p); /* see the comment in local: */
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
dispatch_enqueue(find_global_dsq(p), p, enq_flags);
}
@@ -2150,6 +2277,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
+
+ if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
+ unlikely(cpu_of(rq) != p->scx.selected_cpu))
+ __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
}
static void ops_dequeue(struct task_struct *p, u64 deq_flags)
@@ -2337,11 +2468,11 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
* The caller must ensure that @p and @rq are on different CPUs.
*/
static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
- bool trigger_error)
+ bool enforce)
{
int cpu = cpu_of(rq);
- SCHED_WARN_ON(task_cpu(p) == cpu);
+ WARN_ON_ONCE(task_cpu(p) == cpu);
/*
* If @p has migration disabled, @p->cpus_ptr is updated to contain only
@@ -2356,7 +2487,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
* easily be masked if task_allowed_on_cpu() is done first.
*/
if (unlikely(is_migration_disabled(p))) {
- if (trigger_error)
+ if (enforce)
scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
p->comm, p->pid, task_cpu(p), cpu);
return false;
@@ -2369,14 +2500,17 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
* picked CPU is outside the allowed mask.
*/
if (!task_allowed_on_cpu(p, cpu)) {
- if (trigger_error)
+ if (enforce)
scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
cpu, p->comm, p->pid);
return false;
}
- if (!scx_rq_online(rq))
+ if (!scx_rq_online(rq)) {
+ if (enforce)
+ __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
+ }
return true;
}
@@ -2446,7 +2580,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
}
#else /* CONFIG_SMP */
static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
-static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; }
+static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; }
static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
#endif /* CONFIG_SMP */
@@ -2893,6 +3027,7 @@ no_tasks:
if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
goto has_tasks;
}
rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3159,8 +3294,10 @@ static struct task_struct *pick_task_scx(struct rq *rq)
*/
if (keep_prev) {
p = prev;
- if (!p->scx.slice)
+ if (!p->scx.slice) {
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ }
} else {
p = first_local_task(rq);
if (!p) {
@@ -3176,6 +3313,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
scx_warned_zero_slice = true;
}
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
}
}
@@ -3220,418 +3358,10 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
#ifdef CONFIG_SMP
-static bool test_and_clear_cpu_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- /*
- * SMT mask should be cleared whether we can claim @cpu or not. The SMT
- * cluster is not wholly idle either way. This also prevents
- * scx_pick_idle_cpu() from getting caught in an infinite loop.
- */
- if (sched_smt_active()) {
- const struct cpumask *smt = cpu_smt_mask(cpu);
-
- /*
- * If offline, @cpu is not its own sibling and
- * scx_pick_idle_cpu() can get caught in an infinite loop as
- * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
- * is eventually cleared.
- *
- * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
- * reduce memory writes, which may help alleviate cache
- * coherence pressure.
- */
- if (cpumask_intersects(smt, idle_masks.smt))
- cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
- else if (cpumask_test_cpu(cpu, idle_masks.smt))
- __cpumask_clear_cpu(cpu, idle_masks.smt);
- }
-#endif
- return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
-}
-
-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
-{
- int cpu;
-
-retry:
- if (sched_smt_active()) {
- cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
- if (cpu < nr_cpu_ids)
- goto found;
-
- if (flags & SCX_PICK_IDLE_CORE)
- return -EBUSY;
- }
-
- cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
- if (cpu >= nr_cpu_ids)
- return -EBUSY;
-
-found:
- if (test_and_clear_cpu_idle(cpu))
- return cpu;
- else
- goto retry;
-}
-
-/*
- * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
- * domain is not defined).
- */
-static unsigned int llc_weight(s32 cpu)
-{
- struct sched_domain *sd;
-
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd)
- return 0;
-
- return sd->span_weight;
-}
-
-/*
- * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
- * domain is not defined).
- */
-static struct cpumask *llc_span(s32 cpu)
-{
- struct sched_domain *sd;
-
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd)
- return 0;
-
- return sched_domain_span(sd);
-}
-
-/*
- * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
- * NUMA domain is not defined).
- */
-static unsigned int numa_weight(s32 cpu)
-{
- struct sched_domain *sd;
- struct sched_group *sg;
-
- sd = rcu_dereference(per_cpu(sd_numa, cpu));
- if (!sd)
- return 0;
- sg = sd->groups;
- if (!sg)
- return 0;
-
- return sg->group_weight;
-}
-
-/*
- * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
- * domain is not defined).
- */
-static struct cpumask *numa_span(s32 cpu)
-{
- struct sched_domain *sd;
- struct sched_group *sg;
-
- sd = rcu_dereference(per_cpu(sd_numa, cpu));
- if (!sd)
- return NULL;
- sg = sd->groups;
- if (!sg)
- return NULL;
-
- return sched_group_span(sg);
-}
-
-/*
- * Return true if the LLC domains do not perfectly overlap with the NUMA
- * domains, false otherwise.
- */
-static bool llc_numa_mismatch(void)
-{
- int cpu;
-
- /*
- * We need to scan all online CPUs to verify whether their scheduling
- * domains overlap.
- *
- * While it is rare to encounter architectures with asymmetric NUMA
- * topologies, CPU hotplugging or virtualized environments can result
- * in asymmetric configurations.
- *
- * For example:
- *
- * NUMA 0:
- * - LLC 0: cpu0..cpu7
- * - LLC 1: cpu8..cpu15 [offline]
- *
- * NUMA 1:
- * - LLC 0: cpu16..cpu23
- * - LLC 1: cpu24..cpu31
- *
- * In this case, if we only check the first online CPU (cpu0), we might
- * incorrectly assume that the LLC and NUMA domains are fully
- * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
- * domains).
- */
- for_each_online_cpu(cpu)
- if (llc_weight(cpu) != numa_weight(cpu))
- return true;
-
- return false;
-}
-
-/*
- * Initialize topology-aware scheduling.
- *
- * Detect if the system has multiple LLC or multiple NUMA domains and enable
- * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
- * selection policy.
- *
- * Assumption: the kernel's internal topology representation assumes that each
- * CPU belongs to a single LLC domain, and that each LLC domain is entirely
- * contained within a single NUMA node.
- */
-static void update_selcpu_topology(void)
-{
- bool enable_llc = false, enable_numa = false;
- unsigned int nr_cpus;
- s32 cpu = cpumask_first(cpu_online_mask);
-
- /*
- * Enable LLC domain optimization only when there are multiple LLC
- * domains among the online CPUs. If all online CPUs are part of a
- * single LLC domain, the idle CPU selection logic can choose any
- * online CPU without bias.
- *
- * Note that it is sufficient to check the LLC domain of the first
- * online CPU to determine whether a single LLC domain includes all
- * CPUs.
- */
- rcu_read_lock();
- nr_cpus = llc_weight(cpu);
- if (nr_cpus > 0) {
- if (nr_cpus < num_online_cpus())
- enable_llc = true;
- pr_debug("sched_ext: LLC=%*pb weight=%u\n",
- cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
- }
-
- /*
- * Enable NUMA optimization only when there are multiple NUMA domains
- * among the online CPUs and the NUMA domains don't perfectly overlaps
- * with the LLC domains.
- *
- * If all CPUs belong to the same NUMA node and the same LLC domain,
- * enabling both NUMA and LLC optimizations is unnecessary, as checking
- * for an idle CPU in the same domain twice is redundant.
- */
- nr_cpus = numa_weight(cpu);
- if (nr_cpus > 0) {
- if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
- enable_numa = true;
- pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
- cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
- }
- rcu_read_unlock();
-
- pr_debug("sched_ext: LLC idle selection %s\n",
- str_enabled_disabled(enable_llc));
- pr_debug("sched_ext: NUMA idle selection %s\n",
- str_enabled_disabled(enable_numa));
-
- if (enable_llc)
- static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
- else
- static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
- if (enable_numa)
- static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
- else
- static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
-}
-
-/*
- * Built-in CPU idle selection policy:
- *
- * 1. Prioritize full-idle cores:
- * - always prioritize CPUs from fully idle cores (both logical CPUs are
- * idle) to avoid interference caused by SMT.
- *
- * 2. Reuse the same CPU:
- * - prefer the last used CPU to take advantage of cached data (L1, L2) and
- * branch prediction optimizations.
- *
- * 3. Pick a CPU within the same LLC (Last-Level Cache):
- * - if the above conditions aren't met, pick a CPU that shares the same LLC
- * to maintain cache locality.
- *
- * 4. Pick a CPU within the same NUMA node, if enabled:
- * - choose a CPU from the same NUMA node to reduce memory access latency.
- *
- * 5. Pick any idle CPU usable by the task.
- *
- * Step 3 and 4 are performed only if the system has, respectively, multiple
- * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
- * scx_selcpu_topo_numa).
- *
- * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
- * we never call ops.select_cpu() for them, see select_task_rq().
- */
-static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
- u64 wake_flags, bool *found)
-{
- const struct cpumask *llc_cpus = NULL;
- const struct cpumask *numa_cpus = NULL;
- s32 cpu;
-
- *found = false;
-
- /*
- * This is necessary to protect llc_cpus.
- */
- rcu_read_lock();
-
- /*
- * Determine the scheduling domain only if the task is allowed to run
- * on all CPUs.
- *
- * This is done primarily for efficiency, as it avoids the overhead of
- * updating a cpumask every time we need to select an idle CPU (which
- * can be costly in large SMP systems), but it also aligns logically:
- * if a task's scheduling domain is restricted by user-space (through
- * CPU affinity), the task will simply use the flat scheduling domain
- * defined by user-space.
- */
- if (p->nr_cpus_allowed >= num_possible_cpus()) {
- if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = numa_span(prev_cpu);
-
- if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
- llc_cpus = llc_span(prev_cpu);
- }
-
- /*
- * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
- */
- if (wake_flags & SCX_WAKE_SYNC) {
- cpu = smp_processor_id();
-
- /*
- * If the waker's CPU is cache affine and prev_cpu is idle,
- * then avoid a migration.
- */
- if (cpus_share_cache(cpu, prev_cpu) &&
- test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- }
-
- /*
- * If the waker's local DSQ is empty, and the system is under
- * utilized, try to wake up @p to the local DSQ of the waker.
- *
- * Checking only for an empty local DSQ is insufficient as it
- * could give the wakee an unfair advantage when the system is
- * oversaturated.
- *
- * Checking only for the presence of idle CPUs is also
- * insufficient as the local DSQ of the waker could have tasks
- * piled up on it even if there is an idle core elsewhere on
- * the system.
- */
- if (!cpumask_empty(idle_masks.cpu) &&
- !(current->flags & PF_EXITING) &&
- cpu_rq(cpu)->scx.local_dsq.nr == 0) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
- goto cpu_found;
- }
- }
-
- /*
- * If CPU has SMT, any wholly idle CPU is likely a better pick than
- * partially idle @prev_cpu.
- */
- if (sched_smt_active()) {
- /*
- * Keep using @prev_cpu if it's part of a fully idle core.
- */
- if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
- test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- }
-
- /*
- * Search for any fully idle core in the same LLC domain.
- */
- if (llc_cpus) {
- cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any fully idle core in the same NUMA node.
- */
- if (numa_cpus) {
- cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any full idle core usable by the task.
- */
- cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Use @prev_cpu if it's idle.
- */
- if (test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- }
-
- /*
- * Search for any idle CPU in the same LLC domain.
- */
- if (llc_cpus) {
- cpu = scx_pick_idle_cpu(llc_cpus, 0);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any idle CPU in the same NUMA node.
- */
- if (numa_cpus) {
- cpu = scx_pick_idle_cpu(numa_cpus, 0);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any idle CPU usable by the task.
- */
- cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
- if (cpu >= 0)
- goto cpu_found;
-
- rcu_read_unlock();
- return prev_cpu;
-
-cpu_found:
- rcu_read_unlock();
-
- *found = true;
- return cpu;
-}
-
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
+ bool rq_bypass;
+
/*
* sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
* can be a good migration opportunity with low cache and memory
@@ -3645,7 +3375,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
if (unlikely(wake_flags & WF_EXEC))
return prev_cpu;
- if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
+ rq_bypass = scx_rq_bypassing(task_rq(p));
+ if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
s32 cpu;
struct task_struct **ddsp_taskp;
@@ -3655,20 +3386,27 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
select_cpu, p, prev_cpu, wake_flags);
+ p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
return cpu;
else
return prev_cpu;
} else {
- bool found;
s32 cpu;
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
- if (found) {
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ if (cpu >= 0) {
p->scx.slice = SCX_SLICE_DFL;
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ } else {
+ cpu = prev_cpu;
}
+ p->scx.selected_cpu = cpu;
+
+ if (rq_bypass)
+ __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
return cpu;
}
}
@@ -3696,90 +3434,6 @@ static void set_cpus_allowed_scx(struct task_struct *p,
(struct cpumask *)p->cpus_ptr);
}
-static void reset_idle_masks(void)
-{
- /*
- * Consider all online cpus idle. Should converge to the actual state
- * quickly.
- */
- cpumask_copy(idle_masks.cpu, cpu_online_mask);
- cpumask_copy(idle_masks.smt, cpu_online_mask);
-}
-
-static void update_builtin_idle(int cpu, bool idle)
-{
- assign_cpu(cpu, idle_masks.cpu, idle);
-
-#ifdef CONFIG_SCHED_SMT
- if (sched_smt_active()) {
- const struct cpumask *smt = cpu_smt_mask(cpu);
-
- if (idle) {
- /*
- * idle_masks.smt handling is racy but that's fine as
- * it's only for optimization and self-correcting.
- */
- if (!cpumask_subset(smt, idle_masks.cpu))
- return;
- cpumask_or(idle_masks.smt, idle_masks.smt, smt);
- } else {
- cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
- }
- }
-#endif
-}
-
-/*
- * Update the idle state of a CPU to @idle.
- *
- * If @do_notify is true, ops.update_idle() is invoked to notify the scx
- * scheduler of an actual idle state transition (idle to busy or vice
- * versa). If @do_notify is false, only the idle state in the idle masks is
- * refreshed without invoking ops.update_idle().
- *
- * This distinction is necessary, because an idle CPU can be "reserved" and
- * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
- * busy even if no tasks are dispatched. In this case, the CPU may return
- * to idle without a true state transition. Refreshing the idle masks
- * without invoking ops.update_idle() ensures accurate idle state tracking
- * while avoiding unnecessary updates and maintaining balanced state
- * transitions.
- */
-void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
-{
- int cpu = cpu_of(rq);
-
- lockdep_assert_rq_held(rq);
-
- /*
- * Trigger ops.update_idle() only when transitioning from a task to
- * the idle thread and vice versa.
- *
- * Idle transitions are indicated by do_notify being set to true,
- * managed by put_prev_task_idle()/set_next_task_idle().
- */
- if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
-
- /*
- * Update the idle masks:
- * - for real idle transitions (do_notify == true)
- * - for idle-to-idle transitions (indicated by the previous task
- * being the idle thread, managed by pick_task_idle())
- *
- * Skip updating idle masks if the previous task is not the idle
- * thread, since set_next_task_idle() has already handled it when
- * transitioning from a task to the idle thread (calling this
- * function with do_notify == true).
- *
- * In this way we can avoid updating the idle masks twice,
- * unnecessarily.
- */
- if (static_branch_likely(&scx_builtin_idle_enabled))
- if (do_notify || is_idle_task(rq->curr))
- update_builtin_idle(cpu, idle);
-}
-
static void handle_hotplug(struct rq *rq, bool online)
{
int cpu = cpu_of(rq);
@@ -3787,7 +3441,7 @@ static void handle_hotplug(struct rq *rq, bool online)
atomic_long_inc(&scx_hotplug_seq);
if (scx_enabled())
- update_selcpu_topology();
+ scx_idle_update_selcpu_topology(&scx_ops);
if (online && SCX_HAS_OP(cpu_online))
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
@@ -3819,12 +3473,6 @@ static void rq_offline_scx(struct rq *rq)
rq->scx.flags &= ~SCX_RQ_ONLINE;
}
-#else /* CONFIG_SMP */
-
-static bool test_and_clear_cpu_idle(int cpu) { return false; }
-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; }
-static void reset_idle_masks(void) {}
-
#endif /* CONFIG_SMP */
static bool check_rq_for_timeouts(struct rq *rq)
@@ -4749,8 +4397,33 @@ static ssize_t scx_attr_ops_show(struct kobject *kobj,
}
SCX_ATTR(ops);
+#define scx_attr_event_show(buf, at, events, kind) ({ \
+ sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \
+})
+
+static ssize_t scx_attr_events_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ struct scx_event_stats events;
+ int at = 0;
+
+ scx_bpf_events(&events, sizeof(events));
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
+ return at;
+}
+SCX_ATTR(events);
+
static struct attribute *scx_sched_attrs[] = {
&scx_attr_ops.attr,
+ &scx_attr_events.attr,
NULL,
};
ATTRIBUTE_GROUPS(scx_sched);
@@ -4862,6 +4535,8 @@ static void scx_clear_softlockup(void)
static void scx_ops_bypass(bool bypass)
{
static DEFINE_RAW_SPINLOCK(bypass_lock);
+ static unsigned long bypass_timestamp;
+
int cpu;
unsigned long flags;
@@ -4871,11 +4546,15 @@ static void scx_ops_bypass(bool bypass)
WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
if (scx_ops_bypass_depth != 1)
goto unlock;
+ bypass_timestamp = ktime_get_ns();
+ scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
} else {
scx_ops_bypass_depth--;
WARN_ON_ONCE(scx_ops_bypass_depth < 0);
if (scx_ops_bypass_depth != 0)
goto unlock;
+ scx_add_event(SCX_EV_BYPASS_DURATION,
+ ktime_get_ns() - bypass_timestamp);
}
atomic_inc(&scx_ops_breather_depth);
@@ -5095,11 +4774,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
static_branch_disable(&__scx_ops_enabled);
for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
static_branch_disable(&scx_has_op[i]);
+ static_branch_disable(&scx_ops_allow_queued_wakeup);
static_branch_disable(&scx_ops_enq_last);
static_branch_disable(&scx_ops_enq_exiting);
static_branch_disable(&scx_ops_enq_migration_disabled);
static_branch_disable(&scx_ops_cpu_preempt);
- static_branch_disable(&scx_builtin_idle_enabled);
+ scx_idle_disable();
synchronize_rcu();
if (ei->kind >= SCX_EXIT_ERROR) {
@@ -5349,6 +5029,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
.at_jiffies = jiffies,
};
struct seq_buf s;
+ struct scx_event_stats events;
unsigned long flags;
char *buf;
int cpu;
@@ -5457,6 +5138,21 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
rq_unlock(rq, &rf);
}
+ dump_newline(&s);
+ dump_line(&s, "Event counters");
+ dump_line(&s, "--------------");
+
+ scx_bpf_events(&events, sizeof(events));
+ scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
+
if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
memcpy(ei->dump + dump_len - sizeof(trunc_marker),
trunc_marker, sizeof(trunc_marker));
@@ -5546,6 +5242,16 @@ static int validate_ops(const struct sched_ext_ops *ops)
return -EINVAL;
}
+ /*
+ * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
+ * selection policy to be enabled.
+ */
+ if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
+ (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
+ scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -5564,6 +5270,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
mutex_lock(&scx_ops_enable_mutex);
+ /*
+ * Clear event counters so a new scx scheduler gets
+ * fresh event counter values.
+ */
+ for_each_possible_cpu(cpu) {
+ struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
+ memset(e, 0, sizeof(*e));
+ }
+
if (!scx_ops_helper) {
WRITE_ONCE(scx_ops_helper,
scx_create_rt_helper("sched_ext_ops_helper"));
@@ -5661,9 +5376,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable_cpuslocked(&scx_has_op[i]);
check_hotplug_seq(ops);
-#ifdef CONFIG_SMP
- update_selcpu_topology();
-#endif
+ scx_idle_update_selcpu_topology(ops);
+
cpus_read_unlock();
ret = validate_ops(ops);
@@ -5702,9 +5416,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (((void (**)(void))ops)[i])
static_branch_enable(&scx_has_op[i]);
+ if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
+ static_branch_enable(&scx_ops_allow_queued_wakeup);
if (ops->flags & SCX_OPS_ENQ_LAST)
static_branch_enable(&scx_ops_enq_last);
-
if (ops->flags & SCX_OPS_ENQ_EXITING)
static_branch_enable(&scx_ops_enq_exiting);
if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
@@ -5712,12 +5427,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (scx_ops.cpu_acquire || scx_ops.cpu_release)
static_branch_enable(&scx_ops_cpu_preempt);
- if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
- reset_idle_masks();
- static_branch_enable(&scx_builtin_idle_enabled);
- } else {
- static_branch_disable(&scx_builtin_idle_enabled);
- }
+ scx_idle_enable(ops);
/*
* Lock out forks, cgroup on/offlining and moves before opening the
@@ -6356,10 +6066,8 @@ void __init init_sched_ext_class(void)
SCX_TG_ONLINE);
BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
-#ifdef CONFIG_SMP
- BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
- BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
-#endif
+ scx_idle_init_masks();
+
scx_kick_cpus_pnt_seqs =
__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
__alignof__(scx_kick_cpus_pnt_seqs[0]));
@@ -6367,15 +6075,16 @@ void __init init_sched_ext_class(void)
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
+ int n = cpu_to_node(cpu);
init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
INIT_LIST_HEAD(&rq->scx.runnable_list);
INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
@@ -6392,65 +6101,6 @@ void __init init_sched_ext_class(void)
/********************************************************************************
* Helpers that can be called from the BPF scheduler.
*/
-#include <linux/btf_ids.h>
-
-__bpf_kfunc_start_defs();
-
-static bool check_builtin_idle_enabled(void)
-{
- if (static_branch_likely(&scx_builtin_idle_enabled))
- return true;
-
- scx_ops_error("built-in idle tracking is disabled");
- return false;
-}
-
-/**
- * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
- * @p: task_struct to select a CPU for
- * @prev_cpu: CPU @p was on previously
- * @wake_flags: %SCX_WAKE_* flags
- * @is_idle: out parameter indicating whether the returned CPU is idle
- *
- * Can only be called from ops.select_cpu() if the built-in CPU selection is
- * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
- * @p, @prev_cpu and @wake_flags match ops.select_cpu().
- *
- * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
- * currently idle and thus a good candidate for direct dispatching.
- */
-__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
- u64 wake_flags, bool *is_idle)
-{
- if (!ops_cpu_valid(prev_cpu, NULL))
- goto prev_cpu;
-
- if (!check_builtin_idle_enabled())
- goto prev_cpu;
-
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
- goto prev_cpu;
-
-#ifdef CONFIG_SMP
- return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
-#endif
-
-prev_cpu:
- *is_idle = false;
- return prev_cpu;
-}
-
-__bpf_kfunc_end_defs();
-
-BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
-BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
-
-static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
- .owner = THIS_MODULE,
- .set = &scx_kfunc_ids_select_cpu,
-};
-
static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
{
if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
@@ -6972,8 +6622,12 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
* CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
* the current local DSQ for running tasks and thus are not
* visible to the BPF scheduler.
+ *
+ * Also skip re-enqueueing tasks that can only run on this
+ * CPU, as they would just be re-added to the same local
+ * DSQ without any benefit.
*/
- if (p->migration_pending)
+ if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1)
continue;
dispatch_dequeue(rq, p);
@@ -7470,6 +7124,16 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
}
/**
+ * scx_bpf_nr_node_ids - Return the number of possible node IDs
+ *
+ * All valid node IDs in the system are smaller than the returned value.
+ */
+__bpf_kfunc u32 scx_bpf_nr_node_ids(void)
+{
+ return nr_node_ids;
+}
+
+/**
* scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
*
* All valid CPU IDs in the system are smaller than the returned value.
@@ -7510,142 +7174,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
}
/**
- * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
- * per-CPU cpumask.
- *
- * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
- */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
-{
- if (!check_builtin_idle_enabled())
- return cpu_none_mask;
-
-#ifdef CONFIG_SMP
- return idle_masks.cpu;
-#else
- return cpu_none_mask;
-#endif
-}
-
-/**
- * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
- * per-physical-core cpumask. Can be used to determine if an entire physical
- * core is free.
- *
- * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
- */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
-{
- if (!check_builtin_idle_enabled())
- return cpu_none_mask;
-
-#ifdef CONFIG_SMP
- if (sched_smt_active())
- return idle_masks.smt;
- else
- return idle_masks.cpu;
-#else
- return cpu_none_mask;
-#endif
-}
-
-/**
- * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
- * either the percpu, or SMT idle-tracking cpumask.
- * @idle_mask: &cpumask to use
- */
-__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
-{
- /*
- * Empty function body because we aren't actually acquiring or releasing
- * a reference to a global idle cpumask, which is read-only in the
- * caller and is never released. The acquire / release semantics here
- * are just used to make the cpumask a trusted pointer in the caller.
- */
-}
-
-/**
- * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
- * @cpu: cpu to test and clear idle for
- *
- * Returns %true if @cpu was idle and its idle state was successfully cleared.
- * %false otherwise.
- *
- * Unavailable if ops.update_idle() is implemented and
- * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
- */
-__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
-{
- if (!check_builtin_idle_enabled())
- return false;
-
- if (ops_cpu_valid(cpu, NULL))
- return test_and_clear_cpu_idle(cpu);
- else
- return false;
-}
-
-/**
- * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
- * @cpus_allowed: Allowed cpumask
- * @flags: %SCX_PICK_IDLE_CPU_* flags
- *
- * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
- * number on success. -%EBUSY if no matching cpu was found.
- *
- * Idle CPU tracking may race against CPU scheduling state transitions. For
- * example, this function may return -%EBUSY as CPUs are transitioning into the
- * idle state. If the caller then assumes that there will be dispatch events on
- * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
- * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
- * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
- * event in the near future.
- *
- * Unavailable if ops.update_idle() is implemented and
- * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
- */
-__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
- u64 flags)
-{
- if (!check_builtin_idle_enabled())
- return -EBUSY;
-
- return scx_pick_idle_cpu(cpus_allowed, flags);
-}
-
-/**
- * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
- * @cpus_allowed: Allowed cpumask
- * @flags: %SCX_PICK_IDLE_CPU_* flags
- *
- * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
- * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
- * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
- * empty.
- *
- * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
- * set, this function can't tell which CPUs are idle and will always pick any
- * CPU.
- */
-__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
- u64 flags)
-{
- s32 cpu;
-
- if (static_branch_likely(&scx_builtin_idle_enabled)) {
- cpu = scx_pick_idle_cpu(cpus_allowed, flags);
- if (cpu >= 0)
- return cpu;
- }
-
- cpu = cpumask_any_distribute(cpus_allowed);
- if (cpu < nr_cpu_ids)
- return cpu;
- else
- return -EBUSY;
-}
-
-/**
* scx_bpf_task_running - Is task currently running?
* @p: task of interest
*/
@@ -7765,6 +7293,43 @@ __bpf_kfunc u64 scx_bpf_now(void)
return clock;
}
+/*
+ * scx_bpf_events - Get a system-wide event counter to
+ * @events: output buffer from a BPF program
+ * @events__sz: @events len, must end in '__sz'' for the verifier
+ */
+__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
+ size_t events__sz)
+{
+ struct scx_event_stats e_sys, *e_cpu;
+ int cpu;
+
+ /* Aggregate per-CPU event counters into the system-wide counters. */
+ memset(&e_sys, 0, sizeof(e_sys));
+ for_each_possible_cpu(cpu) {
+ e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+ }
+
+ /*
+ * We cannot entirely trust a BPF-provided size since a BPF program
+ * might be compiled against a different vmlinux.h, of which
+ * scx_event_stats would be larger (a newer vmlinux.h) or smaller
+ * (an older vmlinux.h). Hence, we use the smaller size to avoid
+ * memory corruption.
+ */
+ events__sz = min(events__sz, sizeof(*events));
+ memcpy(events, &e_sys, events__sz);
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7780,6 +7345,7 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
+BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
@@ -7797,6 +7363,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
BTF_ID_FLAGS(func, scx_bpf_now)
+BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
@@ -7820,8 +7387,6 @@ static int __init scx_init(void)
* check using scx_kf_allowed().
*/
if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
- &scx_kfunc_set_select_cpu)) ||
- (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_enqueue_dispatch)) ||
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_dispatch)) ||
@@ -7841,6 +7406,12 @@ static int __init scx_init(void)
return ret;
}
+ ret = scx_idle_init();
+ if (ret) {
+ pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret);
+ return ret;
+ }
+
ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
if (ret) {
pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 1079b56b0f7a..1bda96b19a1b 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,6 +8,8 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
+DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
+
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -34,6 +36,13 @@ static inline bool task_on_scx(const struct task_struct *p)
return scx_enabled() && p->sched_class == &ext_sched_class;
}
+static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
+{
+ return !scx_enabled() ||
+ static_branch_likely(&scx_ops_allow_queued_wakeup) ||
+ p->sched_class != &ext_sched_class;
+}
+
#ifdef CONFIG_SCHED_CORE
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi);
@@ -52,6 +61,7 @@ static inline void scx_rq_activate(struct rq *rq) {}
static inline void scx_rq_deactivate(struct rq *rq) {}
static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
static inline bool task_on_scx(const struct task_struct *p) { return false; }
+static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; }
static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
new file mode 100644
index 000000000000..52c36a70a3d0
--- /dev/null
+++ b/kernel/sched/ext_idle.c
@@ -0,0 +1,1171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Built-in idle CPU tracking policy.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
+ */
+#include "ext_idle.h"
+
+/* Enable/disable built-in idle CPU selection policy */
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+
+/* Enable/disable per-node idle cpumasks */
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
+
+#ifdef CONFIG_SMP
+/* Enable/disable LLC aware optimizations */
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+
+/* Enable/disable NUMA aware optimizations */
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+
+/*
+ * cpumasks to track idle CPUs within each NUMA node.
+ *
+ * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask
+ * from is used to track all the idle CPUs in the system.
+ */
+struct scx_idle_cpus {
+ cpumask_var_t cpu;
+ cpumask_var_t smt;
+};
+
+/*
+ * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE
+ * is not enabled).
+ */
+static struct scx_idle_cpus scx_idle_global_masks;
+
+/*
+ * Per-node idle cpumasks.
+ */
+static struct scx_idle_cpus **scx_idle_node_masks;
+
+/*
+ * Return the idle masks associated to a target @node.
+ *
+ * NUMA_NO_NODE identifies the global idle cpumask.
+ */
+static struct scx_idle_cpus *idle_cpumask(int node)
+{
+ return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node];
+}
+
+/*
+ * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if
+ * per-node idle cpumasks are disabled.
+ */
+static int scx_cpu_node_if_enabled(int cpu)
+{
+ if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
+ return NUMA_NO_NODE;
+
+ return cpu_to_node(cpu);
+}
+
+bool scx_idle_test_and_clear_cpu(int cpu)
+{
+ int node = scx_cpu_node_if_enabled(cpu);
+ struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * SMT mask should be cleared whether we can claim @cpu or not. The SMT
+ * cluster is not wholly idle either way. This also prevents
+ * scx_pick_idle_cpu() from getting caught in an infinite loop.
+ */
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+ struct cpumask *idle_smts = idle_cpumask(node)->smt;
+
+ /*
+ * If offline, @cpu is not its own sibling and
+ * scx_pick_idle_cpu() can get caught in an infinite loop as
+ * @cpu is never cleared from the idle SMT mask. Ensure that
+ * @cpu is eventually cleared.
+ *
+ * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
+ * reduce memory writes, which may help alleviate cache
+ * coherence pressure.
+ */
+ if (cpumask_intersects(smt, idle_smts))
+ cpumask_andnot(idle_smts, idle_smts, smt);
+ else if (cpumask_test_cpu(cpu, idle_smts))
+ __cpumask_clear_cpu(cpu, idle_smts);
+ }
+#endif
+
+ return cpumask_test_and_clear_cpu(cpu, idle_cpus);
+}
+
+/*
+ * Pick an idle CPU in a specific NUMA node.
+ */
+static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ int cpu;
+
+retry:
+ if (sched_smt_active()) {
+ cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ goto found;
+
+ if (flags & SCX_PICK_IDLE_CORE)
+ return -EBUSY;
+ }
+
+ cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed);
+ if (cpu >= nr_cpu_ids)
+ return -EBUSY;
+
+found:
+ if (scx_idle_test_and_clear_cpu(cpu))
+ return cpu;
+ else
+ goto retry;
+}
+
+/*
+ * Tracks nodes that have not yet been visited when searching for an idle
+ * CPU across all available nodes.
+ */
+static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited);
+
+/*
+ * Search for an idle CPU across all nodes, excluding @node.
+ */
+static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ nodemask_t *unvisited;
+ s32 cpu = -EBUSY;
+
+ preempt_disable();
+ unvisited = this_cpu_ptr(&per_cpu_unvisited);
+
+ /*
+ * Restrict the search to the online nodes (excluding the current
+ * node that has been visited already).
+ */
+ nodes_copy(*unvisited, node_states[N_ONLINE]);
+ node_clear(node, *unvisited);
+
+ /*
+ * Traverse all nodes in order of increasing distance, starting
+ * from @node.
+ *
+ * This loop is O(N^2), with N being the amount of NUMA nodes,
+ * which might be quite expensive in large NUMA systems. However,
+ * this complexity comes into play only when a scheduler enables
+ * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU
+ * without specifying a target NUMA node, so it shouldn't be a
+ * bottleneck is most cases.
+ *
+ * As a future optimization we may want to cache the list of nodes
+ * in a per-node array, instead of actually traversing them every
+ * time.
+ */
+ for_each_node_numadist(node, *unvisited) {
+ cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags);
+ if (cpu >= 0)
+ break;
+ }
+ preempt_enable();
+
+ return cpu;
+}
+
+/*
+ * Find an idle CPU in the system, starting from @node.
+ */
+s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ s32 cpu;
+
+ /*
+ * Always search in the starting node first (this is an
+ * optimization that can save some cycles even when the search is
+ * not limited to a single node).
+ */
+ cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags);
+ if (cpu >= 0)
+ return cpu;
+
+ /*
+ * Stop the search if we are using only a single global cpumask
+ * (NUMA_NO_NODE) or if the search is restricted to the first node
+ * only.
+ */
+ if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE)
+ return -EBUSY;
+
+ /*
+ * Extend the search to the other online nodes.
+ */
+ return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags);
+}
+
+/*
+ * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
+ * domain is not defined).
+ */
+static unsigned int llc_weight(s32 cpu)
+{
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return 0;
+
+ return sd->span_weight;
+}
+
+/*
+ * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
+ * domain is not defined).
+ */
+static struct cpumask *llc_span(s32 cpu)
+{
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return 0;
+
+ return sched_domain_span(sd);
+}
+
+/*
+ * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
+ * NUMA domain is not defined).
+ */
+static unsigned int numa_weight(s32 cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ if (!sd)
+ return 0;
+ sg = sd->groups;
+ if (!sg)
+ return 0;
+
+ return sg->group_weight;
+}
+
+/*
+ * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
+ * domain is not defined).
+ */
+static struct cpumask *numa_span(s32 cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ if (!sd)
+ return NULL;
+ sg = sd->groups;
+ if (!sg)
+ return NULL;
+
+ return sched_group_span(sg);
+}
+
+/*
+ * Return true if the LLC domains do not perfectly overlap with the NUMA
+ * domains, false otherwise.
+ */
+static bool llc_numa_mismatch(void)
+{
+ int cpu;
+
+ /*
+ * We need to scan all online CPUs to verify whether their scheduling
+ * domains overlap.
+ *
+ * While it is rare to encounter architectures with asymmetric NUMA
+ * topologies, CPU hotplugging or virtualized environments can result
+ * in asymmetric configurations.
+ *
+ * For example:
+ *
+ * NUMA 0:
+ * - LLC 0: cpu0..cpu7
+ * - LLC 1: cpu8..cpu15 [offline]
+ *
+ * NUMA 1:
+ * - LLC 0: cpu16..cpu23
+ * - LLC 1: cpu24..cpu31
+ *
+ * In this case, if we only check the first online CPU (cpu0), we might
+ * incorrectly assume that the LLC and NUMA domains are fully
+ * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
+ * domains).
+ */
+ for_each_online_cpu(cpu)
+ if (llc_weight(cpu) != numa_weight(cpu))
+ return true;
+
+ return false;
+}
+
+/*
+ * Initialize topology-aware scheduling.
+ *
+ * Detect if the system has multiple LLC or multiple NUMA domains and enable
+ * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
+ * selection policy.
+ *
+ * Assumption: the kernel's internal topology representation assumes that each
+ * CPU belongs to a single LLC domain, and that each LLC domain is entirely
+ * contained within a single NUMA node.
+ */
+void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
+{
+ bool enable_llc = false, enable_numa = false;
+ unsigned int nr_cpus;
+ s32 cpu = cpumask_first(cpu_online_mask);
+
+ /*
+ * Enable LLC domain optimization only when there are multiple LLC
+ * domains among the online CPUs. If all online CPUs are part of a
+ * single LLC domain, the idle CPU selection logic can choose any
+ * online CPU without bias.
+ *
+ * Note that it is sufficient to check the LLC domain of the first
+ * online CPU to determine whether a single LLC domain includes all
+ * CPUs.
+ */
+ rcu_read_lock();
+ nr_cpus = llc_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus())
+ enable_llc = true;
+ pr_debug("sched_ext: LLC=%*pb weight=%u\n",
+ cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
+ }
+
+ /*
+ * Enable NUMA optimization only when there are multiple NUMA domains
+ * among the online CPUs and the NUMA domains don't perfectly overlaps
+ * with the LLC domains.
+ *
+ * If all CPUs belong to the same NUMA node and the same LLC domain,
+ * enabling both NUMA and LLC optimizations is unnecessary, as checking
+ * for an idle CPU in the same domain twice is redundant.
+ *
+ * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled ignore the NUMA
+ * optimization, as we would naturally select idle CPUs within
+ * specific NUMA nodes querying the corresponding per-node cpumask.
+ */
+ if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
+ nr_cpus = numa_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
+ enable_numa = true;
+ pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
+ cpumask_pr_args(numa_span(cpu)), nr_cpus);
+ }
+ }
+ rcu_read_unlock();
+
+ pr_debug("sched_ext: LLC idle selection %s\n",
+ str_enabled_disabled(enable_llc));
+ pr_debug("sched_ext: NUMA idle selection %s\n",
+ str_enabled_disabled(enable_numa));
+
+ if (enable_llc)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+ if (enable_numa)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
+}
+
+/*
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize full-idle cores:
+ * - always prioritize CPUs from fully idle cores (both logical CPUs are
+ * idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ * - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ * branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ * - if the above conditions aren't met, pick a CPU that shares the same LLC
+ * to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node, if enabled:
+ * - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * 5. Pick any idle CPU usable by the task.
+ *
+ * Step 3 and 4 are performed only if the system has, respectively,
+ * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs.
+ *
+ * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always
+ * begin in @prev_cpu's node and proceed to other nodes in order of
+ * increasing distance.
+ *
+ * Return the picked CPU if idle, or a negative value otherwise.
+ *
+ * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
+ * we never call ops.select_cpu() for them, see select_task_rq().
+ */
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags)
+{
+ const struct cpumask *llc_cpus = NULL;
+ const struct cpumask *numa_cpus = NULL;
+ int node = scx_cpu_node_if_enabled(prev_cpu);
+ s32 cpu;
+
+ /*
+ * This is necessary to protect llc_cpus.
+ */
+ rcu_read_lock();
+
+ /*
+ * Determine the scheduling domain only if the task is allowed to run
+ * on all CPUs.
+ *
+ * This is done primarily for efficiency, as it avoids the overhead of
+ * updating a cpumask every time we need to select an idle CPU (which
+ * can be costly in large SMP systems), but it also aligns logically:
+ * if a task's scheduling domain is restricted by user-space (through
+ * CPU affinity), the task will simply use the flat scheduling domain
+ * defined by user-space.
+ */
+ if (p->nr_cpus_allowed >= num_possible_cpus()) {
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
+ numa_cpus = numa_span(prev_cpu);
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
+ llc_cpus = llc_span(prev_cpu);
+ }
+
+ /*
+ * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
+ */
+ if (wake_flags & SCX_WAKE_SYNC) {
+ int waker_node;
+
+ /*
+ * If the waker's CPU is cache affine and prev_cpu is idle,
+ * then avoid a migration.
+ */
+ cpu = smp_processor_id();
+ if (cpus_share_cache(cpu, prev_cpu) &&
+ scx_idle_test_and_clear_cpu(prev_cpu)) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+
+ /*
+ * If the waker's local DSQ is empty, and the system is under
+ * utilized, try to wake up @p to the local DSQ of the waker.
+ *
+ * Checking only for an empty local DSQ is insufficient as it
+ * could give the wakee an unfair advantage when the system is
+ * oversaturated.
+ *
+ * Checking only for the presence of idle CPUs is also
+ * insufficient as the local DSQ of the waker could have tasks
+ * piled up on it even if there is an idle core elsewhere on
+ * the system.
+ */
+ waker_node = cpu_to_node(cpu);
+ if (!(current->flags & PF_EXITING) &&
+ cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
+ (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
+ !cpumask_empty(idle_cpumask(waker_node)->cpu)) {
+ if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * If CPU has SMT, any wholly idle CPU is likely a better pick than
+ * partially idle @prev_cpu.
+ */
+ if (sched_smt_active()) {
+ /*
+ * Keep using @prev_cpu if it's part of a fully idle core.
+ */
+ if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
+ scx_idle_test_and_clear_cpu(prev_cpu)) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any fully idle core in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any fully idle core in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any full-idle core usable by the task.
+ *
+ * If the node-aware idle CPU selection policy is enabled
+ * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always
+ * begin in prev_cpu's node and proceed to other nodes in
+ * order of increasing distance.
+ */
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto out_unlock;
+
+ /*
+ * Give up if we're strictly looking for a full-idle SMT
+ * core.
+ */
+ if (flags & SCX_PICK_IDLE_CORE) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * Use @prev_cpu if it's idle.
+ */
+ if (scx_idle_test_and_clear_cpu(prev_cpu)) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any idle CPU in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = pick_idle_cpu_in_node(llc_cpus, node, 0);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any idle CPU in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = pick_idle_cpu_in_node(numa_cpus, node, 0);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any idle CPU usable by the task.
+ *
+ * If the node-aware idle CPU selection policy is enabled
+ * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin
+ * in prev_cpu's node and proceed to other nodes in order of
+ * increasing distance.
+ */
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
+ if (cpu >= 0)
+ goto out_unlock;
+
+out_unlock:
+ rcu_read_unlock();
+
+ return cpu;
+}
+
+/*
+ * Initialize global and per-node idle cpumasks.
+ */
+void scx_idle_init_masks(void)
+{
+ int node;
+
+ /* Allocate global idle cpumasks */
+ BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL));
+
+ /* Allocate per-node idle cpumasks */
+ scx_idle_node_masks = kcalloc(num_possible_nodes(),
+ sizeof(*scx_idle_node_masks), GFP_KERNEL);
+ BUG_ON(!scx_idle_node_masks);
+
+ for_each_node(node) {
+ scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks),
+ GFP_KERNEL, node);
+ BUG_ON(!scx_idle_node_masks[node]);
+
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node));
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node));
+ }
+}
+
+static void update_builtin_idle(int cpu, bool idle)
+{
+ int node = scx_cpu_node_if_enabled(cpu);
+ struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
+
+ assign_cpu(cpu, idle_cpus, idle);
+
+#ifdef CONFIG_SCHED_SMT
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+ struct cpumask *idle_smts = idle_cpumask(node)->smt;
+
+ if (idle) {
+ /*
+ * idle_smt handling is racy but that's fine as it's
+ * only for optimization and self-correcting.
+ */
+ if (!cpumask_subset(smt, idle_cpus))
+ return;
+ cpumask_or(idle_smts, idle_smts, smt);
+ } else {
+ cpumask_andnot(idle_smts, idle_smts, smt);
+ }
+ }
+#endif
+}
+
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Trigger ops.update_idle() only when transitioning from a task to
+ * the idle thread and vice versa.
+ *
+ * Idle transitions are indicated by do_notify being set to true,
+ * managed by put_prev_task_idle()/set_next_task_idle().
+ */
+ if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+
+ /*
+ * Update the idle masks:
+ * - for real idle transitions (do_notify == true)
+ * - for idle-to-idle transitions (indicated by the previous task
+ * being the idle thread, managed by pick_task_idle())
+ *
+ * Skip updating idle masks if the previous task is not the idle
+ * thread, since set_next_task_idle() has already handled it when
+ * transitioning from a task to the idle thread (calling this
+ * function with do_notify == true).
+ *
+ * In this way we can avoid updating the idle masks twice,
+ * unnecessarily.
+ */
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ if (do_notify || is_idle_task(rq->curr))
+ update_builtin_idle(cpu, idle);
+}
+
+static void reset_idle_masks(struct sched_ext_ops *ops)
+{
+ int node;
+
+ /*
+ * Consider all online cpus idle. Should converge to the actual state
+ * quickly.
+ */
+ if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
+ cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask);
+ cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask);
+ return;
+ }
+
+ for_each_node(node) {
+ const struct cpumask *node_mask = cpumask_of_node(node);
+
+ cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask);
+ cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask);
+ }
+}
+#endif /* CONFIG_SMP */
+
+void scx_idle_enable(struct sched_ext_ops *ops)
+{
+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))
+ static_branch_enable(&scx_builtin_idle_enabled);
+ else
+ static_branch_disable(&scx_builtin_idle_enabled);
+
+ if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
+ static_branch_enable(&scx_builtin_idle_per_node);
+ else
+ static_branch_disable(&scx_builtin_idle_per_node);
+
+#ifdef CONFIG_SMP
+ reset_idle_masks(ops);
+#endif
+}
+
+void scx_idle_disable(void)
+{
+ static_branch_disable(&scx_builtin_idle_enabled);
+ static_branch_disable(&scx_builtin_idle_per_node);
+}
+
+/********************************************************************************
+ * Helpers that can be called from the BPF scheduler.
+ */
+
+static int validate_node(int node)
+{
+ if (!static_branch_likely(&scx_builtin_idle_per_node)) {
+ scx_ops_error("per-node idle tracking is disabled");
+ return -EOPNOTSUPP;
+ }
+
+ /* Return no entry for NUMA_NO_NODE (not a critical scx error) */
+ if (node == NUMA_NO_NODE)
+ return -ENOENT;
+
+ /* Make sure node is in a valid range */
+ if (node < 0 || node >= nr_node_ids) {
+ scx_ops_error("invalid node %d", node);
+ return -EINVAL;
+ }
+
+ /* Make sure the node is part of the set of possible nodes */
+ if (!node_possible(node)) {
+ scx_ops_error("unavailable node %d", node);
+ return -EINVAL;
+ }
+
+ return node;
+}
+
+__bpf_kfunc_start_defs();
+
+static bool check_builtin_idle_enabled(void)
+{
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ return true;
+
+ scx_ops_error("built-in idle tracking is disabled");
+ return false;
+}
+
+/**
+ * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
+ * trigger an error if @cpu is invalid
+ * @cpu: target CPU
+ */
+__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
+{
+#ifdef CONFIG_NUMA
+ if (!ops_cpu_valid(cpu, NULL))
+ return NUMA_NO_NODE;
+
+ return cpu_to_node(cpu);
+#else
+ return 0;
+#endif
+}
+
+/**
+ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @is_idle: out parameter indicating whether the returned CPU is idle
+ *
+ * Can only be called from ops.select_cpu() if the built-in CPU selection is
+ * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
+ * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ *
+ * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
+ * currently idle and thus a good candidate for direct dispatching.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
+ u64 wake_flags, bool *is_idle)
+{
+#ifdef CONFIG_SMP
+ s32 cpu;
+#endif
+ if (!ops_cpu_valid(prev_cpu, NULL))
+ goto prev_cpu;
+
+ if (!check_builtin_idle_enabled())
+ goto prev_cpu;
+
+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
+ goto prev_cpu;
+
+#ifdef CONFIG_SMP
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ if (cpu >= 0) {
+ *is_idle = true;
+ return cpu;
+ }
+#endif
+
+prev_cpu:
+ *is_idle = false;
+ return prev_cpu;
+}
+
+/**
+ * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
+ * idle-tracking per-CPU cpumask of a target NUMA node.
+ * @node: target NUMA node
+ *
+ * Returns an empty cpumask if idle tracking is not enabled, if @node is
+ * not valid, or running on a UP kernel. In this case the actual error will
+ * be reported to the BPF scheduler via scx_ops_error().
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
+{
+ node = validate_node(node);
+ if (node < 0)
+ return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+ return idle_cpumask(node)->cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
+ * per-CPU cpumask.
+ *
+ * Returns an empty mask if idle tracking is not enabled, or running on a
+ * UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
+{
+ if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
+ scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ return cpu_none_mask;
+ }
+
+ if (!check_builtin_idle_enabled())
+ return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+ return idle_cpumask(NUMA_NO_NODE)->cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the
+ * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be
+ * used to determine if an entire physical core is free.
+ * @node: target NUMA node
+ *
+ * Returns an empty cpumask if idle tracking is not enabled, if @node is
+ * not valid, or running on a UP kernel. In this case the actual error will
+ * be reported to the BPF scheduler via scx_ops_error().
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
+{
+ node = validate_node(node);
+ if (node < 0)
+ return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+ if (sched_smt_active())
+ return idle_cpumask(node)->smt;
+ else
+ return idle_cpumask(node)->cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
+ * per-physical-core cpumask. Can be used to determine if an entire physical
+ * core is free.
+ *
+ * Returns an empty mask if idle tracking is not enabled, or running on a
+ * UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
+{
+ if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
+ scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ return cpu_none_mask;
+ }
+
+ if (!check_builtin_idle_enabled())
+ return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+ if (sched_smt_active())
+ return idle_cpumask(NUMA_NO_NODE)->smt;
+ else
+ return idle_cpumask(NUMA_NO_NODE)->cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
+ * either the percpu, or SMT idle-tracking cpumask.
+ * @idle_mask: &cpumask to use
+ */
+__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
+{
+ /*
+ * Empty function body because we aren't actually acquiring or releasing
+ * a reference to a global idle cpumask, which is read-only in the
+ * caller and is never released. The acquire / release semantics here
+ * are just used to make the cpumask a trusted pointer in the caller.
+ */
+}
+
+/**
+ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
+ * @cpu: cpu to test and clear idle for
+ *
+ * Returns %true if @cpu was idle and its idle state was successfully cleared.
+ * %false otherwise.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ */
+__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
+{
+ if (!check_builtin_idle_enabled())
+ return false;
+
+ if (ops_cpu_valid(cpu, NULL))
+ return scx_idle_test_and_clear_cpu(cpu);
+ else
+ return false;
+}
+
+/**
+ * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node
+ * @cpus_allowed: Allowed cpumask
+ * @node: target NUMA node
+ * @flags: %SCX_PICK_IDLE_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
+ *
+ * Returns the picked idle cpu number on success, or -%EBUSY if no matching
+ * cpu was found.
+ *
+ * The search starts from @node and proceeds to other online NUMA nodes in
+ * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified,
+ * in which case the search is limited to the target @node).
+ *
+ * Always returns an error if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if
+ * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set.
+ */
+__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
+ int node, u64 flags)
+{
+ node = validate_node(node);
+ if (node < 0)
+ return node;
+
+ return scx_pick_idle_cpu(cpus_allowed, node, flags);
+}
+
+/**
+ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
+ * number on success. -%EBUSY if no matching cpu was found.
+ *
+ * Idle CPU tracking may race against CPU scheduling state transitions. For
+ * example, this function may return -%EBUSY as CPUs are transitioning into the
+ * idle state. If the caller then assumes that there will be dispatch events on
+ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
+ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
+ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
+ * event in the near future.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ *
+ * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use
+ * scx_bpf_pick_idle_cpu_node() instead.
+ */
+__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
+ u64 flags)
+{
+ if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
+ scx_ops_error("per-node idle tracking is enabled");
+ return -EBUSY;
+ }
+
+ if (!check_builtin_idle_enabled())
+ return -EBUSY;
+
+ return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
+}
+
+/**
+ * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available
+ * or pick any CPU from @node
+ * @cpus_allowed: Allowed cpumask
+ * @node: target NUMA node
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
+ * empty.
+ *
+ * The search starts from @node and proceeds to other online NUMA nodes in
+ * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified,
+ * in which case the search is limited to the target @node, regardless of
+ * the CPU idle state).
+ *
+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
+ * set, this function can't tell which CPUs are idle and will always pick any
+ * CPU.
+ */
+__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
+ int node, u64 flags)
+{
+ s32 cpu;
+
+ node = validate_node(node);
+ if (node < 0)
+ return node;
+
+ cpu = scx_pick_idle_cpu(cpus_allowed, node, flags);
+ if (cpu >= 0)
+ return cpu;
+
+ if (flags & SCX_PICK_IDLE_IN_NODE)
+ cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed);
+ else
+ cpu = cpumask_any_distribute(cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ else
+ return -EBUSY;
+}
+
+/**
+ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
+ * empty.
+ *
+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
+ * set, this function can't tell which CPUs are idle and will always pick any
+ * CPU.
+ *
+ * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use
+ * scx_bpf_pick_any_cpu_node() instead.
+ */
+__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
+ u64 flags)
+{
+ s32 cpu;
+
+ if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
+ scx_ops_error("per-node idle tracking is enabled");
+ return -EBUSY;
+ }
+
+ if (static_branch_likely(&scx_builtin_idle_enabled)) {
+ cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
+ if (cpu >= 0)
+ return cpu;
+ }
+
+ cpu = cpumask_any_distribute(cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ else
+ return -EBUSY;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_idle)
+BTF_ID_FLAGS(func, scx_bpf_cpu_node)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_idle)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_idle,
+};
+
+BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_select_cpu,
+};
+
+int scx_idle_init(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle);
+
+ return ret;
+}
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
new file mode 100644
index 000000000000..511cc2221f7a
--- /dev/null
+++ b/kernel/sched/ext_idle.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
+ */
+#ifndef _KERNEL_SCHED_EXT_IDLE_H
+#define _KERNEL_SCHED_EXT_IDLE_H
+
+struct sched_ext_ops;
+
+#ifdef CONFIG_SMP
+void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
+void scx_idle_init_masks(void);
+bool scx_idle_test_and_clear_cpu(int cpu);
+s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags);
+#else /* !CONFIG_SMP */
+static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {}
+static inline void scx_idle_init_masks(void) {}
+static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; }
+static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ return -EBUSY;
+}
+#endif /* CONFIG_SMP */
+
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags);
+void scx_idle_enable(struct sched_ext_ops *ops);
+void scx_idle_disable(void);
+int scx_idle_init(void);
+
+#endif /* _KERNEL_SCHED_EXT_IDLE_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c798d2795243..e43993a4e580 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -74,12 +74,12 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
*
- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_base_slice = 750000ULL;
-static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
+unsigned int sysctl_sched_base_slice = 700000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 700000ULL;
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL;
static int __init setup_sched_thermal_decay_shift(char *str)
{
@@ -399,7 +399,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
- SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
+ WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
/* Iterate through all leaf cfs_rq's on a runqueue */
@@ -696,7 +696,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 vlag, limit;
- SCHED_WARN_ON(!se->on_rq);
+ WARN_ON_ONCE(!se->on_rq);
vlag = avg_vruntime(cfs_rq) - se->vruntime;
limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
@@ -884,6 +884,26 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
}
/*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+static inline void set_protect_slice(struct sched_entity *se)
+{
+ se->vlag = se->deadline;
+}
+
+static inline bool protect_slice(struct sched_entity *se)
+{
+ return se->vlag == se->deadline;
+}
+
+static inline void cancel_protect_slice(struct sched_entity *se)
+{
+ if (protect_slice(se))
+ se->vlag = se->deadline + 1;
+}
+
+/*
* Earliest Eligible Virtual Deadline First
*
* In order to provide latency guarantees for different request sizes
@@ -919,11 +939,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
- /*
- * Once selected, run a task until it either becomes non-eligible or
- * until it gets a new slice. See the HACK in set_next_entity().
- */
- if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr))
return curr;
/* Pick the leftmost entity if it's eligible */
@@ -967,7 +983,6 @@ found:
return best;
}
-#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -994,7 +1009,6 @@ int sched_update_scaling(void)
return 0;
}
#endif
-#endif
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
@@ -3301,7 +3315,7 @@ static void task_numa_work(struct callback_head *work)
bool vma_pids_skipped;
bool vma_pids_forced = false;
- SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
work->next = work;
/*
@@ -4020,7 +4034,7 @@ static inline bool load_avg_is_decayed(struct sched_avg *sa)
* Make sure that rounding and/or propagation of PELT values never
* break this.
*/
- SCHED_WARN_ON(sa->load_avg ||
+ WARN_ON_ONCE(sa->load_avg ||
sa->util_avg ||
sa->runnable_avg);
@@ -5444,7 +5458,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
clear_buddies(cfs_rq, se);
if (flags & DEQUEUE_DELAYED) {
- SCHED_WARN_ON(!se->sched_delayed);
+ WARN_ON_ONCE(!se->sched_delayed);
} else {
bool delay = sleep;
/*
@@ -5454,7 +5468,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_SPECIAL)
delay = false;
- SCHED_WARN_ON(delay && se->sched_delayed);
+ WARN_ON_ONCE(delay && se->sched_delayed);
if (sched_feat(DELAY_DEQUEUE) && delay &&
!entity_eligible(cfs_rq, se)) {
@@ -5530,15 +5544,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
- /*
- * HACK, stash a copy of deadline at the point of pick in vlag,
- * which isn't used until dequeue.
- */
- se->vlag = se->deadline;
+
+ set_protect_slice(se);
}
update_stats_curr_start(cfs_rq, se);
- SCHED_WARN_ON(cfs_rq->curr);
+ WARN_ON_ONCE(cfs_rq->curr);
cfs_rq->curr = se;
/*
@@ -5579,7 +5590,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
if (sched_feat(PICK_BUDDY) &&
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
/* ->next will never be delayed */
- SCHED_WARN_ON(cfs_rq->next->sched_delayed);
+ WARN_ON_ONCE(cfs_rq->next->sched_delayed);
return cfs_rq->next;
}
@@ -5615,7 +5626,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
- SCHED_WARN_ON(cfs_rq->curr != prev);
+ WARN_ON_ONCE(cfs_rq->curr != prev);
cfs_rq->curr = NULL;
}
@@ -5838,7 +5849,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttled_clock_self = 0;
- if (SCHED_WARN_ON((s64)delta < 0))
+ if (WARN_ON_ONCE((s64)delta < 0))
delta = 0;
cfs_rq->throttled_clock_self_time += delta;
@@ -5858,7 +5869,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
- SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock_self = rq_clock(rq);
}
@@ -5967,7 +5978,7 @@ done:
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
- SCHED_WARN_ON(cfs_rq->throttled_clock);
+ WARN_ON_ONCE(cfs_rq->throttled_clock);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq);
return true;
@@ -6123,7 +6134,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
}
/* Already enqueued */
- if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+ if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list)))
return;
first = list_empty(&rq->cfsb_csd_list);
@@ -6142,7 +6153,7 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
lockdep_assert_rq_held(rq_of(cfs_rq));
- if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+ if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) ||
cfs_rq->runtime_remaining <= 0))
return;
@@ -6178,7 +6189,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
goto next;
/* By the above checks, this should never be true */
- SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+ WARN_ON_ONCE(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock);
runtime = -cfs_rq->runtime_remaining + 1;
@@ -6199,7 +6210,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
* We currently only expect to be unthrottling
* a single cfs_rq locally.
*/
- SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ WARN_ON_ONCE(!list_empty(&local_unthrottle));
list_add_tail(&cfs_rq->throttled_csd_list,
&local_unthrottle);
}
@@ -6224,7 +6235,7 @@ next:
rq_unlock_irqrestore(rq, &rf);
}
- SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ WARN_ON_ONCE(!list_empty(&local_unthrottle));
rcu_read_unlock();
@@ -6541,14 +6552,14 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *paren
cfs_b->hierarchical_quota = parent ? parent->hierarchical_quota : RUNTIME_INF;
INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
- hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
- cfs_b->period_timer.function = sched_cfs_period_timer;
+ hrtimer_setup(&cfs_b->period_timer, sched_cfs_period_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED);
/* Add a random offset so that timers interleave */
hrtimer_set_expires(&cfs_b->period_timer,
get_random_u32_below(cfs_b->period));
- hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- cfs_b->slack_timer.function = sched_cfs_slack_timer;
+ hrtimer_setup(&cfs_b->slack_timer, sched_cfs_slack_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL);
cfs_b->slack_started = false;
}
@@ -6776,7 +6787,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- SCHED_WARN_ON(task_rq(p) != rq);
+ WARN_ON_ONCE(task_rq(p) != rq);
if (rq->cfs.h_nr_queued > 1) {
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
@@ -6887,8 +6898,8 @@ requeue_delayed_entity(struct sched_entity *se)
* Because a delayed entity is one that is still on
* the runqueue competing until elegibility.
*/
- SCHED_WARN_ON(!se->sched_delayed);
- SCHED_WARN_ON(!se->on_rq);
+ WARN_ON_ONCE(!se->sched_delayed);
+ WARN_ON_ONCE(!se->on_rq);
if (sched_feat(DELAY_ZERO)) {
update_entity_lag(cfs_rq, se);
@@ -6991,6 +7002,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_group(se);
se->slice = slice;
+ if (se != cfs_rq->curr)
+ min_vruntime_cb_propagate(&se->run_node, NULL);
slice = cfs_rq_min_slice(cfs_rq);
cfs_rq->h_nr_runnable += h_nr_runnable;
@@ -7120,6 +7133,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
update_cfs_group(se);
se->slice = slice;
+ if (se != cfs_rq->curr)
+ min_vruntime_cb_propagate(&se->run_node, NULL);
slice = cfs_rq_min_slice(cfs_rq);
cfs_rq->h_nr_runnable -= h_nr_runnable;
@@ -7144,8 +7159,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
rq->next_balance = jiffies;
if (p && task_delayed) {
- SCHED_WARN_ON(!task_sleep);
- SCHED_WARN_ON(p->on_rq != 1);
+ WARN_ON_ONCE(!task_sleep);
+ WARN_ON_ONCE(p->on_rq != 1);
/* Fix-up what dequeue_task_fair() skipped */
hrtick_update(rq);
@@ -8723,7 +8738,7 @@ static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
static void set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
+ if (WARN_ON_ONCE(!se->on_rq))
return;
if (se_is_idle(se))
return;
@@ -8783,8 +8798,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* Preempt an idle entity in favor of a non-idle entity (and don't preempt
* in the inverse case).
*/
- if (cse_is_idle && !pse_is_idle)
+ if (cse_is_idle && !pse_is_idle) {
+ /*
+ * When non-idle entity preempt an idle entity,
+ * don't give idle entity slice protection.
+ */
+ cancel_protect_slice(se);
goto preempt;
+ }
+
if (cse_is_idle != pse_is_idle)
return;
@@ -8803,8 +8825,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* Note that even if @p does not turn out to be the most eligible
* task at this moment, current's slice protection will be lost.
*/
- if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
- se->vlag = se->deadline + 1;
+ if (do_preempt_short(cfs_rq, pse, se))
+ cancel_protect_slice(se);
/*
* If @p has become the most eligible task, force preemption.
@@ -9417,12 +9439,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 0;
/* Prevent to re-select dst_cpu via env's CPUs: */
- for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
- env->flags |= LBF_DST_PINNED;
- env->new_dst_cpu = cpu;
- break;
- }
+ cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr);
+
+ if (cpu < nr_cpu_ids) {
+ env->flags |= LBF_DST_PINNED;
+ env->new_dst_cpu = cpu;
}
return 0;
@@ -12461,7 +12482,7 @@ unlock:
void nohz_balance_exit_idle(struct rq *rq)
{
- SCHED_WARN_ON(rq != this_rq());
+ WARN_ON_ONCE(rq != this_rq());
if (likely(!rq->nohz_tick_stopped))
return;
@@ -12497,7 +12518,7 @@ void nohz_balance_enter_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- SCHED_WARN_ON(cpu != smp_processor_id());
+ WARN_ON_ONCE(cpu != smp_processor_id());
/* If this CPU is going down, then nothing needs to be done: */
if (!cpu_active(cpu))
@@ -12580,7 +12601,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
int balance_cpu;
struct rq *rq;
- SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
+ WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
/*
* We assume there will be no idle load after this update and clear
@@ -13020,7 +13041,7 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
struct cfs_rq *cfs_rqb;
s64 delta;
- SCHED_WARN_ON(task_rq(b)->core != rq->core);
+ WARN_ON_ONCE(task_rq(b)->core != rq->core);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -13223,7 +13244,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
- SCHED_WARN_ON(p->se.sched_delayed);
+ WARN_ON_ONCE(p->se.sched_delayed);
attach_task_cfs_rq(p);
@@ -13258,7 +13279,7 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
if (!first)
return;
- SCHED_WARN_ON(se->sched_delayed);
+ WARN_ON_ONCE(se->sched_delayed);
if (hrtick_enabled_fair(rq))
hrtick_start_fair(rq, p);
@@ -13645,7 +13666,6 @@ DEFINE_SCHED_CLASS(fair) = {
#endif
};
-#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
struct cfs_rq *cfs_rq, *pos;
@@ -13679,7 +13699,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
rcu_read_unlock();
}
#endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
__init void init_sched_fair_class(void)
{
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4b8e33c615b1..fa03ec3ed56a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -127,9 +127,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
raw_spin_lock_init(&rt_b->rt_runtime_lock);
- hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL_HARD);
- rt_b->rt_period_timer.function = sched_rt_period_timer;
+ hrtimer_setup(&rt_b->rt_period_timer, sched_rt_period_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_HARD);
}
static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
@@ -169,9 +168,8 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
-#ifdef CONFIG_SCHED_DEBUG
WARN_ON_ONCE(!rt_entity_is_task(rt_se));
-#endif
+
return container_of(rt_se, struct task_struct, rt);
}
@@ -1713,7 +1711,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
BUG_ON(idx >= MAX_RT_PRIO);
queue = array->queue + idx;
- if (SCHED_WARN_ON(list_empty(queue)))
+ if (WARN_ON_ONCE(list_empty(queue)))
return NULL;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
@@ -2910,6 +2908,7 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff
int ret;
mutex_lock(&mutex);
+ sched_domains_mutex_lock();
old_period = sysctl_sched_rt_period;
old_runtime = sysctl_sched_rt_runtime;
@@ -2936,6 +2935,7 @@ undo:
sysctl_sched_rt_period = old_period;
sysctl_sched_rt_runtime = old_runtime;
}
+ sched_domains_mutex_unlock();
mutex_unlock(&mutex);
return ret;
@@ -2967,7 +2967,6 @@ static int sched_rr_handler(const struct ctl_table *table, int write, void *buff
}
#endif /* CONFIG_SYSCTL */
-#ifdef CONFIG_SCHED_DEBUG
void print_rt_stats(struct seq_file *m, int cpu)
{
rt_rq_iter_t iter;
@@ -2978,4 +2977,3 @@ void print_rt_stats(struct seq_file *m, int cpu)
print_rt_rq(m, cpu, rt_rq);
rcu_read_unlock();
}
-#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 023b844159c9..47972f34ea70 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -91,12 +91,6 @@ struct cpuidle_state;
#include "cpupri.h"
#include "cpudeadline.h"
-#ifdef CONFIG_SCHED_DEBUG
-# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
-#else
-# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
-#endif
-
/* task_struct::on_rq states: */
#define TASK_ON_RQ_QUEUED 1
#define TASK_ON_RQ_MIGRATING 2
@@ -998,7 +992,7 @@ struct root_domain {
* Also, some corner cases, like 'wrap around' is dangerous, but given
* that u64 is 'big enough'. So that shouldn't be a concern.
*/
- u64 visit_gen;
+ u64 visit_cookie;
#ifdef HAVE_RT_PUSH_IPI
/*
@@ -1180,10 +1174,8 @@ struct rq {
atomic_t nr_iowait;
-#ifdef CONFIG_SCHED_DEBUG
u64 last_seen_need_resched_ns;
int ticks_without_resched;
-#endif
#ifdef CONFIG_MEMBARRIER
int membarrier_state;
@@ -1571,7 +1563,7 @@ static inline void update_idle_core(struct rq *rq) { }
static inline struct task_struct *task_of(struct sched_entity *se)
{
- SCHED_WARN_ON(!entity_is_task(se));
+ WARN_ON_ONCE(!entity_is_task(se));
return container_of(se, struct task_struct, se);
}
@@ -1652,7 +1644,7 @@ static inline void assert_clock_updated(struct rq *rq)
* The only reason for not seeing a clock update since the
* last rq_pin_lock() is if we're currently skipping updates.
*/
- SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
+ WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP);
}
static inline u64 rq_clock(struct rq *rq)
@@ -1699,7 +1691,7 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq)
static inline void rq_clock_start_loop_update(struct rq *rq)
{
lockdep_assert_rq_held(rq);
- SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
+ WARN_ON_ONCE(rq->clock_update_flags & RQCF_ACT_SKIP);
rq->clock_update_flags |= RQCF_ACT_SKIP;
}
@@ -1712,14 +1704,12 @@ static inline void rq_clock_stop_loop_update(struct rq *rq)
struct rq_flags {
unsigned long flags;
struct pin_cookie cookie;
-#ifdef CONFIG_SCHED_DEBUG
/*
* A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
* current pin context is stashed here in case it needs to be
* restored in rq_repin_lock().
*/
unsigned int clock_update_flags;
-#endif
};
extern struct balance_callback balance_push_callback;
@@ -1770,21 +1760,18 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
rf->cookie = lockdep_pin_lock(__rq_lockp(rq));
-#ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
-# ifdef CONFIG_SMP
- SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback);
-# endif
+#ifdef CONFIG_SMP
+ WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
#endif
}
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
{
-#ifdef CONFIG_SCHED_DEBUG
if (rq->clock_update_flags > RQCF_ACT_SKIP)
rf->clock_update_flags = RQCF_UPDATED;
-#endif
+
scx_rq_clock_invalidate(rq);
lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
}
@@ -1793,12 +1780,10 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
lockdep_repin_lock(__rq_lockp(rq), rf->cookie);
-#ifdef CONFIG_SCHED_DEBUG
/*
* Restore the value we stashed in @rf for this pin context.
*/
rq->clock_update_flags |= rf->clock_update_flags;
-#endif
}
extern
@@ -2072,9 +2057,7 @@ struct sched_group_capacity {
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
-#ifdef CONFIG_SCHED_DEBUG
int id;
-#endif
unsigned long cpumask[]; /* Balance mask */
};
@@ -2114,13 +2097,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
extern int group_balance_cpu(struct sched_group *sg);
-#ifdef CONFIG_SCHED_DEBUG
extern void update_sched_domain_debugfs(void);
extern void dirty_sched_domain_sysctl(int cpu);
-#else
-static inline void update_sched_domain_debugfs(void) { }
-static inline void dirty_sched_domain_sysctl(int cpu) { }
-#endif
extern int sched_update_scaling(void);
@@ -2200,13 +2178,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
}
/*
- * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ * Tunables:
*/
-#ifdef CONFIG_SCHED_DEBUG
-# define const_debug __read_mostly
-#else
-# define const_debug const
-#endif
#define SCHED_FEAT(name, enabled) \
__SCHED_FEAT_##name ,
@@ -2218,13 +2191,11 @@ enum {
#undef SCHED_FEAT
-#ifdef CONFIG_SCHED_DEBUG
-
/*
* To support run-time toggling of sched features, all the translation units
* (but core.c) reference the sysctl_sched_features defined in core.c.
*/
-extern const_debug unsigned int sysctl_sched_features;
+extern __read_mostly unsigned int sysctl_sched_features;
#ifdef CONFIG_JUMP_LABEL
@@ -2246,24 +2217,6 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#endif /* !CONFIG_JUMP_LABEL */
-#else /* !SCHED_DEBUG: */
-
-/*
- * Each translation unit has its own copy of sysctl_sched_features to allow
- * constants propagation at compile time and compiler optimization based on
- * features default.
- */
-#define SCHED_FEAT(name, enabled) \
- (1UL << __SCHED_FEAT_##name) * enabled |
-static const_debug __maybe_unused unsigned int sysctl_sched_features =
-#include "features.h"
- 0;
-#undef SCHED_FEAT
-
-#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-
-#endif /* !SCHED_DEBUG */
-
extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;
@@ -2685,7 +2638,7 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
- SCHED_WARN_ON(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held());
return rq->idle_state;
}
@@ -2843,12 +2796,11 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
# define SCHED_NR_MIGRATE_BREAK 32
#endif
-extern const_debug unsigned int sysctl_sched_nr_migrate;
-extern const_debug unsigned int sysctl_sched_migration_cost;
+extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_base_slice;
-#ifdef CONFIG_SCHED_DEBUG
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;
@@ -2859,7 +2811,6 @@ extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_hot_threshold;
-#endif
#ifdef CONFIG_SCHED_HRTICK
@@ -2932,7 +2883,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
}
#endif
-#ifdef CONFIG_SCHED_DEBUG
/*
* In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
* acquire rq lock instead of rq_lock(). So at the end of these two functions
@@ -2947,9 +2897,6 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
#endif
}
-#else
-static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { }
-#endif
#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
@@ -3162,7 +3109,6 @@ extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
-#ifdef CONFIG_SCHED_DEBUG
extern bool sched_debug_verbose;
extern void print_cfs_stats(struct seq_file *m, int cpu);
@@ -3173,15 +3119,12 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
extern void resched_latency_warn(int cpu, u64 latency);
-# ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_NUMA_BALANCING
extern void show_numa_stats(struct task_struct *p, struct seq_file *m);
extern void
print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
unsigned long tpf, unsigned long gsf, unsigned long gpf);
-# endif /* CONFIG_NUMA_BALANCING */
-#else /* !CONFIG_SCHED_DEBUG: */
-static inline void resched_latency_warn(int cpu, u64 latency) { }
-#endif /* !CONFIG_SCHED_DEBUG */
+#endif /* CONFIG_NUMA_BALANCING */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq);
@@ -3394,6 +3337,31 @@ static inline bool update_other_load_avgs(struct rq *rq) { return false; }
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
+ *
+ * Returns true if userspace opted-in to use uclamp and aggregation at rq level
+ * hence is active.
+ */
+static inline bool uclamp_is_used(void)
+{
+ return static_branch_likely(&sched_uclamp_used);
+}
+
+/*
+ * Enabling static branches would get the cpus_read_lock(),
+ * check whether uclamp_is_used before enable it to avoid always
+ * calling cpus_read_lock(). Because we never disable this
+ * static key once enable it.
+ */
+static inline void sched_uclamp_enable(void)
+{
+ if (!uclamp_is_used())
+ static_branch_enable(&sched_uclamp_used);
+}
+
static inline unsigned long uclamp_rq_get(struct rq *rq,
enum uclamp_id clamp_id)
{
@@ -3417,7 +3385,7 @@ static inline bool uclamp_rq_is_capped(struct rq *rq)
unsigned long rq_util;
unsigned long max_util;
- if (!static_branch_likely(&sched_uclamp_used))
+ if (!uclamp_is_used())
return false;
rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
@@ -3426,19 +3394,6 @@ static inline bool uclamp_rq_is_capped(struct rq *rq)
return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util;
}
-/*
- * When uclamp is compiled in, the aggregation at rq level is 'turned off'
- * by default in the fast path and only gets turned on once userspace performs
- * an operation that requires it.
- *
- * Returns true if userspace opted-in to use uclamp and aggregation at rq level
- * hence is active.
- */
-static inline bool uclamp_is_used(void)
-{
- return static_branch_likely(&sched_uclamp_used);
-}
-
#define for_each_clamp_id(clamp_id) \
for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
@@ -3486,6 +3441,8 @@ static inline bool uclamp_is_used(void)
return false;
}
+static inline void sched_uclamp_enable(void) {}
+
static inline unsigned long
uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id)
{
@@ -3619,6 +3576,7 @@ extern int preempt_dynamic_mode;
extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif
+extern const char *preempt_modes[];
#ifdef CONFIG_SCHED_MM_CID
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 19cdbe96f93d..452826df6ae1 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -144,7 +144,7 @@ static inline void psi_enqueue(struct task_struct *p, int flags)
if (p->se.sched_delayed) {
/* CPU migration of "sleeping" task */
- SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
+ WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED));
if (p->in_memstall)
set |= TSK_MEMSTALL;
if (p->in_iowait)
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 456d339be98f..c326de1344fb 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -368,7 +368,7 @@ static int uclamp_validate(struct task_struct *p,
* blocking operation which obviously cannot be done while holding
* scheduler locks.
*/
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
return 0;
}
@@ -875,7 +875,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
struct sched_param lparam;
- if (!param || pid < 0)
+ if (unlikely(!param || pid < 0))
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
@@ -984,7 +984,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
struct sched_attr attr;
int retval;
- if (!uattr || pid < 0 || flags)
+ if (unlikely(!uattr || pid < 0 || flags))
return -EINVAL;
retval = sched_copy_attr(uattr, &attr);
@@ -1049,7 +1049,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
struct task_struct *p;
int retval;
- if (!param || pid < 0)
+ if (unlikely(!param || pid < 0))
return -EINVAL;
scoped_guard (rcu) {
@@ -1085,8 +1085,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
struct task_struct *p;
int retval;
- if (!uattr || pid < 0 || usize > PAGE_SIZE ||
- usize < SCHED_ATTR_SIZE_VER0 || flags)
+ if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags))
return -EINVAL;
scoped_guard (rcu) {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index c49aea8c1025..f1ebc60d967f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -6,13 +6,19 @@
#include <linux/bsearch.h>
DEFINE_MUTEX(sched_domains_mutex);
+void sched_domains_mutex_lock(void)
+{
+ mutex_lock(&sched_domains_mutex);
+}
+void sched_domains_mutex_unlock(void)
+{
+ mutex_unlock(&sched_domains_mutex);
+}
/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
-#ifdef CONFIG_SCHED_DEBUG
-
static int __init sched_debug_setup(char *str)
{
sched_debug_verbose = true;
@@ -151,15 +157,6 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
break;
}
}
-#else /* !CONFIG_SCHED_DEBUG */
-
-# define sched_debug_verbose 0
-# define sched_domain_debug(sd, cpu) do { } while (0)
-static inline bool sched_debug(void)
-{
- return false;
-}
-#endif /* CONFIG_SCHED_DEBUG */
/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
@@ -560,7 +557,7 @@ static int init_rootdomain(struct root_domain *rd)
rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
#endif
- rd->visit_gen = 0;
+ rd->visit_cookie = 0;
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;
@@ -2275,9 +2272,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sgc)
return -ENOMEM;
-#ifdef CONFIG_SCHED_DEBUG
sgc->id = j;
-#endif
*per_cpu_ptr(sdd->sgc, j) = sgc;
}
@@ -2680,7 +2675,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
*
* Call with hotplug lock and sched_domains_mutex held
*/
-void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new)
{
bool __maybe_unused has_eas = false;
@@ -2712,21 +2707,8 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
for (i = 0; i < ndoms_cur; i++) {
for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_cur[i], doms_new[j]) &&
- dattrs_equal(dattr_cur, i, dattr_new, j)) {
- struct root_domain *rd;
-
- /*
- * This domain won't be destroyed and as such
- * its dl_bw->total_bw needs to be cleared.
- * Tasks contribution will be then recomputed
- * in function dl_update_tasks_root_domain(),
- * dl_servers contribution in function
- * dl_restore_server_root_domain().
- */
- rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
- dl_clear_root_domain(rd);
+ dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
- }
}
/* No match - a current sched domain not in new doms_new[] */
detach_destroy_domains(doms_cur[i]);
@@ -2783,6 +2765,7 @@ match3:
ndoms_cur = ndoms_new;
update_sched_domain_debugfs();
+ dl_rebuild_rd_accounting();
}
/*
@@ -2791,7 +2774,7 @@ match3:
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new)
{
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
}
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 7bbb408431eb..41aa761c7738 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -29,13 +29,11 @@
#include <linux/syscalls.h>
#include <linux/sysctl.h>
+#include <asm/syscall.h>
+
/* Not exposed in headers: strictly internal use only. */
#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1)
-#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
-#include <asm/syscall.h>
-#endif
-
#ifdef CONFIG_SECCOMP_FILTER
#include <linux/file.h>
#include <linux/filter.h>
@@ -576,6 +574,9 @@ void seccomp_filter_release(struct task_struct *tsk)
if (WARN_ON((tsk->flags & PF_EXITING) == 0))
return;
+ if (READ_ONCE(tsk->seccomp.filter) == NULL)
+ return;
+
spin_lock_irq(&tsk->sighand->siglock);
orig = tsk->seccomp.filter;
/* Detach task from its filter tree. */
@@ -601,6 +602,13 @@ static inline void seccomp_sync_threads(unsigned long flags)
BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
assert_spin_locked(&current->sighand->siglock);
+ /*
+ * Don't touch any of the threads if the process is being killed.
+ * This allows for a lockless check in seccomp_filter_release.
+ */
+ if (current->signal->flags & SIGNAL_GROUP_EXIT)
+ return;
+
/* Synchronize all threads. */
caller = current;
for_each_thread(caller, thread) {
@@ -1074,6 +1082,13 @@ void secure_computing_strict(int this_syscall)
else
BUG();
}
+int __secure_computing(void)
+{
+ int this_syscall = syscall_get_nr(current, current_pt_regs());
+
+ secure_computing_strict(this_syscall);
+ return 0;
+}
#else
#ifdef CONFIG_SECCOMP_FILTER
@@ -1225,13 +1240,12 @@ out:
return -1;
}
-static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
- const bool recheck_after_trace)
+static int __seccomp_filter(int this_syscall, const bool recheck_after_trace)
{
u32 filter_ret, action;
+ struct seccomp_data sd;
struct seccomp_filter *match = NULL;
int data;
- struct seccomp_data sd_local;
/*
* Make sure that any changes to mode from another thread have
@@ -1239,12 +1253,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
*/
smp_rmb();
- if (!sd) {
- populate_seccomp_data(&sd_local);
- sd = &sd_local;
- }
+ populate_seccomp_data(&sd);
- filter_ret = seccomp_run_filters(sd, &match);
+ filter_ret = seccomp_run_filters(&sd, &match);
data = filter_ret & SECCOMP_RET_DATA;
action = filter_ret & SECCOMP_RET_ACTION_FULL;
@@ -1302,13 +1313,13 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
* a reload of all registers. This does not goto skip since
* a skip would have already been reported.
*/
- if (__seccomp_filter(this_syscall, NULL, true))
+ if (__seccomp_filter(this_syscall, true))
return -1;
return 0;
case SECCOMP_RET_USER_NOTIF:
- if (seccomp_do_user_notification(this_syscall, match, sd))
+ if (seccomp_do_user_notification(this_syscall, match, &sd))
goto skip;
return 0;
@@ -1350,8 +1361,7 @@ skip:
return -1;
}
#else
-static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
- const bool recheck_after_trace)
+static int __seccomp_filter(int this_syscall, const bool recheck_after_trace)
{
BUG();
@@ -1359,7 +1369,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
}
#endif
-int __secure_computing(const struct seccomp_data *sd)
+int __secure_computing(void)
{
int mode = current->seccomp.mode;
int this_syscall;
@@ -1368,15 +1378,14 @@ int __secure_computing(const struct seccomp_data *sd)
unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
return 0;
- this_syscall = sd ? sd->nr :
- syscall_get_nr(current, current_pt_regs());
+ this_syscall = syscall_get_nr(current, current_pt_regs());
switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
return 0;
case SECCOMP_MODE_FILTER:
- return __seccomp_filter(this_syscall, sd, false);
+ return __seccomp_filter(this_syscall, false);
/* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */
case SECCOMP_MODE_DEAD:
WARN_ON_ONCE(1);
diff --git a/kernel/signal.c b/kernel/signal.c
index 875e97f6205a..86ba66d95da5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2092,7 +2092,7 @@ static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueu
* from a non-periodic timer, then just drop the reference
* count. Otherwise queue it on the ignored list.
*/
- if (tmr->it_signal && tmr->it_sig_periodic)
+ if (posixtimer_valid(tmr) && tmr->it_sig_periodic)
hlist_add_head(&tmr->ignored_list, &tsk->signal->ignored_posix_timers);
else
posixtimer_putref(tmr);
@@ -2180,8 +2180,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
WARN_ON_ONCE(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
/*
- * tsk is a group leader and has no threads, wake up the
- * non-PIDFD_THREAD waiters.
+ * Notify for thread-group leaders without subthreads.
*/
if (thread_group_empty(tsk))
do_notify_pidfd(tsk);
@@ -4009,6 +4008,47 @@ static struct pid *pidfd_to_pid(const struct file *file)
(PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \
PIDFD_SIGNAL_PROCESS_GROUP)
+static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type,
+ siginfo_t __user *info, unsigned int flags)
+{
+ kernel_siginfo_t kinfo;
+
+ switch (flags) {
+ case PIDFD_SIGNAL_THREAD:
+ type = PIDTYPE_PID;
+ break;
+ case PIDFD_SIGNAL_THREAD_GROUP:
+ type = PIDTYPE_TGID;
+ break;
+ case PIDFD_SIGNAL_PROCESS_GROUP:
+ type = PIDTYPE_PGID;
+ break;
+ }
+
+ if (info) {
+ int ret;
+
+ ret = copy_siginfo_from_user_any(&kinfo, info);
+ if (unlikely(ret))
+ return ret;
+
+ if (unlikely(sig != kinfo.si_signo))
+ return -EINVAL;
+
+ /* Only allow sending arbitrary signals to yourself. */
+ if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
+ (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
+ return -EPERM;
+ } else {
+ prepare_kill_siginfo(sig, &kinfo, type);
+ }
+
+ if (type == PIDTYPE_PGID)
+ return kill_pgrp_info(sig, &kinfo, pid);
+
+ return kill_pid_info_type(sig, &kinfo, pid, type);
+}
+
/**
* sys_pidfd_send_signal - Signal a process through a pidfd
* @pidfd: file descriptor of the process
@@ -4026,9 +4066,7 @@ static struct pid *pidfd_to_pid(const struct file *file)
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
siginfo_t __user *, info, unsigned int, flags)
{
- int ret;
struct pid *pid;
- kernel_siginfo_t kinfo;
enum pid_type type;
/* Enforce flags be set to 0 until we add an extension. */
@@ -4039,57 +4077,39 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
return -EINVAL;
- CLASS(fd, f)(pidfd);
- if (fd_empty(f))
- return -EBADF;
+ switch (pidfd) {
+ case PIDFD_SELF_THREAD:
+ pid = get_task_pid(current, PIDTYPE_PID);
+ type = PIDTYPE_PID;
+ break;
+ case PIDFD_SELF_THREAD_GROUP:
+ pid = get_task_pid(current, PIDTYPE_TGID);
+ type = PIDTYPE_TGID;
+ break;
+ default: {
+ CLASS(fd, f)(pidfd);
+ if (fd_empty(f))
+ return -EBADF;
- /* Is this a pidfd? */
- pid = pidfd_to_pid(fd_file(f));
- if (IS_ERR(pid))
- return PTR_ERR(pid);
+ /* Is this a pidfd? */
+ pid = pidfd_to_pid(fd_file(f));
+ if (IS_ERR(pid))
+ return PTR_ERR(pid);
- if (!access_pidfd_pidns(pid))
- return -EINVAL;
+ if (!access_pidfd_pidns(pid))
+ return -EINVAL;
- switch (flags) {
- case 0:
/* Infer scope from the type of pidfd. */
if (fd_file(f)->f_flags & PIDFD_THREAD)
type = PIDTYPE_PID;
else
type = PIDTYPE_TGID;
- break;
- case PIDFD_SIGNAL_THREAD:
- type = PIDTYPE_PID;
- break;
- case PIDFD_SIGNAL_THREAD_GROUP:
- type = PIDTYPE_TGID;
- break;
- case PIDFD_SIGNAL_PROCESS_GROUP:
- type = PIDTYPE_PGID;
- break;
- }
- if (info) {
- ret = copy_siginfo_from_user_any(&kinfo, info);
- if (unlikely(ret))
- return ret;
-
- if (unlikely(sig != kinfo.si_signo))
- return -EINVAL;
-
- /* Only allow sending arbitrary signals to yourself. */
- if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
- (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
- return -EPERM;
- } else {
- prepare_kill_siginfo(sig, &kinfo, type);
+ return do_pidfd_send_signal(pid, sig, type, info, flags);
+ }
}
- if (type == PIDTYPE_PGID)
- return kill_pgrp_info(sig, &kinfo, pid);
- else
- return kill_pid_info_type(sig, &kinfo, pid, type);
+ return do_pidfd_send_signal(pid, sig, type, info, flags);
}
static int
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 8896d844d738..5d2d0562115b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -250,6 +250,7 @@ static int multi_cpu_stop(void *data)
* be detected and reported on their side.
*/
touch_nmi_watchdog();
+ /* Also suppress RCU CPU stall warnings. */
rcu_momentary_eqs();
}
} while (curstate != MULTI_STOP_EXIT);
diff --git a/kernel/sys.c b/kernel/sys.c
index cb366ff8703a..c434968e9f5d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1085,6 +1085,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
{
struct task_struct *p;
struct task_struct *group_leader = current->group_leader;
+ struct pid *pids[PIDTYPE_MAX] = { 0 };
struct pid *pgrp;
int err;
@@ -1142,13 +1143,14 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
goto out;
if (task_pgrp(p) != pgrp)
- change_pid(p, PIDTYPE_PGID, pgrp);
+ change_pid(pids, p, PIDTYPE_PGID, pgrp);
err = 0;
out:
/* All paths lead to here, thus we are safe. -DaveM */
write_unlock_irq(&tasklist_lock);
rcu_read_unlock();
+ free_pids(pids);
return err;
}
@@ -1222,21 +1224,22 @@ out:
return retval;
}
-static void set_special_pids(struct pid *pid)
+static void set_special_pids(struct pid **pids, struct pid *pid)
{
struct task_struct *curr = current->group_leader;
if (task_session(curr) != pid)
- change_pid(curr, PIDTYPE_SID, pid);
+ change_pid(pids, curr, PIDTYPE_SID, pid);
if (task_pgrp(curr) != pid)
- change_pid(curr, PIDTYPE_PGID, pid);
+ change_pid(pids, curr, PIDTYPE_PGID, pid);
}
int ksys_setsid(void)
{
struct task_struct *group_leader = current->group_leader;
struct pid *sid = task_pid(group_leader);
+ struct pid *pids[PIDTYPE_MAX] = { 0 };
pid_t session = pid_vnr(sid);
int err = -EPERM;
@@ -1252,13 +1255,14 @@ int ksys_setsid(void)
goto out;
group_leader->signal->leader = 1;
- set_special_pids(sid);
+ set_special_pids(pids, sid);
proc_clear_tty(group_leader);
err = session;
out:
write_unlock_irq(&tasklist_lock);
+ free_pids(pids);
if (err > 0) {
proc_sid_connector(group_leader);
sched_autogroup_create_attach(group_leader);
@@ -2811,6 +2815,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
return -EINVAL;
error = arch_lock_shadow_stack_status(me, arg2);
break;
+ case PR_TIMER_CREATE_RESTORE_IDS:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = posixtimer_create_prctl(arg2);
+ break;
default:
trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cb57da499ebb..4ebe6136b08d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -54,7 +54,6 @@
#include <linux/acpi.h>
#include <linux/reboot.h>
#include <linux/ftrace.h>
-#include <linux/perf_event.h>
#include <linux/oom.h>
#include <linux/kmod.h>
#include <linux/capability.h>
@@ -91,12 +90,6 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals);
#if defined(CONFIG_SYSCTL)
/* Constants used for minimum and maximum */
-
-#ifdef CONFIG_PERF_EVENTS
-static const int six_hundred_forty_kb = 640 * 1024;
-#endif
-
-
static const int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
@@ -1831,16 +1824,6 @@ static const struct ctl_table kern_table[] = {
.mode = 0444,
.proc_handler = proc_dointvec,
},
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
- {
- .procname = "unknown_nmi_panic",
- .data = &unknown_nmi_panic,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
-
#if (defined(CONFIG_X86_32) || defined(CONFIG_PARISC)) && \
defined(CONFIG_DEBUG_STACKOVERFLOW)
{
@@ -1851,43 +1834,6 @@ static const struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#if defined(CONFIG_X86)
- {
- .procname = "panic_on_unrecovered_nmi",
- .data = &panic_on_unrecovered_nmi,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "panic_on_io_nmi",
- .data = &panic_on_io_nmi,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "bootloader_type",
- .data = &bootloader_type,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "bootloader_version",
- .data = &bootloader_version,
- .maxlen = sizeof (int),
- .mode = 0444,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "io_delay_type",
- .data = &io_delay_type,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
-#endif
#if defined(CONFIG_MMU)
{
.procname = "randomize_va_space",
@@ -1906,15 +1852,6 @@ static const struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86)
- {
- .procname = "acpi_video_flags",
- .data = &acpi_realmode_flags,
- .maxlen = sizeof (unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
-#endif
#ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN
{
.procname = "ignore-unaligned-usertrap",
@@ -1933,63 +1870,6 @@ static const struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
-#ifdef CONFIG_PERF_EVENTS
- /*
- * User-space scripts rely on the existence of this file
- * as a feature check for perf_events being enabled.
- *
- * So it's an ABI, do not remove!
- */
- {
- .procname = "perf_event_paranoid",
- .data = &sysctl_perf_event_paranoid,
- .maxlen = sizeof(sysctl_perf_event_paranoid),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "perf_event_mlock_kb",
- .data = &sysctl_perf_event_mlock,
- .maxlen = sizeof(sysctl_perf_event_mlock),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "perf_event_max_sample_rate",
- .data = &sysctl_perf_event_sample_rate,
- .maxlen = sizeof(sysctl_perf_event_sample_rate),
- .mode = 0644,
- .proc_handler = perf_event_max_sample_rate_handler,
- .extra1 = SYSCTL_ONE,
- },
- {
- .procname = "perf_cpu_time_max_percent",
- .data = &sysctl_perf_cpu_time_max_percent,
- .maxlen = sizeof(sysctl_perf_cpu_time_max_percent),
- .mode = 0644,
- .proc_handler = perf_cpu_time_max_percent_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_HUNDRED,
- },
- {
- .procname = "perf_event_max_stack",
- .data = &sysctl_perf_event_max_stack,
- .maxlen = sizeof(sysctl_perf_event_max_stack),
- .mode = 0644,
- .proc_handler = perf_event_max_stack_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = (void *)&six_hundred_forty_kb,
- },
- {
- .procname = "perf_event_max_contexts_per_stack",
- .data = &sysctl_perf_event_max_contexts_per_stack,
- .maxlen = sizeof(sysctl_perf_event_max_contexts_per_stack),
- .mode = 0644,
- .proc_handler = perf_event_max_stack_handler,
- .extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE_THOUSAND,
- },
-#endif
{
.procname = "panic_on_warn",
.data = &panic_on_warn,
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index fe0ae82124fe..e6e9b85d4db5 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
+
+# Branch profiling isn't noinstr-safe
+ifdef CONFIG_TRACE_BRANCH_PROFILING
+CFLAGS_sched_clock.o += -DDISABLE_BRANCH_PROFILING
+endif
+
obj-y += time.o timer.o hrtimer.o sleep_timeout.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-y += timeconv.o timecounter.o alarmtimer.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2a7802ec480c..e0eeacbe2521 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -1510,7 +1510,7 @@ static int __init boot_override_clocksource(char* str)
{
mutex_lock(&clocksource_mutex);
if (str)
- strscpy(override_name, str, sizeof(override_name));
+ strscpy(override_name, str);
mutex_unlock(&clocksource_mutex);
return 1;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index deb1aa32814e..22376a1a75b9 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -117,16 +117,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.csd = CSD_INIT(retrigger_next_event, NULL)
};
-static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
- /* Make sure we catch unsupported clockids */
- [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
-
- [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
- [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
- [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
- [CLOCK_TAI] = HRTIMER_BASE_TAI,
-};
-
static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
{
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
@@ -1587,19 +1577,19 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
- if (likely(clock_id < MAX_CLOCKS)) {
- int base = hrtimer_clock_to_base_table[clock_id];
-
- if (likely(base != HRTIMER_MAX_CLOCK_BASES))
- return base;
+ switch (clock_id) {
+ case CLOCK_REALTIME:
+ return HRTIMER_BASE_REALTIME;
+ case CLOCK_MONOTONIC:
+ return HRTIMER_BASE_MONOTONIC;
+ case CLOCK_BOOTTIME:
+ return HRTIMER_BASE_BOOTTIME;
+ case CLOCK_TAI:
+ return HRTIMER_BASE_TAI;
+ default:
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return HRTIMER_BASE_MONOTONIC;
}
- WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
- return HRTIMER_BASE_MONOTONIC;
-}
-
-static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused)
-{
- return HRTIMER_NORESTART;
}
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 0775b9ec952a..e3642278df43 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -165,26 +165,26 @@ static struct timens_offset offset_from_ts(struct timespec64 off)
* HVCLOCK
* VVAR
*
- * The check for vdso_data->clock_mode is in the unlikely path of
+ * The check for vdso_clock->clock_mode is in the unlikely path of
* the seq begin magic. So for the non-timens case most of the time
* 'seq' is even, so the branch is not taken.
*
* If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
- * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
+ * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
* update to finish and for 'seq' to become even anyway.
*
- * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which
+ * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
* enforces the time namespace handling path.
*/
-static void timens_setup_vdso_data(struct vdso_data *vdata,
- struct time_namespace *ns)
+static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
+ struct time_namespace *ns)
{
- struct timens_offset *offset = vdata->offset;
+ struct timens_offset *offset = vc->offset;
struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
- vdata->seq = 1;
- vdata->clock_mode = VDSO_CLOCKMODE_TIMENS;
+ vc->seq = 1;
+ vc->clock_mode = VDSO_CLOCKMODE_TIMENS;
offset[CLOCK_MONOTONIC] = monotonic;
offset[CLOCK_MONOTONIC_RAW] = monotonic;
offset[CLOCK_MONOTONIC_COARSE] = monotonic;
@@ -219,7 +219,8 @@ static DEFINE_MUTEX(offset_lock);
static void timens_set_vvar_page(struct task_struct *task,
struct time_namespace *ns)
{
- struct vdso_data *vdata;
+ struct vdso_time_data *vdata;
+ struct vdso_clock *vc;
unsigned int i;
if (ns == &init_time_ns)
@@ -235,10 +236,11 @@ static void timens_set_vvar_page(struct task_struct *task,
goto out;
ns->frozen_offsets = true;
- vdata = arch_get_vdso_data(page_address(ns->vvar_page));
+ vdata = page_address(ns->vvar_page);
+ vc = vdata->clock_data;
for (i = 0; i < CS_BASES; i++)
- timens_setup_vdso_data(&vdata[i], ns);
+ timens_setup_vdso_clock_data(&vc[i], ns);
out:
mutex_unlock(&offset_lock);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 163e7a2033b6..b837d3d9d325 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -678,8 +678,7 @@ void ntp_notify_cmos_timer(bool offset_set)
static void __init ntp_init_cmos_sync(void)
{
- hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
- sync_hrtimer.function = sync_timer_callback;
+ hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS);
}
#else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */
static inline void __init ntp_init_cmos_sync(void) { }
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 1af0bb2cc45c..7f4e4fb7381e 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp,
return err;
}
-#ifdef CONFIG_COMPAT
-static long posix_clock_compat_ioctl(struct file *fp,
- unsigned int cmd, unsigned long arg)
-{
- struct posix_clock_context *pccontext = fp->private_data;
- struct posix_clock *clk = get_posix_clock(fp);
- int err = -ENOTTY;
-
- if (!clk)
- return -ENODEV;
-
- if (clk->ops.ioctl)
- err = clk->ops.ioctl(pccontext, cmd, arg);
-
- put_posix_clock(clk);
-
- return err;
-}
-#endif
-
static int posix_clock_open(struct inode *inode, struct file *fp)
{
int err;
@@ -171,11 +151,9 @@ static const struct file_operations posix_clock_file_operations = {
.read = posix_clock_read,
.poll = posix_clock_poll,
.unlocked_ioctl = posix_clock_ioctl,
+ .compat_ioctl = posix_clock_ioctl,
.open = posix_clock_open,
.release = posix_clock_release,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = posix_clock_compat_ioctl,
-#endif
};
int posix_clock_register(struct posix_clock *clk, struct device *dev)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 1b675aee99a9..6222112533a7 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -9,28 +9,23 @@
*
* These are all the functions necessary to implement POSIX clocks & timers
*/
-#include <linux/mm.h>
+#include <linux/compat.h>
+#include <linux/compiler.h>
+#include <linux/init.h>
+#include <linux/jhash.h>
#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/mutex.h>
-#include <linux/sched/task.h>
-
-#include <linux/uaccess.h>
#include <linux/list.h>
-#include <linux/init.h>
-#include <linux/compiler.h>
-#include <linux/hash.h>
+#include <linux/memblock.h>
+#include <linux/nospec.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
+#include <linux/prctl.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
#include <linux/syscalls.h>
-#include <linux/wait.h>
-#include <linux/workqueue.h>
-#include <linux/export.h>
-#include <linux/hashtable.h>
-#include <linux/compat.h>
-#include <linux/nospec.h>
+#include <linux/time.h>
#include <linux/time_namespace.h>
+#include <linux/uaccess.h>
#include "timekeeping.h"
#include "posix-timers.h"
@@ -46,39 +41,65 @@ static struct kmem_cache *posix_timers_cache;
* This allows checkpoint/restore to reconstruct the exact timer IDs for
* a process.
*/
-static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
-static DEFINE_SPINLOCK(hash_lock);
+struct timer_hash_bucket {
+ spinlock_t lock;
+ struct hlist_head head;
+};
+
+static struct {
+ struct timer_hash_bucket *buckets;
+ unsigned long mask;
+} __timer_data __ro_after_init __aligned(2*sizeof(long));
+
+#define timer_buckets (__timer_data.buckets)
+#define timer_hashmask (__timer_data.mask)
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;
+#define TIMER_ANY_ID INT_MIN
+
/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif
-static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
+static struct k_itimer *__lock_timer(timer_t timer_id);
-#define lock_timer(tid, flags) \
-({ struct k_itimer *__timr; \
- __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
- __timr; \
+#define lock_timer(tid) \
+({ struct k_itimer *__timr; \
+ __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid)); \
+ __timr; \
})
-static int hash(struct signal_struct *sig, unsigned int nr)
+static inline void unlock_timer(struct k_itimer *timr)
{
- return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
+ if (likely((timr)))
+ spin_unlock_irq(&timr->it_lock);
}
-static struct k_itimer *__posix_timers_find(struct hlist_head *head,
- struct signal_struct *sig,
- timer_t id)
+#define scoped_timer_get_or_fail(_id) \
+ scoped_cond_guard(lock_timer, return -EINVAL, _id)
+
+#define scoped_timer (scope)
+
+DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), __lock_timer(id), timer_t id);
+DEFINE_CLASS_IS_COND_GUARD(lock_timer);
+
+static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr)
{
+ return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask];
+}
+
+static struct k_itimer *posix_timer_by_id(timer_t id)
+{
+ struct signal_struct *sig = current->signal;
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
struct k_itimer *timer;
- hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) {
+ hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) {
/* timer->it_signal can be set concurrently */
if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id))
return timer;
@@ -86,46 +107,88 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head,
return NULL;
}
-static struct k_itimer *posix_timer_by_id(timer_t id)
+static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer)
{
- struct signal_struct *sig = current->signal;
- struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
+ unsigned long val = (unsigned long)timer->it_signal;
- return __posix_timers_find(head, sig, id);
+ /*
+ * Mask out bit 0, which acts as invalid marker to prevent
+ * posix_timer_by_id() detecting it as valid.
+ */
+ return (struct signal_struct *)(val & ~1UL);
}
-static int posix_timer_add(struct k_itimer *timer)
+static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig,
+ timer_t id)
{
- struct signal_struct *sig = current->signal;
- struct hlist_head *head;
- unsigned int cnt, id;
+ struct hlist_head *head = &bucket->head;
+ struct k_itimer *timer;
- /*
- * FIXME: Replace this by a per signal struct xarray once there is
- * a plan to handle the resulting CRIU regression gracefully.
- */
- for (cnt = 0; cnt <= INT_MAX; cnt++) {
- spin_lock(&hash_lock);
- id = sig->next_posix_timer_id;
+ hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) {
+ if ((posix_sig_owner(timer) == sig) && (timer->it_id == id))
+ return true;
+ }
+ return false;
+}
- /* Write the next ID back. Clamp it to the positive space */
- sig->next_posix_timer_id = (id + 1) & INT_MAX;
+static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id)
+{
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
- head = &posix_timers_hashtable[hash(sig, id)];
- if (!__posix_timers_find(head, sig, id)) {
- hlist_add_head_rcu(&timer->t_hash, head);
- spin_unlock(&hash_lock);
- return id;
+ scoped_guard (spinlock, &bucket->lock) {
+ /*
+ * Validate under the lock as this could have raced against
+ * another thread ending up with the same ID, which is
+ * highly unlikely, but possible.
+ */
+ if (!posix_timer_hashed(bucket, sig, id)) {
+ /*
+ * Set the timer ID and the signal pointer to make
+ * it identifiable in the hash table. The signal
+ * pointer has bit 0 set to indicate that it is not
+ * yet fully initialized. posix_timer_hashed()
+ * masks this bit out, but the syscall lookup fails
+ * to match due to it being set. This guarantees
+ * that there can't be duplicate timer IDs handed
+ * out.
+ */
+ timer->it_id = (timer_t)id;
+ timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
+ hlist_add_head_rcu(&timer->t_hash, &bucket->head);
+ return true;
}
- spin_unlock(&hash_lock);
}
- /* POSIX return code when no timer ID could be allocated */
- return -EAGAIN;
+ return false;
}
-static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
+static int posix_timer_add(struct k_itimer *timer, int req_id)
{
- spin_unlock_irqrestore(&timr->it_lock, flags);
+ struct signal_struct *sig = current->signal;
+
+ if (unlikely(req_id != TIMER_ANY_ID)) {
+ if (!posix_timer_add_at(timer, sig, req_id))
+ return -EBUSY;
+
+ /*
+ * Move the ID counter past the requested ID, so that after
+ * switching back to normal mode the IDs are outside of the
+ * exact allocated region. That avoids ID collisions on the
+ * next regular timer_create() invocations.
+ */
+ atomic_set(&sig->next_posix_timer_id, req_id + 1);
+ return req_id;
+ }
+
+ for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) {
+ /* Get the next timer ID and clamp it to positive space */
+ unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX;
+
+ if (posix_timer_add_at(timer, sig, id))
+ return id;
+ cond_resched();
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
@@ -222,9 +285,8 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
static __init int init_posix_timers(void)
{
- posix_timers_cache = kmem_cache_create("posix_timers_cache",
- sizeof(struct k_itimer), 0,
- SLAB_PANIC | SLAB_ACCOUNT, NULL);
+ posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer),
+ __alignof__(struct k_itimer), SLAB_ACCOUNT, NULL);
return 0;
}
__initcall(init_posix_timers);
@@ -259,7 +321,7 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it
* since the signal was queued. In either case, don't rearm and
* drop the signal.
*/
- if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal))
+ if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr)))
return false;
if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING))
@@ -304,6 +366,9 @@ void posix_timer_queue_signal(struct k_itimer *timr)
{
lockdep_assert_held(&timr->it_lock);
+ if (!posixtimer_valid(timr))
+ return;
+
timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED;
posixtimer_send_sigqueue(timr);
}
@@ -324,6 +389,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+long posixtimer_create_prctl(unsigned long ctrl)
+{
+ switch (ctrl) {
+ case PR_TIMER_CREATE_RESTORE_IDS_OFF:
+ current->signal->timer_create_restore_ids = 0;
+ return 0;
+ case PR_TIMER_CREATE_RESTORE_IDS_ON:
+ current->signal->timer_create_restore_ids = 1;
+ return 0;
+ case PR_TIMER_CREATE_RESTORE_IDS_GET:
+ return current->signal->timer_create_restore_ids;
+ }
+ return -EINVAL;
+}
+
static struct pid *good_sigevent(sigevent_t * event)
{
struct pid *pid = task_tgid(current);
@@ -350,8 +430,12 @@ static struct pid *good_sigevent(sigevent_t * event)
static struct k_itimer *alloc_posix_timer(void)
{
- struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
+ struct k_itimer *tmr;
+ if (unlikely(!posix_timers_cache))
+ return NULL;
+
+ tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
if (!tmr)
return tmr;
@@ -373,15 +457,16 @@ void posixtimer_free_timer(struct k_itimer *tmr)
static void posix_timer_unhash_and_free(struct k_itimer *tmr)
{
- spin_lock(&hash_lock);
- hlist_del_rcu(&tmr->t_hash);
- spin_unlock(&hash_lock);
+ struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id);
+
+ scoped_guard (spinlock, &bucket->lock)
+ hlist_del_rcu(&tmr->t_hash);
posixtimer_putref(tmr);
}
static int common_timer_create(struct k_itimer *new_timer)
{
- hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
+ hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0);
return 0;
}
@@ -390,6 +475,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
timer_t __user *created_timer_id)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
+ timer_t req_id = TIMER_ANY_ID;
struct k_itimer *new_timer;
int error, new_timer_id;
@@ -404,26 +490,32 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
spin_lock_init(&new_timer->it_lock);
+ /* Special case for CRIU to restore timers with a given timer ID. */
+ if (unlikely(current->signal->timer_create_restore_ids)) {
+ if (copy_from_user(&req_id, created_timer_id, sizeof(req_id)))
+ return -EFAULT;
+ /* Valid IDs are 0..INT_MAX */
+ if ((unsigned int)req_id > INT_MAX)
+ return -EINVAL;
+ }
+
/*
* Add the timer to the hash table. The timer is not yet valid
- * because new_timer::it_signal is still NULL. The timer id is also
- * not yet visible to user space.
+ * after insertion, but has a unique ID allocated.
*/
- new_timer_id = posix_timer_add(new_timer);
+ new_timer_id = posix_timer_add(new_timer, req_id);
if (new_timer_id < 0) {
posixtimer_free_timer(new_timer);
return new_timer_id;
}
- new_timer->it_id = (timer_t) new_timer_id;
new_timer->it_clock = which_clock;
new_timer->kclock = kc;
new_timer->it_overrun = -1LL;
if (event) {
- rcu_read_lock();
- new_timer->it_pid = get_pid(good_sigevent(event));
- rcu_read_unlock();
+ scoped_guard (rcu)
+ new_timer->it_pid = get_pid(good_sigevent(event));
if (!new_timer->it_pid) {
error = -EINVAL;
goto out;
@@ -434,7 +526,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
} else {
new_timer->it_sigev_notify = SIGEV_SIGNAL;
new_timer->sigq.info.si_signo = SIGALRM;
- memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t));
new_timer->sigq.info.si_value.sival_int = new_timer->it_id;
new_timer->it_pid = get_pid(task_tgid(current));
}
@@ -453,7 +544,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
}
/*
* After succesful copy out, the timer ID is visible to user space
- * now but not yet valid because new_timer::signal is still NULL.
+ * now but not yet valid because new_timer::signal low order bit is 1.
*
* Complete the initialization with the clock specific create
* callback.
@@ -462,14 +553,25 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
if (error)
goto out;
- spin_lock_irq(&current->sighand->siglock);
- /* This makes the timer valid in the hash table */
- WRITE_ONCE(new_timer->it_signal, current->signal);
- hlist_add_head(&new_timer->list, &current->signal->posix_timers);
- spin_unlock_irq(&current->sighand->siglock);
/*
- * After unlocking sighand::siglock @new_timer is subject to
- * concurrent removal and cannot be touched anymore
+ * timer::it_lock ensures that __lock_timer() observes a fully
+ * initialized timer when it observes a valid timer::it_signal.
+ *
+ * sighand::siglock is required to protect signal::posix_timers.
+ */
+ scoped_guard (spinlock_irq, &new_timer->it_lock) {
+ guard(spinlock)(&current->sighand->siglock);
+ /*
+ * new_timer::it_signal contains the signal pointer with
+ * bit 0 set, which makes it invalid for syscall operations.
+ * Store the unmodified signal pointer to make it valid.
+ */
+ WRITE_ONCE(new_timer->it_signal, current->signal);
+ hlist_add_head_rcu(&new_timer->list, &current->signal->posix_timers);
+ }
+ /*
+ * After unlocking @new_timer is subject to concurrent removal and
+ * cannot be touched anymore
*/
return 0;
out:
@@ -507,7 +609,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
}
#endif
-static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
+static struct k_itimer *__lock_timer(timer_t timer_id)
{
struct k_itimer *timr;
@@ -522,11 +624,11 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
* The hash lookup and the timers are RCU protected.
*
* Timers are added to the hash in invalid state where
- * timr::it_signal == NULL. timer::it_signal is only set after the
- * rest of the initialization succeeded.
+ * timr::it_signal is marked invalid. timer::it_signal is only set
+ * after the rest of the initialization succeeded.
*
* Timer destruction happens in steps:
- * 1) Set timr::it_signal to NULL with timr::it_lock held
+ * 1) Set timr::it_signal marked invalid with timr::it_lock held
* 2) Release timr::it_lock
* 3) Remove from the hash under hash_lock
* 4) Put the reference count.
@@ -543,25 +645,21 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
*
* The lookup validates locklessly that timr::it_signal ==
* current::it_signal and timr::it_id == @timer_id. timr::it_id
- * can't change, but timr::it_signal becomes NULL during
- * destruction.
+ * can't change, but timr::it_signal can become invalid during
+ * destruction, which makes the locked check fail.
*/
- rcu_read_lock();
+ guard(rcu)();
timr = posix_timer_by_id(timer_id);
if (timr) {
- spin_lock_irqsave(&timr->it_lock, *flags);
+ spin_lock_irq(&timr->it_lock);
/*
* Validate under timr::it_lock that timr::it_signal is
* still valid. Pairs with #1 above.
*/
- if (timr->it_signal == current->signal) {
- rcu_read_unlock();
+ if (timr->it_signal == current->signal)
return timr;
- }
- spin_unlock_irqrestore(&timr->it_lock, *flags);
+ spin_unlock_irq(&timr->it_lock);
}
- rcu_read_unlock();
-
return NULL;
}
@@ -652,24 +750,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
{
- const struct k_clock *kc;
- struct k_itimer *timr;
- unsigned long flags;
- int ret = 0;
-
- timr = lock_timer(timer_id, &flags);
- if (!timr)
- return -EINVAL;
-
memset(setting, 0, sizeof(*setting));
- kc = timr->kclock;
- if (WARN_ON_ONCE(!kc || !kc->timer_get))
- ret = -EINVAL;
- else
- kc->timer_get(timr, setting);
-
- unlock_timer(timr, flags);
- return ret;
+ scoped_timer_get_or_fail(timer_id)
+ scoped_timer->kclock->timer_get(scoped_timer, setting);
+ return 0;
}
/* Get the time remaining on a POSIX.1b interval timer. */
@@ -723,18 +807,8 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
*/
SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
{
- struct k_itimer *timr;
- unsigned long flags;
- int overrun;
-
- timr = lock_timer(timer_id, &flags);
- if (!timr)
- return -EINVAL;
-
- overrun = timer_overrun_to_int(timr);
- unlock_timer(timr, flags);
-
- return overrun;
+ scoped_timer_get_or_fail(timer_id)
+ return timer_overrun_to_int(scoped_timer);
}
static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
@@ -747,7 +821,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
/*
* Posix magic: Relative CLOCK_REALTIME timers are not affected by
* clock modifications, so they become CLOCK_MONOTONIC based under the
- * hood. See hrtimer_init(). Update timr->kclock, so the generic
+ * hood. See hrtimer_setup(). Update timr->kclock, so the generic
* functions which use timr->kclock->clock_get_*() work.
*
* Note: it_clock stays unmodified, because the next timer_set() might
@@ -756,8 +830,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
if (timr->it_clock == CLOCK_REALTIME)
timr->kclock = absolute ? &clock_realtime : &clock_monotonic;
- hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
- timr->it.real.timer.function = posix_timer_fn;
+ hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode);
if (!absolute)
expires = ktime_add_safe(expires, timer->base->get_time());
@@ -791,26 +864,13 @@ static void common_timer_wait_running(struct k_itimer *timer)
* when the task which tries to delete or disarm the timer has preempted
* the task which runs the expiry in task work context.
*/
-static struct k_itimer *timer_wait_running(struct k_itimer *timer,
- unsigned long *flags)
+static void timer_wait_running(struct k_itimer *timer)
{
- const struct k_clock *kc = READ_ONCE(timer->kclock);
- timer_t timer_id = READ_ONCE(timer->it_id);
-
- /* Prevent kfree(timer) after dropping the lock */
- rcu_read_lock();
- unlock_timer(timer, *flags);
-
/*
* kc->timer_wait_running() might drop RCU lock. So @timer
* cannot be touched anymore after the function returns!
*/
- if (!WARN_ON_ONCE(!kc->timer_wait_running))
- kc->timer_wait_running(timer);
-
- rcu_read_unlock();
- /* Relock the timer. It might be not longer hashed. */
- return lock_timer(timer_id, flags);
+ timer->kclock->timer_wait_running(timer);
}
/*
@@ -865,15 +925,9 @@ int common_timer_set(struct k_itimer *timr, int flags,
return 0;
}
-static int do_timer_settime(timer_t timer_id, int tmr_flags,
- struct itimerspec64 *new_spec64,
+static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64,
struct itimerspec64 *old_spec64)
{
- const struct k_clock *kc;
- struct k_itimer *timr;
- unsigned long flags;
- int error;
-
if (!timespec64_valid(&new_spec64->it_interval) ||
!timespec64_valid(&new_spec64->it_value))
return -EINVAL;
@@ -881,33 +935,28 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags,
if (old_spec64)
memset(old_spec64, 0, sizeof(*old_spec64));
- timr = lock_timer(timer_id, &flags);
-retry:
- if (!timr)
- return -EINVAL;
+ for (; ; old_spec64 = NULL) {
+ struct k_itimer *timr;
- if (old_spec64)
- old_spec64->it_interval = ktime_to_timespec64(timr->it_interval);
+ scoped_timer_get_or_fail(timer_id) {
+ timr = scoped_timer;
- /* Prevent signal delivery and rearming. */
- timr->it_signal_seq++;
+ if (old_spec64)
+ old_spec64->it_interval = ktime_to_timespec64(timr->it_interval);
- kc = timr->kclock;
- if (WARN_ON_ONCE(!kc || !kc->timer_set))
- error = -EINVAL;
- else
- error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64);
-
- if (error == TIMER_RETRY) {
- // We already got the old time...
- old_spec64 = NULL;
- /* Unlocks and relocks the timer if it still exists */
- timr = timer_wait_running(timr, &flags);
- goto retry;
- }
- unlock_timer(timr, flags);
+ /* Prevent signal delivery and rearming. */
+ timr->it_signal_seq++;
- return error;
+ int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64);
+ if (ret != TIMER_RETRY)
+ return ret;
+
+ /* Protect the timer from being freed when leaving the lock scope */
+ rcu_read_lock();
+ }
+ timer_wait_running(timr);
+ rcu_read_unlock();
+ }
}
/* Set a POSIX.1b interval timer */
@@ -978,110 +1027,58 @@ static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr)
}
}
-static inline int timer_delete_hook(struct k_itimer *timer)
+static void posix_timer_delete(struct k_itimer *timer)
{
- const struct k_clock *kc = timer->kclock;
-
- /* Prevent signal delivery and rearming. */
+ /*
+ * Invalidate the timer, remove it from the linked list and remove
+ * it from the ignored list if pending.
+ *
+ * The invalidation must be written with siglock held so that the
+ * signal code observes the invalidated timer::it_signal in
+ * do_sigaction(), which prevents it from moving a pending signal
+ * of a deleted timer to the ignore list.
+ *
+ * The invalidation also prevents signal queueing, signal delivery
+ * and therefore rearming from the signal delivery path.
+ *
+ * A concurrent lookup can still find the timer in the hash, but it
+ * will check timer::it_signal with timer::it_lock held and observe
+ * bit 0 set, which invalidates it. That also prevents the timer ID
+ * from being handed out before this timer is completely gone.
+ */
timer->it_signal_seq++;
- if (WARN_ON_ONCE(!kc || !kc->timer_del))
- return -EINVAL;
- return kc->timer_del(timer);
+ scoped_guard (spinlock, &current->sighand->siglock) {
+ unsigned long sig = (unsigned long)timer->it_signal | 1UL;
+
+ WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig);
+ hlist_del_rcu(&timer->list);
+ posix_timer_cleanup_ignored(timer);
+ }
+
+ while (timer->kclock->timer_del(timer) == TIMER_RETRY) {
+ guard(rcu)();
+ spin_unlock_irq(&timer->it_lock);
+ timer_wait_running(timer);
+ spin_lock_irq(&timer->it_lock);
+ }
}
/* Delete a POSIX.1b interval timer. */
SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
{
struct k_itimer *timer;
- unsigned long flags;
- timer = lock_timer(timer_id, &flags);
-
-retry_delete:
- if (!timer)
- return -EINVAL;
-
- if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) {
- /* Unlocks and relocks the timer if it still exists */
- timer = timer_wait_running(timer, &flags);
- goto retry_delete;
+ scoped_timer_get_or_fail(timer_id) {
+ timer = scoped_timer;
+ posix_timer_delete(timer);
}
-
- spin_lock(&current->sighand->siglock);
- hlist_del(&timer->list);
- posix_timer_cleanup_ignored(timer);
- /*
- * A concurrent lookup could check timer::it_signal lockless. It
- * will reevaluate with timer::it_lock held and observe the NULL.
- *
- * It must be written with siglock held so that the signal code
- * observes timer->it_signal == NULL in do_sigaction(SIG_IGN),
- * which prevents it from moving a pending signal of a deleted
- * timer to the ignore list.
- */
- WRITE_ONCE(timer->it_signal, NULL);
- spin_unlock(&current->sighand->siglock);
-
- unlock_timer(timer, flags);
+ /* Remove it from the hash, which frees up the timer ID */
posix_timer_unhash_and_free(timer);
return 0;
}
/*
- * Delete a timer if it is armed, remove it from the hash and schedule it
- * for RCU freeing.
- */
-static void itimer_delete(struct k_itimer *timer)
-{
- unsigned long flags;
-
- /*
- * irqsave is required to make timer_wait_running() work.
- */
- spin_lock_irqsave(&timer->it_lock, flags);
-
-retry_delete:
- /*
- * Even if the timer is not longer accessible from other tasks
- * it still might be armed and queued in the underlying timer
- * mechanism. Worse, that timer mechanism might run the expiry
- * function concurrently.
- */
- if (timer_delete_hook(timer) == TIMER_RETRY) {
- /*
- * Timer is expired concurrently, prevent livelocks
- * and pointless spinning on RT.
- *
- * timer_wait_running() drops timer::it_lock, which opens
- * the possibility for another task to delete the timer.
- *
- * That's not possible here because this is invoked from
- * do_exit() only for the last thread of the thread group.
- * So no other task can access and delete that timer.
- */
- if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer))
- return;
-
- goto retry_delete;
- }
- hlist_del(&timer->list);
-
- posix_timer_cleanup_ignored(timer);
-
- /*
- * Setting timer::it_signal to NULL is technically not required
- * here as nothing can access the timer anymore legitimately via
- * the hash table. Set it to NULL nevertheless so that all deletion
- * paths are consistent.
- */
- WRITE_ONCE(timer->it_signal, NULL);
-
- spin_unlock_irqrestore(&timer->it_lock, flags);
- posix_timer_unhash_and_free(timer);
-}
-
-/*
* Invoked from do_exit() when the last thread of a thread group exits.
* At that point no other task can access the timers of the dying
* task anymore.
@@ -1089,18 +1086,26 @@ retry_delete:
void exit_itimers(struct task_struct *tsk)
{
struct hlist_head timers;
+ struct hlist_node *next;
+ struct k_itimer *timer;
+
+ /* Clear restore mode for exec() */
+ tsk->signal->timer_create_restore_ids = 0;
if (hlist_empty(&tsk->signal->posix_timers))
return;
/* Protect against concurrent read via /proc/$PID/timers */
- spin_lock_irq(&tsk->sighand->siglock);
- hlist_move_list(&tsk->signal->posix_timers, &timers);
- spin_unlock_irq(&tsk->sighand->siglock);
+ scoped_guard (spinlock_irq, &tsk->sighand->siglock)
+ hlist_move_list(&tsk->signal->posix_timers, &timers);
/* The timers are not longer accessible via tsk::signal */
- while (!hlist_empty(&timers))
- itimer_delete(hlist_entry(timers.first, struct k_itimer, list));
+ hlist_for_each_entry_safe(timer, next, &timers, list) {
+ scoped_guard (spinlock_irq, &timer->it_lock)
+ posix_timer_delete(timer);
+ posix_timer_unhash_and_free(timer);
+ cond_resched();
+ }
/*
* There should be no timers on the ignored list. itimer_delete() has
@@ -1545,3 +1550,26 @@ static const struct k_clock *clockid_to_kclock(const clockid_t id)
return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))];
}
+
+static int __init posixtimer_init(void)
+{
+ unsigned long i, size;
+ unsigned int shift;
+
+ if (IS_ENABLED(CONFIG_BASE_SMALL))
+ size = 512;
+ else
+ size = roundup_pow_of_two(512 * num_possible_cpus());
+
+ timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets),
+ size, 0, 0, &shift, NULL, size, size);
+ size = 1UL << shift;
+ timer_hashmask = size - 1;
+
+ for (i = 0; i < size; i++) {
+ spin_lock_init(&timer_buckets[i].lock);
+ INIT_HLIST_HEAD(&timer_buckets[i].head);
+ }
+ return 0;
+}
+core_initcall(posixtimer_init);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index fcca4e72f1ef..cc15fe293719 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -263,8 +263,7 @@ void __init generic_sched_clock_init(void)
* Start the timer to keep sched_clock() properly updated and
* sets the initial epoch.
*/
- hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- sched_clock_timer.function = sched_clock_poll;
+ hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
}
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index e28f9210f8a1..a88b72b0f35e 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -100,7 +100,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
void tick_setup_hrtimer_broadcast(void)
{
- hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- bctimer.function = bc_handler;
+ hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
clockevents_register_device(&ce_broadcast_hrtimer);
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fa058510af9c..c527b421c865 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1573,12 +1573,10 @@ void tick_setup_sched_timer(bool hrtimer)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
/* Emulate tick processing via per-CPU hrtimers: */
- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+ hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) {
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
tick_sched_flag_set(ts, TS_FLAG_HIGHRES);
- ts->sched_timer.function = tick_nohz_handler;
- }
/* Get the next period (per-CPU) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1e67d076f195..929846b8b45a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -682,20 +682,19 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act
}
/**
- * timekeeping_forward_now - update clock to the current time
+ * timekeeping_forward - update clock to given cycle now value
* @tk: Pointer to the timekeeper to update
+ * @cycle_now: Current clocksource read value
*
* Forward the current clock to update its state since the last call to
* update_wall_time(). This is useful before significant clock changes,
* as it avoids having to deal with this time offset explicitly.
*/
-static void timekeeping_forward_now(struct timekeeper *tk)
+static void timekeeping_forward(struct timekeeper *tk, u64 cycle_now)
{
- u64 cycle_now, delta;
+ u64 delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
+ tk->tkr_mono.clock->max_raw_delta);
- cycle_now = tk_clock_read(&tk->tkr_mono);
- delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
- tk->tkr_mono.clock->max_raw_delta);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
@@ -711,6 +710,21 @@ static void timekeeping_forward_now(struct timekeeper *tk)
}
/**
+ * timekeeping_forward_now - update clock to the current time
+ * @tk: Pointer to the timekeeper to update
+ *
+ * Forward the current clock to update its state since the last call to
+ * update_wall_time(). This is useful before significant clock changes,
+ * as it avoids having to deal with this time offset explicitly.
+ */
+static void timekeeping_forward_now(struct timekeeper *tk)
+{
+ u64 cycle_now = tk_clock_read(&tk->tkr_mono);
+
+ timekeeping_forward(tk, cycle_now);
+}
+
+/**
* ktime_get_real_ts64 - Returns the time of day in a timespec64.
* @ts: pointer to the timespec to be set
*
@@ -2151,6 +2165,54 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
return offset;
}
+static u64 timekeeping_accumulate(struct timekeeper *tk, u64 offset,
+ enum timekeeping_adv_mode mode,
+ unsigned int *clock_set)
+{
+ int shift = 0, maxshift;
+
+ /*
+ * TK_ADV_FREQ indicates that adjtimex(2) directly set the
+ * frequency or the tick length.
+ *
+ * Accumulate the offset, so that the new multiplier starts from
+ * now. This is required as otherwise for offsets, which are
+ * smaller than tk::cycle_interval, timekeeping_adjust() could set
+ * xtime_nsec backwards, which subsequently causes time going
+ * backwards in the coarse time getters. But even for the case
+ * where offset is greater than tk::cycle_interval the periodic
+ * accumulation does not have much value.
+ *
+ * Also reset tk::ntp_error as it does not make sense to keep the
+ * old accumulated error around in this case.
+ */
+ if (mode == TK_ADV_FREQ) {
+ timekeeping_forward(tk, tk->tkr_mono.cycle_last + offset);
+ tk->ntp_error = 0;
+ return 0;
+ }
+
+ /*
+ * With NO_HZ we may have to accumulate many cycle_intervals
+ * (think "ticks") worth of time at once. To do this efficiently,
+ * we calculate the largest doubling multiple of cycle_intervals
+ * that is smaller than the offset. We then accumulate that
+ * chunk in one go, and then try to consume the next smaller
+ * doubled multiple.
+ */
+ shift = ilog2(offset) - ilog2(tk->cycle_interval);
+ shift = max(0, shift);
+ /* Bound shift to one less than what overflows tick_length */
+ maxshift = (64 - (ilog2(ntp_tick_length()) + 1)) - 1;
+ shift = min(shift, maxshift);
+ while (offset >= tk->cycle_interval) {
+ offset = logarithmic_accumulation(tk, offset, shift, clock_set);
+ if (offset < tk->cycle_interval << shift)
+ shift--;
+ }
+ return offset;
+}
+
/*
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
@@ -2160,7 +2222,6 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
struct timekeeper *tk = &tk_core.shadow_timekeeper;
struct timekeeper *real_tk = &tk_core.timekeeper;
unsigned int clock_set = 0;
- int shift = 0, maxshift;
u64 offset;
guard(raw_spinlock_irqsave)(&tk_core.lock);
@@ -2177,24 +2238,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
return false;
- /*
- * With NO_HZ we may have to accumulate many cycle_intervals
- * (think "ticks") worth of time at once. To do this efficiently,
- * we calculate the largest doubling multiple of cycle_intervals
- * that is smaller than the offset. We then accumulate that
- * chunk in one go, and then try to consume the next smaller
- * doubled multiple.
- */
- shift = ilog2(offset) - ilog2(tk->cycle_interval);
- shift = max(0, shift);
- /* Bound shift to one less than what overflows tick_length */
- maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
- shift = min(shift, maxshift);
- while (offset >= tk->cycle_interval) {
- offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
- if (offset < tk->cycle_interval<<shift)
- shift--;
- }
+ offset = timekeeping_accumulate(tk, offset, mode, &clock_set);
/* Adjust the multiplier to correct NTP error */
timekeeping_adjust(tk, offset);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1c311c46da50..cfbb46cc4e76 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -46,7 +46,7 @@ static void
print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
- SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function);
+ SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function);
SEQ_printf(m, ", S:%02x", timer->state);
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
@@ -98,7 +98,7 @@ next_one:
static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
- SEQ_printf(m, " .base: %pK\n", base);
+ SEQ_printf(m, " .base: %p\n", base);
SEQ_printf(m, " .index: %d\n", base->index);
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 05d383143165..01c2ab1e8971 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -15,29 +15,29 @@
#include "timekeeping_internal.h"
-static inline void update_vdso_data(struct vdso_data *vdata,
- struct timekeeper *tk)
+static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk)
{
+ struct vdso_clock *vc = vdata->clock_data;
struct vdso_timestamp *vdso_ts;
u64 nsec, sec;
- vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
+ vc[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
- vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles;
+ vc[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles;
#endif
- vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
- vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
- vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
- vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
+ vc[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
+ vc[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
+ vc[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
+ vc[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
- vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles;
+ vc[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles;
#endif
- vdata[CS_RAW].mask = tk->tkr_raw.mask;
- vdata[CS_RAW].mult = tk->tkr_raw.mult;
- vdata[CS_RAW].shift = tk->tkr_raw.shift;
+ vc[CS_RAW].mask = tk->tkr_raw.mask;
+ vc[CS_RAW].mult = tk->tkr_raw.mult;
+ vc[CS_RAW].shift = tk->tkr_raw.shift;
/* CLOCK_MONOTONIC */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
nsec = tk->tkr_mono.xtime_nsec;
@@ -55,7 +55,7 @@ static inline void update_vdso_data(struct vdso_data *vdata,
nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift;
/* CLOCK_BOOTTIME */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
vdso_ts->sec = sec;
while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
@@ -65,19 +65,20 @@ static inline void update_vdso_data(struct vdso_data *vdata,
vdso_ts->nsec = nsec;
/* CLOCK_MONOTONIC_RAW */
- vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
+ vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
vdso_ts->sec = tk->raw_sec;
vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
/* CLOCK_TAI */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI];
vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
}
void update_vsyscall(struct timekeeper *tk)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
+ struct vdso_clock *vc = vdata->clock_data;
struct vdso_timestamp *vdso_ts;
s32 clock_mode;
u64 nsec;
@@ -86,21 +87,21 @@ void update_vsyscall(struct timekeeper *tk)
vdso_write_begin(vdata);
clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
- vdata[CS_HRES_COARSE].clock_mode = clock_mode;
- vdata[CS_RAW].clock_mode = clock_mode;
+ vc[CS_HRES_COARSE].clock_mode = clock_mode;
+ vc[CS_RAW].clock_mode = clock_mode;
/* CLOCK_REALTIME also required for time() */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
vdso_ts->sec = tk->xtime_sec;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
/* CLOCK_REALTIME_COARSE */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
vdso_ts->sec = tk->xtime_sec;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
/* CLOCK_MONOTONIC_COARSE */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
@@ -108,32 +109,31 @@ void update_vsyscall(struct timekeeper *tk)
/*
* Read without the seqlock held by clock_getres().
- * Note: No need to have a second copy.
*/
- WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
+ WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution);
/*
* If the current clocksource is not VDSO capable, then spare the
* update of the high resolution parts.
*/
if (clock_mode != VDSO_CLOCKMODE_NONE)
- update_vdso_data(vdata, tk);
+ update_vdso_time_data(vdata, tk);
__arch_update_vsyscall(vdata);
vdso_write_end(vdata);
- __arch_sync_vdso_data(vdata);
+ __arch_sync_vdso_time_data(vdata);
}
void update_vsyscall_tz(void)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
- vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest;
- vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime;
+ vdata->tz_minuteswest = sys_tz.tz_minuteswest;
+ vdata->tz_dsttime = sys_tz.tz_dsttime;
- __arch_sync_vdso_data(vdata);
+ __arch_sync_vdso_time_data(vdata);
}
/**
@@ -150,7 +150,7 @@ void update_vsyscall_tz(void)
*/
unsigned long vdso_update_begin(void)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
unsigned long flags = timekeeper_lock_irqsave();
vdso_write_begin(vdata);
@@ -167,9 +167,9 @@ unsigned long vdso_update_begin(void)
*/
void vdso_update_end(unsigned long flags)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
vdso_write_end(vdata);
- __arch_sync_vdso_data(vdata);
+ __arch_sync_vdso_time_data(vdata);
timekeeper_unlock_irqrestore(flags);
}
diff --git a/kernel/torture.c b/kernel/torture.c
index dede150aef01..3a0a8cc60401 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -792,6 +792,8 @@ static void torture_stutter_cleanup(void)
stutter_task = NULL;
}
+static unsigned long torture_init_jiffies;
+
static void
torture_print_module_parms(void)
{
@@ -821,6 +823,7 @@ bool torture_init_begin(char *ttype, int v)
torture_type = ttype;
verbose = v;
fullstop = FULLSTOP_DONTSTOP;
+ WRITE_ONCE(torture_init_jiffies, jiffies); // Lockless reads.
torture_print_module_parms();
return true;
}
@@ -837,6 +840,15 @@ void torture_init_end(void)
EXPORT_SYMBOL_GPL(torture_init_end);
/*
+ * Get the torture_init_begin()-time value of the jiffies counter.
+ */
+unsigned long get_torture_init_jiffies(void)
+{
+ return READ_ONCE(torture_init_jiffies);
+}
+EXPORT_SYMBOL_GPL(get_torture_init_jiffies);
+
+/*
* Clean up torture module. Please note that this is -not- invoked via
* the usual module_exit() mechanism, but rather by an explicit call from
* the client torture module. Returns true if a race with system shutdown
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index adc947587eb8..997fb2a47c92 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1038,27 +1038,14 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_tracing = {
.arg1_type = ARG_PTR_TO_CTX,
};
-#ifdef CONFIG_X86_KERNEL_IBT
-static unsigned long get_entry_ip(unsigned long fentry_ip)
+static inline unsigned long get_entry_ip(unsigned long fentry_ip)
{
- u32 instr;
-
- /* We want to be extra safe in case entry ip is on the page edge,
- * but otherwise we need to avoid get_kernel_nofault()'s overhead.
- */
- if ((fentry_ip & ~PAGE_MASK) < ENDBR_INSN_SIZE) {
- if (get_kernel_nofault(instr, (u32 *)(fentry_ip - ENDBR_INSN_SIZE)))
- return fentry_ip;
- } else {
- instr = *(u32 *)(fentry_ip - ENDBR_INSN_SIZE);
- }
- if (is_endbr(instr))
+#ifdef CONFIG_X86_KERNEL_IBT
+ if (is_endbr((void *)(fentry_ip - ENDBR_INSN_SIZE)))
fentry_ip -= ENDBR_INSN_SIZE;
+#endif
return fentry_ip;
}
-#else
-#define get_entry_ip(fentry_ip) fentry_ip
-#endif
BPF_CALL_1(bpf_get_func_ip_kprobe, struct pt_regs *, regs)
{
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0e6d517e74e0..fd3cb2b2ab82 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4100,12 +4100,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
entries,
total,
buf->cpu,
- preempt_model_none() ? "server" :
- preempt_model_voluntary() ? "desktop" :
- preempt_model_full() ? "preempt" :
- preempt_model_lazy() ? "lazy" :
- preempt_model_rt() ? "preempt_rt" :
- "unknown",
+ preempt_model_str(),
/* These are reserved for later use */
0, 0, 0, 0);
#ifdef CONFIG_SMP
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 3ff9caa4a71b..a6bb7577e8c5 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -49,7 +49,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event)) {
- ret = perf_allow_tracepoint(&p_event->attr);
+ ret = perf_allow_tracepoint();
if (ret)
return ret;
@@ -86,7 +86,7 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
* ...otherwise raw tracepoint data can be a severe data leak,
* only allow root to have these.
*/
- ret = perf_allow_tracepoint(&p_event->attr);
+ ret = perf_allow_tracepoint();
if (ret)
return ret;
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index f3a2722ee4c0..92e16f03fa4e 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1542,27 +1542,25 @@ static int run_osnoise(void)
/*
* In some cases, notably when running on a nohz_full CPU with
- * a stopped tick PREEMPT_RCU has no way to account for QSs.
- * This will eventually cause unwarranted noise as PREEMPT_RCU
- * will force preemption as the means of ending the current
- * grace period. We avoid this problem by calling
- * rcu_momentary_eqs(), which performs a zero duration
- * EQS allowing PREEMPT_RCU to end the current grace period.
- * This call shouldn't be wrapped inside an RCU critical
- * section.
+ * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to
+ * account for QSs. This will eventually cause unwarranted
+ * noise as RCU forces preemption as the means of ending the
+ * current grace period. We avoid this by calling
+ * rcu_momentary_eqs(), which performs a zero duration EQS
+ * allowing RCU to end the current grace period. This call
+ * shouldn't be wrapped inside an RCU critical section.
*
- * Note that in non PREEMPT_RCU kernels QSs are handled through
- * cond_resched()
+ * Normally QSs for other cases are handled through cond_resched().
+ * For simplicity, however, we call rcu_momentary_eqs() for all
+ * configurations here.
*/
- if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
- if (!disable_irq)
- local_irq_disable();
+ if (!disable_irq)
+ local_irq_disable();
- rcu_momentary_eqs();
+ rcu_momentary_eqs();
- if (!disable_irq)
- local_irq_enable();
- }
+ if (!disable_irq)
+ local_irq_enable();
/*
* For the non-preemptive kernel config: let threads runs, if
@@ -1901,8 +1899,7 @@ static int timerlat_main(void *data)
tlat->count = 0;
tlat->tracing_thread = false;
- hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- tlat->timer.function = timerlat_irq;
+ hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
tlat->kthread = current;
osn_var->pid = current->pid;
/*
@@ -2456,8 +2453,7 @@ static int timerlat_fd_open(struct inode *inode, struct file *file)
tlat = this_cpu_tmr_var();
tlat->count = 0;
- hrtimer_init(&tlat->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
- tlat->timer.function = timerlat_irq;
+ hrtimer_setup(&tlat->timer, timerlat_irq, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED_HARD);
migrate_enable();
return 0;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index aa0b2e47f2f2..682f40d5632d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -238,7 +238,7 @@ EXPORT_SYMBOL(__put_user_ns);
struct idmap_key {
bool map_up; /* true -> id from kid; false -> kid from id */
u32 id; /* id to find */
- u32 count; /* == 0 unless used with map_id_range_down() */
+ u32 count;
};
/*
@@ -343,16 +343,19 @@ u32 map_id_down(struct uid_gid_map *map, u32 id)
* UID_GID_MAP_MAX_BASE_EXTENTS.
*/
static struct uid_gid_extent *
-map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
+map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
unsigned idx;
- u32 first, last;
+ u32 first, last, id2;
+
+ id2 = id + count - 1;
/* Find the matching extent */
for (idx = 0; idx < extents; idx++) {
first = map->extent[idx].lower_first;
last = first + map->extent[idx].count - 1;
- if (id >= first && id <= last)
+ if (id >= first && id <= last &&
+ (id2 >= first && id2 <= last))
return &map->extent[idx];
}
return NULL;
@@ -363,28 +366,28 @@ map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
* Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
*/
static struct uid_gid_extent *
-map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
+map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
struct idmap_key key;
key.map_up = true;
- key.count = 1;
+ key.count = count;
key.id = id;
return bsearch(&key, map->reverse, extents,
sizeof(struct uid_gid_extent), cmp_map_id);
}
-u32 map_id_up(struct uid_gid_map *map, u32 id)
+u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count)
{
struct uid_gid_extent *extent;
unsigned extents = map->nr_extents;
smp_rmb();
if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
- extent = map_id_up_base(extents, map, id);
+ extent = map_id_range_up_base(extents, map, id, count);
else
- extent = map_id_up_max(extents, map, id);
+ extent = map_id_range_up_max(extents, map, id, count);
/* Map the id or note failure */
if (extent)
@@ -395,6 +398,11 @@ u32 map_id_up(struct uid_gid_map *map, u32 id)
return id;
}
+u32 map_id_up(struct uid_gid_map *map, u32 id)
+{
+ return map_id_range_up(map, id, 1);
+}
+
/**
* make_kuid - Map a user-namespace uid pair into a kuid.
* @ns: User namespace that the uid is in
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 5267adeaa403..7e45559521af 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -101,12 +101,11 @@ static bool post_one_notification(struct watch_queue *wqueue,
struct pipe_inode_info *pipe = wqueue->pipe;
struct pipe_buffer *buf;
struct page *page;
- unsigned int head, tail, mask, note, offset, len;
+ unsigned int head, tail, note, offset, len;
bool done = false;
spin_lock_irq(&pipe->rd_wait.lock);
- mask = pipe->ring_size - 1;
head = pipe->head;
tail = pipe->tail;
if (pipe_full(head, tail, pipe->ring_size))
@@ -124,7 +123,7 @@ static bool post_one_notification(struct watch_queue *wqueue,
memcpy(p + offset, n, len);
kunmap_atomic(p);
- buf = &pipe->bufs[head & mask];
+ buf = pipe_buf(pipe, head);
buf->page = page;
buf->private = (unsigned long)wqueue;
buf->ops = &watch_queue_pipe_buf_ops;
@@ -147,7 +146,7 @@ out:
return done;
lost:
- buf = &pipe->bufs[(head - 1) & mask];
+ buf = pipe_buf(pipe, head - 1);
buf->flags |= PIPE_BUF_FLAG_LOSS;
goto out;
}
@@ -269,6 +268,15 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
if (ret < 0)
goto error;
+ /*
+ * pipe_resize_ring() does not update nr_accounted for watch_queue
+ * pipes, because the above vastly overprovisions. Set nr_accounted on
+ * and max_usage this pipe to the number that was actually charged to
+ * the user above via account_pipe_buffers.
+ */
+ pipe->max_usage = nr_pages;
+ pipe->nr_accounted = nr_pages;
+
ret = -ENOMEM;
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index b2da7de39d06..9fa2af9dbf2c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -347,8 +347,6 @@ static int __init watchdog_thresh_setup(char *str)
}
__setup("watchdog_thresh=", watchdog_thresh_setup);
-static void __lockup_detector_cleanup(void);
-
#ifdef CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM
enum stats_per_group {
STATS_SYSTEM,
@@ -797,8 +795,7 @@ static void watchdog_enable(unsigned int cpu)
* Start the timer first to prevent the hardlockup watchdog triggering
* before the timer has a chance to fire.
*/
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- hrtimer->function = watchdog_timer_fn;
+ hrtimer_setup(hrtimer, watchdog_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hrtimer_start(hrtimer, ns_to_ktime(sample_period),
HRTIMER_MODE_REL_PINNED_HARD);
@@ -886,11 +883,6 @@ static void __lockup_detector_reconfigure(void)
watchdog_hardlockup_start();
cpus_read_unlock();
- /*
- * Must be called outside the cpus locked section to prevent
- * recursive locking in the perf code.
- */
- __lockup_detector_cleanup();
}
void lockup_detector_reconfigure(void)
@@ -940,24 +932,6 @@ static inline void lockup_detector_setup(void)
}
#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */
-static void __lockup_detector_cleanup(void)
-{
- lockdep_assert_held(&watchdog_mutex);
- hardlockup_detector_perf_cleanup();
-}
-
-/**
- * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes
- *
- * Caller must not hold the cpu hotplug rwsem.
- */
-void lockup_detector_cleanup(void)
-{
- mutex_lock(&watchdog_mutex);
- __lockup_detector_cleanup();
- mutex_unlock(&watchdog_mutex);
-}
-
/**
* lockup_detector_soft_poweroff - Interface to stop lockup detector(s)
*
diff --git a/kernel/watchdog_perf.c b/kernel/watchdog_perf.c
index 59c1d86a73a2..a78ff092d636 100644
--- a/kernel/watchdog_perf.c
+++ b/kernel/watchdog_perf.c
@@ -21,8 +21,6 @@
#include <linux/perf_event.h>
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-static DEFINE_PER_CPU(struct perf_event *, dead_event);
-static struct cpumask dead_events_mask;
static atomic_t watchdog_cpus = ATOMIC_INIT(0);
@@ -146,6 +144,7 @@ static int hardlockup_detector_event_create(void)
PTR_ERR(evt));
return PTR_ERR(evt);
}
+ WARN_ONCE(this_cpu_read(watchdog_ev), "unexpected watchdog_ev leak");
this_cpu_write(watchdog_ev, evt);
return 0;
}
@@ -181,37 +180,13 @@ void watchdog_hardlockup_disable(unsigned int cpu)
if (event) {
perf_event_disable(event);
+ perf_event_release_kernel(event);
this_cpu_write(watchdog_ev, NULL);
- this_cpu_write(dead_event, event);
- cpumask_set_cpu(smp_processor_id(), &dead_events_mask);
atomic_dec(&watchdog_cpus);
}
}
/**
- * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them
- *
- * Called from lockup_detector_cleanup(). Serialized by the caller.
- */
-void hardlockup_detector_perf_cleanup(void)
-{
- int cpu;
-
- for_each_cpu(cpu, &dead_events_mask) {
- struct perf_event *event = per_cpu(dead_event, cpu);
-
- /*
- * Required because for_each_cpu() reports unconditionally
- * CPU0 as set on UP kernels. Sigh.
- */
- if (event)
- perf_event_release_kernel(event);
- per_cpu(dead_event, cpu) = NULL;
- }
- cpumask_clear(&dead_events_mask);
-}
-
-/**
* hardlockup_detector_perf_stop - Globally stop watchdog events
*
* Special interface for x86 to handle the perf HT bug.