summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/Makefile5
-rw-r--r--kernel/sched/build_policy.c1
-rw-r--r--kernel/sched/build_utility.c4
-rw-r--r--kernel/sched/core.c146
-rw-r--r--kernel/sched/core_sched.c2
-rw-r--r--kernel/sched/deadline.c51
-rw-r--r--kernel/sched/debug.c18
-rw-r--r--kernel/sched/ext.c1087
-rw-r--r--kernel/sched/ext.h10
-rw-r--r--kernel/sched/ext_idle.c1171
-rw-r--r--kernel/sched/ext_idle.h35
-rw-r--r--kernel/sched/fair.c131
-rw-r--r--kernel/sched/rt.c9
-rw-r--r--kernel/sched/sched.h128
-rw-r--r--kernel/sched/stats.h2
-rw-r--r--kernel/sched/syscalls.c12
-rw-r--r--kernel/sched/topology.c45
17 files changed, 1815 insertions, 1042 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 976092b7bd45..8ae86371ddcd 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -22,6 +22,11 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
+# Branch profiling isn't noinstr-safe
+ifdef CONFIG_TRACE_BRANCH_PROFILING
+CFLAGS_build_policy.o += -DDISABLE_BRANCH_PROFILING
+CFLAGS_build_utility.o += -DDISABLE_BRANCH_PROFILING
+endif
#
# Build efficiency:
#
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index fae1f5c921eb..72d97aa8b726 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -61,6 +61,7 @@
#ifdef CONFIG_SCHED_CLASS_EXT
# include "ext.c"
+# include "ext_idle.c"
#endif
#include "syscalls.c"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index 80a3df49ab47..bf9d8db94b70 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -68,9 +68,7 @@
# include "cpufreq_schedutil.c"
#endif
-#ifdef CONFIG_SCHED_DEBUG
-# include "debug.c"
-#endif
+#include "debug.c"
#ifdef CONFIG_SCHEDSTATS
# include "stats.c"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 67189907214d..5bd8d7e7347d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -91,7 +91,6 @@
#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
-#include "stats.h"
#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"
@@ -119,7 +118,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-#ifdef CONFIG_SCHED_DEBUG
/*
* Debugging: various feature bits
*
@@ -129,7 +127,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
*/
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
-const_debug unsigned int sysctl_sched_features =
+__read_mostly unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
@@ -143,13 +141,12 @@ const_debug unsigned int sysctl_sched_features =
*/
__read_mostly int sysctl_resched_latency_warn_ms = 100;
__read_mostly int sysctl_resched_latency_warn_once = 1;
-#endif /* CONFIG_SCHED_DEBUG */
/*
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
*/
-const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
+__read_mostly unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;
__read_mostly int scheduler_running;
@@ -800,11 +797,10 @@ void update_rq_clock(struct rq *rq)
if (rq->clock_update_flags & RQCF_ACT_SKIP)
return;
-#ifdef CONFIG_SCHED_DEBUG
if (sched_feat(WARN_DOUBLE_CLOCK))
- SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
+ WARN_ON_ONCE(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
-#endif
+
clock = sched_clock_cpu(cpu_of(rq));
scx_rq_clock_update(rq, clock);
@@ -1720,7 +1716,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
bucket = &uc_rq->bucket[uc_se->bucket_id];
- SCHED_WARN_ON(!bucket->tasks);
+ WARN_ON_ONCE(!bucket->tasks);
if (likely(bucket->tasks))
bucket->tasks--;
@@ -1740,7 +1736,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
* Defensive programming: this should never happen. If it happens,
* e.g. due to future modification, warn and fix up the expected value.
*/
- SCHED_WARN_ON(bucket->value > rq_clamp);
+ WARN_ON_ONCE(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
uclamp_rq_set(rq, clamp_id, bkt_clamp);
@@ -1757,7 +1753,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
* The condition is constructed such that a NOP is generated when
* sched_uclamp_used is disabled.
*/
- if (!static_branch_unlikely(&sched_uclamp_used))
+ if (!uclamp_is_used())
return;
if (unlikely(!p->sched_class->uclamp_enabled))
@@ -1784,7 +1780,7 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
* The condition is constructed such that a NOP is generated when
* sched_uclamp_used is disabled.
*/
- if (!static_branch_unlikely(&sched_uclamp_used))
+ if (!uclamp_is_used())
return;
if (unlikely(!p->sched_class->uclamp_enabled))
@@ -1942,12 +1938,12 @@ static int sysctl_sched_uclamp_handler(const struct ctl_table *table, int write,
}
if (update_root_tg) {
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
uclamp_update_root_tg();
}
if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
uclamp_sync_util_min_rt_default();
}
@@ -2122,7 +2118,7 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
- SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
+ WARN_ON_ONCE(flags & DEQUEUE_SLEEP);
WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
ASSERT_EXCLUSIVE_WRITER(p->on_rq);
@@ -2727,7 +2723,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
* XXX do further audits, this smells like something putrid.
*/
if (ctx->flags & SCA_MIGRATE_DISABLE)
- SCHED_WARN_ON(!p->on_cpu);
+ WARN_ON_ONCE(!p->on_cpu);
else
lockdep_assert_held(&p->pi_lock);
@@ -3292,7 +3288,6 @@ void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
-#ifdef CONFIG_SCHED_DEBUG
unsigned int state = READ_ONCE(p->__state);
/*
@@ -3330,7 +3325,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
WARN_ON_ONCE(!cpu_online(new_cpu));
WARN_ON_ONCE(is_migration_disabled(p));
-#endif
trace_sched_migrate_task(p, new_cpu);
@@ -3922,13 +3916,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
- /*
- * The BPF scheduler may depend on select_task_rq() being invoked during
- * wakeups. In addition, @p may end up executing on a different CPU
- * regardless of what happens in the wakeup path making the ttwu_queue
- * optimization less meaningful. Skip if on SCX.
- */
- if (task_on_scx(p))
+ /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
+ if (!scx_allow_ttwu_queue(p))
return false;
/*
@@ -4196,7 +4185,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
- SCHED_WARN_ON(p->se.sched_delayed);
+ WARN_ON_ONCE(p->se.sched_delayed);
if (!ttwu_state_match(p, state, &success))
goto out;
@@ -4490,7 +4479,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
INIT_LIST_HEAD(&p->se.group_node);
/* A delayed task cannot be in clone(). */
- SCHED_WARN_ON(p->se.sched_delayed);
+ WARN_ON_ONCE(p->se.sched_delayed);
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
@@ -5578,7 +5567,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
-#ifdef CONFIG_SCHED_DEBUG
static u64 cpu_resched_latency(struct rq *rq)
{
int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
@@ -5623,9 +5611,6 @@ static int __init setup_resched_latency_warn_ms(char *str)
return 1;
}
__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
-#else
-static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
-#endif /* CONFIG_SCHED_DEBUG */
/*
* This function gets called by the timer code, with HZ frequency.
@@ -5746,7 +5731,7 @@ static void sched_tick_remote(struct work_struct *work)
* we are always sure that there is no proxy (only a
* single task is running).
*/
- SCHED_WARN_ON(rq->curr != rq->donor);
+ WARN_ON_ONCE(rq->curr != rq->donor);
update_rq_clock(rq);
if (!is_idle_task(curr)) {
@@ -5966,7 +5951,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
- SCHED_WARN_ON(ct_state() == CT_STATE_USER);
+ WARN_ON_ONCE(ct_state() == CT_STATE_USER);
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -6719,9 +6704,7 @@ static void __sched notrace __schedule(int sched_mode)
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
-#ifdef CONFIG_SCHED_DEBUG
rq->last_seen_need_resched_ns = 0;
-#endif
if (likely(prev != next)) {
rq->nr_switches++;
@@ -6812,7 +6795,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
* deadlock if the callback attempts to acquire a lock which is
* already acquired.
*/
- SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
+ WARN_ON_ONCE(current->__state & TASK_RTLOCK_WAIT);
/*
* If we are going to sleep and we have plugged IO queued,
@@ -7095,7 +7078,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
void *key)
{
- WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
+ WARN_ON_ONCE(wake_flags & ~(WF_SYNC|WF_CURRENT_CPU));
return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);
@@ -7290,7 +7273,7 @@ int __sched __cond_resched(void)
return 1;
}
/*
- * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
+ * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick
* whether the current CPU is in an RCU read-side critical section,
* so the tick can report quiescent states even for CPUs looping
* in kernel context. In contrast, in non-preemptible kernels,
@@ -7299,6 +7282,8 @@ int __sched __cond_resched(void)
* RCU quiescent state. Therefore, the following code causes
* cond_resched() to report a quiescent state, but only when RCU
* is in urgent need of one.
+ * A third case, preemptible, but non-PREEMPT_RCU provides for
+ * urgently needed quiescent states via rcu_flavor_sched_clock_irq().
*/
#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
@@ -7647,10 +7632,57 @@ PREEMPT_MODEL_ACCESSOR(lazy);
#else /* !CONFIG_PREEMPT_DYNAMIC: */
+#define preempt_dynamic_mode -1
+
static inline void preempt_dynamic_init(void) { }
#endif /* CONFIG_PREEMPT_DYNAMIC */
+const char *preempt_modes[] = {
+ "none", "voluntary", "full", "lazy", NULL,
+};
+
+const char *preempt_model_str(void)
+{
+ bool brace = IS_ENABLED(CONFIG_PREEMPT_RT) &&
+ (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC) ||
+ IS_ENABLED(CONFIG_PREEMPT_LAZY));
+ static char buf[128];
+
+ if (IS_ENABLED(CONFIG_PREEMPT_BUILD)) {
+ struct seq_buf s;
+
+ seq_buf_init(&s, buf, sizeof(buf));
+ seq_buf_puts(&s, "PREEMPT");
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ seq_buf_printf(&s, "%sRT%s",
+ brace ? "_{" : "_",
+ brace ? "," : "");
+
+ if (IS_ENABLED(CONFIG_PREEMPT_DYNAMIC)) {
+ seq_buf_printf(&s, "(%s)%s",
+ preempt_dynamic_mode > 0 ?
+ preempt_modes[preempt_dynamic_mode] : "undef",
+ brace ? "}" : "");
+ return seq_buf_str(&s);
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+ seq_buf_printf(&s, "LAZY%s",
+ brace ? "}" : "");
+ return seq_buf_str(&s);
+ }
+
+ return seq_buf_str(&s);
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BUILD))
+ return "VOLUNTARY";
+
+ return "NONE";
+}
+
int io_schedule_prepare(void)
{
int old_iowait = current->in_iowait;
@@ -7765,10 +7797,9 @@ void show_state_filter(unsigned int state_filter)
sched_show_task(p);
}
-#ifdef CONFIG_SCHED_DEBUG
if (!state_filter)
sysrq_sched_debug_show();
-#endif
+
rcu_read_unlock();
/*
* Only show locks if all tasks are dumped:
@@ -8183,7 +8214,7 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- partition_sched_domains(1, NULL, NULL);
+ cpuset_reset_sched_domains();
if (--num_cpus_frozen)
return;
/*
@@ -8202,7 +8233,7 @@ static void cpuset_cpu_inactive(unsigned int cpu)
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
- partition_sched_domains(1, NULL, NULL);
+ cpuset_reset_sched_domains();
}
}
@@ -8424,9 +8455,9 @@ void __init sched_init_smp(void)
* CPU masks are stable and all blatant races in the below code cannot
* happen.
*/
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
sched_init_domains(cpu_active_mask);
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_DOMAIN)) < 0)
@@ -9016,7 +9047,7 @@ void sched_release_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static struct task_group *sched_get_task_group(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -9028,13 +9059,7 @@ static struct task_group *sched_get_task_group(struct task_struct *tsk)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
-
- return tg;
-}
-
-static void sched_change_group(struct task_struct *tsk, struct task_group *group)
-{
- tsk->sched_task_group = group;
+ tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
@@ -9055,20 +9080,11 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
- struct task_group *group;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
rq = rq_guard.rq;
- /*
- * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
- * group changes.
- */
- group = sched_get_task_group(tsk);
- if (group == tsk->sched_task_group)
- return;
-
update_rq_clock(rq);
running = task_current_donor(rq, tsk);
@@ -9079,7 +9095,7 @@ void sched_move_task(struct task_struct *tsk, bool for_autogroup)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk, group);
+ sched_change_group(tsk);
if (!for_autogroup)
scx_cgroup_move_task(tsk);
@@ -9203,7 +9219,7 @@ static void cpu_util_update_eff(struct cgroup_subsys_state *css)
unsigned int clamps;
lockdep_assert_held(&uclamp_mutex);
- SCHED_WARN_ON(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held());
css_for_each_descendant_pre(css, top_css) {
uc_parent = css_tg(css)->parent
@@ -9295,7 +9311,7 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
if (req.ret)
return req.ret;
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
guard(mutex)(&uclamp_mutex);
guard(rcu)();
@@ -10538,7 +10554,7 @@ static void task_mm_cid_work(struct callback_head *work)
struct mm_struct *mm;
int weight, cpu;
- SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
+ WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
work->next = work; /* Prevent double-add */
if (t->flags & PF_EXITING)
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 1ef98a93eb1d..c4606ca89210 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -65,7 +65,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p,
* a cookie until after we've removed it, we must have core scheduling
* enabled here.
*/
- SCHED_WARN_ON((p->core_cookie || cookie) && !sched_core_enabled(rq));
+ WARN_ON_ONCE((p->core_cookie || cookie) && !sched_core_enabled(rq));
if (sched_core_enqueued(p))
sched_core_dequeue(rq, p, DEQUEUE_SAVE);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index ff4df16b5186..03a33b597768 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -166,14 +166,14 @@ static inline unsigned long dl_bw_capacity(int i)
}
}
-static inline bool dl_bw_visited(int cpu, u64 gen)
+bool dl_bw_visited(int cpu, u64 cookie)
{
struct root_domain *rd = cpu_rq(cpu)->rd;
- if (rd->visit_gen == gen)
+ if (rd->visit_cookie == cookie)
return true;
- rd->visit_gen = gen;
+ rd->visit_cookie = cookie;
return false;
}
@@ -207,7 +207,7 @@ static inline unsigned long dl_bw_capacity(int i)
return SCHED_CAPACITY_SCALE;
}
-static inline bool dl_bw_visited(int cpu, u64 gen)
+bool dl_bw_visited(int cpu, u64 cookie)
{
return false;
}
@@ -249,8 +249,8 @@ void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw += dl_bw;
- SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
- SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+ WARN_ON_ONCE(dl_rq->running_bw < old); /* overflow */
+ WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw);
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
cpufreq_update_util(rq_of_dl_rq(dl_rq), 0);
}
@@ -262,7 +262,7 @@ void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->running_bw -= dl_bw;
- SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
+ WARN_ON_ONCE(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
dl_rq->running_bw = 0;
/* kick cpufreq (see the comment in kernel/sched/sched.h). */
@@ -276,7 +276,7 @@ void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw += dl_bw;
- SCHED_WARN_ON(dl_rq->this_bw < old); /* overflow */
+ WARN_ON_ONCE(dl_rq->this_bw < old); /* overflow */
}
static inline
@@ -286,10 +286,10 @@ void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
lockdep_assert_rq_held(rq_of_dl_rq(dl_rq));
dl_rq->this_bw -= dl_bw;
- SCHED_WARN_ON(dl_rq->this_bw > old); /* underflow */
+ WARN_ON_ONCE(dl_rq->this_bw > old); /* underflow */
if (dl_rq->this_bw > old)
dl_rq->this_bw = 0;
- SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+ WARN_ON_ONCE(dl_rq->running_bw > dl_rq->this_bw);
}
static inline
@@ -2956,7 +2956,7 @@ void dl_add_task_root_domain(struct task_struct *p)
struct dl_bw *dl_b;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
- if (!dl_task(p)) {
+ if (!dl_task(p) || dl_entity_is_special(&p->dl)) {
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return;
}
@@ -2981,18 +2981,22 @@ void dl_clear_root_domain(struct root_domain *rd)
rd->dl_bw.total_bw = 0;
/*
- * dl_server bandwidth is only restored when CPUs are attached to root
- * domains (after domains are created or CPUs moved back to the
- * default root doamin).
+ * dl_servers are not tasks. Since dl_add_task_root_domain ignores
+ * them, we need to account for them here explicitly.
*/
for_each_cpu(i, rd->span) {
struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
if (dl_server(dl_se) && cpu_active(i))
- rd->dl_bw.total_bw += dl_se->dl_bw;
+ __dl_add(&rd->dl_bw, dl_se->dl_bw, dl_bw_cpus(i));
}
}
+void dl_clear_root_domain_cpu(int cpu)
+{
+ dl_clear_root_domain(cpu_rq(cpu)->rd);
+}
+
#endif /* CONFIG_SMP */
static void switched_from_dl(struct rq *rq, struct task_struct *p)
@@ -3171,15 +3175,18 @@ DEFINE_SCHED_CLASS(dl) = {
#endif
};
-/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
-static u64 dl_generation;
+/*
+ * Used for dl_bw check and update, used under sched_rt_handler()::mutex and
+ * sched_domains_mutex.
+ */
+u64 dl_cookie;
int sched_dl_global_validate(void)
{
u64 runtime = global_rt_runtime();
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
- u64 gen = ++dl_generation;
+ u64 cookie = ++dl_cookie;
struct dl_bw *dl_b;
int cpu, cpus, ret = 0;
unsigned long flags;
@@ -3192,7 +3199,7 @@ int sched_dl_global_validate(void)
for_each_online_cpu(cpu) {
rcu_read_lock_sched();
- if (dl_bw_visited(cpu, gen))
+ if (dl_bw_visited(cpu, cookie))
goto next;
dl_b = dl_bw_of(cpu);
@@ -3229,7 +3236,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
void sched_dl_do_global(void)
{
u64 new_bw = -1;
- u64 gen = ++dl_generation;
+ u64 cookie = ++dl_cookie;
struct dl_bw *dl_b;
int cpu;
unsigned long flags;
@@ -3240,7 +3247,7 @@ void sched_dl_do_global(void)
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
- if (dl_bw_visited(cpu, gen)) {
+ if (dl_bw_visited(cpu, cookie)) {
rcu_read_unlock_sched();
continue;
}
@@ -3567,9 +3574,7 @@ void dl_bw_free(int cpu, u64 dl_bw)
}
#endif
-#ifdef CONFIG_SCHED_DEBUG
void print_dl_stats(struct seq_file *m, int cpu)
{
print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
}
-#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ef047add7f9e..56ae54e0ce6a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -244,11 +244,13 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
static int sched_dynamic_show(struct seq_file *m, void *v)
{
- static const char * preempt_modes[] = {
- "none", "voluntary", "full", "lazy",
- };
- int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
+ int j;
+
+ /* Count entries in NULL terminated preempt_modes */
+ for (j = 0; preempt_modes[j]; j++)
+ ;
+ j -= !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
for (; i < j; i++) {
if (preempt_dynamic_mode == i)
@@ -292,7 +294,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
bool orig;
cpus_read_lock();
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
orig = sched_debug_verbose;
result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
@@ -304,7 +306,7 @@ static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
sd_dentry = NULL;
}
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
cpus_read_unlock();
return result;
@@ -515,9 +517,9 @@ static __init int sched_init_debug(void)
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
update_sched_domain_debugfs();
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
#endif
#ifdef CONFIG_NUMA_BALANCING
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7b9dfee858e7..21575d39c376 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6,6 +6,9 @@
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
+#include <linux/btf_ids.h>
+#include "ext_idle.h"
+
#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
enum scx_consts {
@@ -93,7 +96,7 @@ enum scx_ops_flags {
/*
* Keep built-in idle tracking even if ops.update_idle() is implemented.
*/
- SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
/*
* By default, if there are no other task to run on the CPU, ext core
@@ -101,7 +104,7 @@ enum scx_ops_flags {
* flag is specified, such tasks are passed to ops.enqueue() with
* %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
*/
- SCX_OPS_ENQ_LAST = 1LLU << 1,
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
/*
* An exiting task may schedule after PF_EXITING is set. In such cases,
@@ -114,13 +117,13 @@ enum scx_ops_flags {
* depend on pid lookups and wants to handle these tasks directly, the
* following flag can be used.
*/
- SCX_OPS_ENQ_EXITING = 1LLU << 2,
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
/*
* If set, only tasks with policy set to SCHED_EXT are attached to
* sched_ext. If clear, SCHED_NORMAL tasks are also included.
*/
- SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
/*
* A migration disabled task can only execute on its current CPU. By
@@ -133,7 +136,29 @@ enum scx_ops_flags {
* current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
* and thus may disagree with cpumask_weight(p->cpus_ptr).
*/
- SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
+ * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+ * ops.enqueue() on the ops.select_cpu() selected or the wakee's
+ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+ * transfers. When this optimization is enabled, ops.select_cpu() is
+ * skipped in some cases (when racing against the wakee switching out).
+ * As the BPF scheduler may depend on ops.select_cpu() being invoked
+ * during wakeups, queued wakeup is disabled by default.
+ *
+ * If this ops flag is set, queued wakeup optimization is enabled and
+ * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+ * wakee's CPU without preceding ops.select_cpu() even for tasks which
+ * may be executed on multiple CPUs.
+ */
+ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
+
+ /*
+ * If set, enable per-node idle cpumasks. If clear, use a single global
+ * flat idle cpumask.
+ */
+ SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
/*
* CPU cgroup support flags
@@ -144,7 +169,9 @@ enum scx_ops_flags {
SCX_OPS_ENQ_LAST |
SCX_OPS_ENQ_EXITING |
SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
SCX_OPS_HAS_CGROUP_WEIGHT,
};
@@ -779,6 +806,7 @@ enum scx_deq_flags {
enum scx_pick_idle_cpu_flags {
SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+ SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
};
enum scx_kick_flags {
@@ -894,16 +922,11 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
static struct sched_ext_ops scx_ops;
static bool scx_warned_zero_slice;
+DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
-static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
-
-#ifdef CONFIG_SMP
-static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
-static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
-#endif
static struct static_key_false scx_has_op[SCX_OPI_END] =
{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
@@ -938,21 +961,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
static struct delayed_work scx_watchdog_work;
-/* idle tracking */
-#ifdef CONFIG_SMP
-#ifdef CONFIG_CPUMASK_OFFSTACK
-#define CL_ALIGNED_IF_ONSTACK
-#else
-#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
-#endif
-
-static struct {
- cpumask_var_t cpu;
- cpumask_var_t smt;
-} idle_masks CL_ALIGNED_IF_ONSTACK;
-
-#endif /* CONFIG_SMP */
-
/* for %SCX_KICK_WAIT */
static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
@@ -1473,6 +1481,117 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return p;
}
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * The total number of tasks enqueued (or pick_task-ed) with a
+ * default time slice (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_ENQ_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+/*
+ * The event counter is organized by a per-CPU variable to minimize the
+ * accounting overhead without synchronization. A system-wide view on the
+ * event counter is constructed when requested by scx_bpf_get_event_stat().
+ */
+static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
+
+/**
+ * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of the event occured
+ *
+ * This can be used when preemption is not disabled.
+ */
+#define scx_add_event(name, cnt) do { \
+ this_cpu_add(event_stats_cpu.name, cnt); \
+ trace_sched_ext_event(#name, cnt); \
+} while(0)
+
+/**
+ * __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of the event occured
+ *
+ * This should be used only when preemption is disabled.
+ */
+#define __scx_add_event(name, cnt) do { \
+ __this_cpu_add(event_stats_cpu.name, cnt); \
+ trace_sched_ext_event(#name, cnt); \
+} while(0)
+
+/**
+ * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
+ * @dst_e: destination event stats
+ * @src_e: source event stats
+ * @kind: a kind of event to be aggregated
+ */
+#define scx_agg_event(dst_e, src_e, kind) do { \
+ (dst_e)->kind += READ_ONCE((src_e)->kind); \
+} while(0)
+
+/**
+ * scx_dump_event - Dump an event 'kind' in 'events' to 's'
+ * @s: output seq_buf
+ * @events: event stats
+ * @kind: a kind of event to dump
+ */
+#define scx_dump_event(s, events, kind) do { \
+ dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \
+} while (0)
+
+
+static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
+
static enum scx_ops_enable_state scx_ops_enable_state(void)
{
return atomic_read(&scx_ops_enable_state_var);
@@ -2018,21 +2137,27 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
if (!scx_rq_online(rq))
goto local;
- if (scx_rq_bypassing(rq))
+ if (scx_rq_bypassing(rq)) {
+ __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
goto global;
+ }
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
goto direct;
/* see %SCX_OPS_ENQ_EXITING */
if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
- unlikely(p->flags & PF_EXITING))
+ unlikely(p->flags & PF_EXITING)) {
+ __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
goto local;
+ }
/* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
- is_migration_disabled(p))
+ is_migration_disabled(p)) {
+ __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
goto local;
+ }
if (!SCX_HAS_OP(enqueue))
goto global;
@@ -2072,6 +2197,7 @@ local:
*/
touch_core_sched(rq, p);
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
local_norefill:
dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
return;
@@ -2079,6 +2205,7 @@ local_norefill:
global:
touch_core_sched(rq, p); /* see the comment in local: */
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
dispatch_enqueue(find_global_dsq(p), p, enq_flags);
}
@@ -2150,6 +2277,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
+
+ if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
+ unlikely(cpu_of(rq) != p->scx.selected_cpu))
+ __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
}
static void ops_dequeue(struct task_struct *p, u64 deq_flags)
@@ -2337,11 +2468,11 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
* The caller must ensure that @p and @rq are on different CPUs.
*/
static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
- bool trigger_error)
+ bool enforce)
{
int cpu = cpu_of(rq);
- SCHED_WARN_ON(task_cpu(p) == cpu);
+ WARN_ON_ONCE(task_cpu(p) == cpu);
/*
* If @p has migration disabled, @p->cpus_ptr is updated to contain only
@@ -2356,7 +2487,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
* easily be masked if task_allowed_on_cpu() is done first.
*/
if (unlikely(is_migration_disabled(p))) {
- if (trigger_error)
+ if (enforce)
scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
p->comm, p->pid, task_cpu(p), cpu);
return false;
@@ -2369,14 +2500,17 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
* picked CPU is outside the allowed mask.
*/
if (!task_allowed_on_cpu(p, cpu)) {
- if (trigger_error)
+ if (enforce)
scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
cpu, p->comm, p->pid);
return false;
}
- if (!scx_rq_online(rq))
+ if (!scx_rq_online(rq)) {
+ if (enforce)
+ __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
+ }
return true;
}
@@ -2446,7 +2580,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
}
#else /* CONFIG_SMP */
static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
-static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; }
+static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; }
static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
#endif /* CONFIG_SMP */
@@ -2893,6 +3027,7 @@ no_tasks:
if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
goto has_tasks;
}
rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3159,8 +3294,10 @@ static struct task_struct *pick_task_scx(struct rq *rq)
*/
if (keep_prev) {
p = prev;
- if (!p->scx.slice)
+ if (!p->scx.slice) {
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ }
} else {
p = first_local_task(rq);
if (!p) {
@@ -3176,6 +3313,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
scx_warned_zero_slice = true;
}
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
}
}
@@ -3220,418 +3358,10 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
#ifdef CONFIG_SMP
-static bool test_and_clear_cpu_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- /*
- * SMT mask should be cleared whether we can claim @cpu or not. The SMT
- * cluster is not wholly idle either way. This also prevents
- * scx_pick_idle_cpu() from getting caught in an infinite loop.
- */
- if (sched_smt_active()) {
- const struct cpumask *smt = cpu_smt_mask(cpu);
-
- /*
- * If offline, @cpu is not its own sibling and
- * scx_pick_idle_cpu() can get caught in an infinite loop as
- * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
- * is eventually cleared.
- *
- * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
- * reduce memory writes, which may help alleviate cache
- * coherence pressure.
- */
- if (cpumask_intersects(smt, idle_masks.smt))
- cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
- else if (cpumask_test_cpu(cpu, idle_masks.smt))
- __cpumask_clear_cpu(cpu, idle_masks.smt);
- }
-#endif
- return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
-}
-
-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
-{
- int cpu;
-
-retry:
- if (sched_smt_active()) {
- cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
- if (cpu < nr_cpu_ids)
- goto found;
-
- if (flags & SCX_PICK_IDLE_CORE)
- return -EBUSY;
- }
-
- cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
- if (cpu >= nr_cpu_ids)
- return -EBUSY;
-
-found:
- if (test_and_clear_cpu_idle(cpu))
- return cpu;
- else
- goto retry;
-}
-
-/*
- * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
- * domain is not defined).
- */
-static unsigned int llc_weight(s32 cpu)
-{
- struct sched_domain *sd;
-
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd)
- return 0;
-
- return sd->span_weight;
-}
-
-/*
- * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
- * domain is not defined).
- */
-static struct cpumask *llc_span(s32 cpu)
-{
- struct sched_domain *sd;
-
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd)
- return 0;
-
- return sched_domain_span(sd);
-}
-
-/*
- * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
- * NUMA domain is not defined).
- */
-static unsigned int numa_weight(s32 cpu)
-{
- struct sched_domain *sd;
- struct sched_group *sg;
-
- sd = rcu_dereference(per_cpu(sd_numa, cpu));
- if (!sd)
- return 0;
- sg = sd->groups;
- if (!sg)
- return 0;
-
- return sg->group_weight;
-}
-
-/*
- * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
- * domain is not defined).
- */
-static struct cpumask *numa_span(s32 cpu)
-{
- struct sched_domain *sd;
- struct sched_group *sg;
-
- sd = rcu_dereference(per_cpu(sd_numa, cpu));
- if (!sd)
- return NULL;
- sg = sd->groups;
- if (!sg)
- return NULL;
-
- return sched_group_span(sg);
-}
-
-/*
- * Return true if the LLC domains do not perfectly overlap with the NUMA
- * domains, false otherwise.
- */
-static bool llc_numa_mismatch(void)
-{
- int cpu;
-
- /*
- * We need to scan all online CPUs to verify whether their scheduling
- * domains overlap.
- *
- * While it is rare to encounter architectures with asymmetric NUMA
- * topologies, CPU hotplugging or virtualized environments can result
- * in asymmetric configurations.
- *
- * For example:
- *
- * NUMA 0:
- * - LLC 0: cpu0..cpu7
- * - LLC 1: cpu8..cpu15 [offline]
- *
- * NUMA 1:
- * - LLC 0: cpu16..cpu23
- * - LLC 1: cpu24..cpu31
- *
- * In this case, if we only check the first online CPU (cpu0), we might
- * incorrectly assume that the LLC and NUMA domains are fully
- * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
- * domains).
- */
- for_each_online_cpu(cpu)
- if (llc_weight(cpu) != numa_weight(cpu))
- return true;
-
- return false;
-}
-
-/*
- * Initialize topology-aware scheduling.
- *
- * Detect if the system has multiple LLC or multiple NUMA domains and enable
- * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
- * selection policy.
- *
- * Assumption: the kernel's internal topology representation assumes that each
- * CPU belongs to a single LLC domain, and that each LLC domain is entirely
- * contained within a single NUMA node.
- */
-static void update_selcpu_topology(void)
-{
- bool enable_llc = false, enable_numa = false;
- unsigned int nr_cpus;
- s32 cpu = cpumask_first(cpu_online_mask);
-
- /*
- * Enable LLC domain optimization only when there are multiple LLC
- * domains among the online CPUs. If all online CPUs are part of a
- * single LLC domain, the idle CPU selection logic can choose any
- * online CPU without bias.
- *
- * Note that it is sufficient to check the LLC domain of the first
- * online CPU to determine whether a single LLC domain includes all
- * CPUs.
- */
- rcu_read_lock();
- nr_cpus = llc_weight(cpu);
- if (nr_cpus > 0) {
- if (nr_cpus < num_online_cpus())
- enable_llc = true;
- pr_debug("sched_ext: LLC=%*pb weight=%u\n",
- cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
- }
-
- /*
- * Enable NUMA optimization only when there are multiple NUMA domains
- * among the online CPUs and the NUMA domains don't perfectly overlaps
- * with the LLC domains.
- *
- * If all CPUs belong to the same NUMA node and the same LLC domain,
- * enabling both NUMA and LLC optimizations is unnecessary, as checking
- * for an idle CPU in the same domain twice is redundant.
- */
- nr_cpus = numa_weight(cpu);
- if (nr_cpus > 0) {
- if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
- enable_numa = true;
- pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
- cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
- }
- rcu_read_unlock();
-
- pr_debug("sched_ext: LLC idle selection %s\n",
- str_enabled_disabled(enable_llc));
- pr_debug("sched_ext: NUMA idle selection %s\n",
- str_enabled_disabled(enable_numa));
-
- if (enable_llc)
- static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
- else
- static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
- if (enable_numa)
- static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
- else
- static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
-}
-
-/*
- * Built-in CPU idle selection policy:
- *
- * 1. Prioritize full-idle cores:
- * - always prioritize CPUs from fully idle cores (both logical CPUs are
- * idle) to avoid interference caused by SMT.
- *
- * 2. Reuse the same CPU:
- * - prefer the last used CPU to take advantage of cached data (L1, L2) and
- * branch prediction optimizations.
- *
- * 3. Pick a CPU within the same LLC (Last-Level Cache):
- * - if the above conditions aren't met, pick a CPU that shares the same LLC
- * to maintain cache locality.
- *
- * 4. Pick a CPU within the same NUMA node, if enabled:
- * - choose a CPU from the same NUMA node to reduce memory access latency.
- *
- * 5. Pick any idle CPU usable by the task.
- *
- * Step 3 and 4 are performed only if the system has, respectively, multiple
- * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
- * scx_selcpu_topo_numa).
- *
- * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
- * we never call ops.select_cpu() for them, see select_task_rq().
- */
-static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
- u64 wake_flags, bool *found)
-{
- const struct cpumask *llc_cpus = NULL;
- const struct cpumask *numa_cpus = NULL;
- s32 cpu;
-
- *found = false;
-
- /*
- * This is necessary to protect llc_cpus.
- */
- rcu_read_lock();
-
- /*
- * Determine the scheduling domain only if the task is allowed to run
- * on all CPUs.
- *
- * This is done primarily for efficiency, as it avoids the overhead of
- * updating a cpumask every time we need to select an idle CPU (which
- * can be costly in large SMP systems), but it also aligns logically:
- * if a task's scheduling domain is restricted by user-space (through
- * CPU affinity), the task will simply use the flat scheduling domain
- * defined by user-space.
- */
- if (p->nr_cpus_allowed >= num_possible_cpus()) {
- if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = numa_span(prev_cpu);
-
- if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
- llc_cpus = llc_span(prev_cpu);
- }
-
- /*
- * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
- */
- if (wake_flags & SCX_WAKE_SYNC) {
- cpu = smp_processor_id();
-
- /*
- * If the waker's CPU is cache affine and prev_cpu is idle,
- * then avoid a migration.
- */
- if (cpus_share_cache(cpu, prev_cpu) &&
- test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- }
-
- /*
- * If the waker's local DSQ is empty, and the system is under
- * utilized, try to wake up @p to the local DSQ of the waker.
- *
- * Checking only for an empty local DSQ is insufficient as it
- * could give the wakee an unfair advantage when the system is
- * oversaturated.
- *
- * Checking only for the presence of idle CPUs is also
- * insufficient as the local DSQ of the waker could have tasks
- * piled up on it even if there is an idle core elsewhere on
- * the system.
- */
- if (!cpumask_empty(idle_masks.cpu) &&
- !(current->flags & PF_EXITING) &&
- cpu_rq(cpu)->scx.local_dsq.nr == 0) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
- goto cpu_found;
- }
- }
-
- /*
- * If CPU has SMT, any wholly idle CPU is likely a better pick than
- * partially idle @prev_cpu.
- */
- if (sched_smt_active()) {
- /*
- * Keep using @prev_cpu if it's part of a fully idle core.
- */
- if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
- test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- }
-
- /*
- * Search for any fully idle core in the same LLC domain.
- */
- if (llc_cpus) {
- cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any fully idle core in the same NUMA node.
- */
- if (numa_cpus) {
- cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any full idle core usable by the task.
- */
- cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Use @prev_cpu if it's idle.
- */
- if (test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- }
-
- /*
- * Search for any idle CPU in the same LLC domain.
- */
- if (llc_cpus) {
- cpu = scx_pick_idle_cpu(llc_cpus, 0);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any idle CPU in the same NUMA node.
- */
- if (numa_cpus) {
- cpu = scx_pick_idle_cpu(numa_cpus, 0);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any idle CPU usable by the task.
- */
- cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
- if (cpu >= 0)
- goto cpu_found;
-
- rcu_read_unlock();
- return prev_cpu;
-
-cpu_found:
- rcu_read_unlock();
-
- *found = true;
- return cpu;
-}
-
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
+ bool rq_bypass;
+
/*
* sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
* can be a good migration opportunity with low cache and memory
@@ -3645,7 +3375,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
if (unlikely(wake_flags & WF_EXEC))
return prev_cpu;
- if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
+ rq_bypass = scx_rq_bypassing(task_rq(p));
+ if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
s32 cpu;
struct task_struct **ddsp_taskp;
@@ -3655,20 +3386,27 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
select_cpu, p, prev_cpu, wake_flags);
+ p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
return cpu;
else
return prev_cpu;
} else {
- bool found;
s32 cpu;
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
- if (found) {
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ if (cpu >= 0) {
p->scx.slice = SCX_SLICE_DFL;
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ } else {
+ cpu = prev_cpu;
}
+ p->scx.selected_cpu = cpu;
+
+ if (rq_bypass)
+ __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
return cpu;
}
}
@@ -3696,90 +3434,6 @@ static void set_cpus_allowed_scx(struct task_struct *p,
(struct cpumask *)p->cpus_ptr);
}
-static void reset_idle_masks(void)
-{
- /*
- * Consider all online cpus idle. Should converge to the actual state
- * quickly.
- */
- cpumask_copy(idle_masks.cpu, cpu_online_mask);
- cpumask_copy(idle_masks.smt, cpu_online_mask);
-}
-
-static void update_builtin_idle(int cpu, bool idle)
-{
- assign_cpu(cpu, idle_masks.cpu, idle);
-
-#ifdef CONFIG_SCHED_SMT
- if (sched_smt_active()) {
- const struct cpumask *smt = cpu_smt_mask(cpu);
-
- if (idle) {
- /*
- * idle_masks.smt handling is racy but that's fine as
- * it's only for optimization and self-correcting.
- */
- if (!cpumask_subset(smt, idle_masks.cpu))
- return;
- cpumask_or(idle_masks.smt, idle_masks.smt, smt);
- } else {
- cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
- }
- }
-#endif
-}
-
-/*
- * Update the idle state of a CPU to @idle.
- *
- * If @do_notify is true, ops.update_idle() is invoked to notify the scx
- * scheduler of an actual idle state transition (idle to busy or vice
- * versa). If @do_notify is false, only the idle state in the idle masks is
- * refreshed without invoking ops.update_idle().
- *
- * This distinction is necessary, because an idle CPU can be "reserved" and
- * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
- * busy even if no tasks are dispatched. In this case, the CPU may return
- * to idle without a true state transition. Refreshing the idle masks
- * without invoking ops.update_idle() ensures accurate idle state tracking
- * while avoiding unnecessary updates and maintaining balanced state
- * transitions.
- */
-void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
-{
- int cpu = cpu_of(rq);
-
- lockdep_assert_rq_held(rq);
-
- /*
- * Trigger ops.update_idle() only when transitioning from a task to
- * the idle thread and vice versa.
- *
- * Idle transitions are indicated by do_notify being set to true,
- * managed by put_prev_task_idle()/set_next_task_idle().
- */
- if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
-
- /*
- * Update the idle masks:
- * - for real idle transitions (do_notify == true)
- * - for idle-to-idle transitions (indicated by the previous task
- * being the idle thread, managed by pick_task_idle())
- *
- * Skip updating idle masks if the previous task is not the idle
- * thread, since set_next_task_idle() has already handled it when
- * transitioning from a task to the idle thread (calling this
- * function with do_notify == true).
- *
- * In this way we can avoid updating the idle masks twice,
- * unnecessarily.
- */
- if (static_branch_likely(&scx_builtin_idle_enabled))
- if (do_notify || is_idle_task(rq->curr))
- update_builtin_idle(cpu, idle);
-}
-
static void handle_hotplug(struct rq *rq, bool online)
{
int cpu = cpu_of(rq);
@@ -3787,7 +3441,7 @@ static void handle_hotplug(struct rq *rq, bool online)
atomic_long_inc(&scx_hotplug_seq);
if (scx_enabled())
- update_selcpu_topology();
+ scx_idle_update_selcpu_topology(&scx_ops);
if (online && SCX_HAS_OP(cpu_online))
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
@@ -3819,12 +3473,6 @@ static void rq_offline_scx(struct rq *rq)
rq->scx.flags &= ~SCX_RQ_ONLINE;
}
-#else /* CONFIG_SMP */
-
-static bool test_and_clear_cpu_idle(int cpu) { return false; }
-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; }
-static void reset_idle_masks(void) {}
-
#endif /* CONFIG_SMP */
static bool check_rq_for_timeouts(struct rq *rq)
@@ -4749,8 +4397,33 @@ static ssize_t scx_attr_ops_show(struct kobject *kobj,
}
SCX_ATTR(ops);
+#define scx_attr_event_show(buf, at, events, kind) ({ \
+ sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \
+})
+
+static ssize_t scx_attr_events_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ struct scx_event_stats events;
+ int at = 0;
+
+ scx_bpf_events(&events, sizeof(events));
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
+ return at;
+}
+SCX_ATTR(events);
+
static struct attribute *scx_sched_attrs[] = {
&scx_attr_ops.attr,
+ &scx_attr_events.attr,
NULL,
};
ATTRIBUTE_GROUPS(scx_sched);
@@ -4862,6 +4535,8 @@ static void scx_clear_softlockup(void)
static void scx_ops_bypass(bool bypass)
{
static DEFINE_RAW_SPINLOCK(bypass_lock);
+ static unsigned long bypass_timestamp;
+
int cpu;
unsigned long flags;
@@ -4871,11 +4546,15 @@ static void scx_ops_bypass(bool bypass)
WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
if (scx_ops_bypass_depth != 1)
goto unlock;
+ bypass_timestamp = ktime_get_ns();
+ scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
} else {
scx_ops_bypass_depth--;
WARN_ON_ONCE(scx_ops_bypass_depth < 0);
if (scx_ops_bypass_depth != 0)
goto unlock;
+ scx_add_event(SCX_EV_BYPASS_DURATION,
+ ktime_get_ns() - bypass_timestamp);
}
atomic_inc(&scx_ops_breather_depth);
@@ -5095,11 +4774,12 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
static_branch_disable(&__scx_ops_enabled);
for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
static_branch_disable(&scx_has_op[i]);
+ static_branch_disable(&scx_ops_allow_queued_wakeup);
static_branch_disable(&scx_ops_enq_last);
static_branch_disable(&scx_ops_enq_exiting);
static_branch_disable(&scx_ops_enq_migration_disabled);
static_branch_disable(&scx_ops_cpu_preempt);
- static_branch_disable(&scx_builtin_idle_enabled);
+ scx_idle_disable();
synchronize_rcu();
if (ei->kind >= SCX_EXIT_ERROR) {
@@ -5349,6 +5029,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
.at_jiffies = jiffies,
};
struct seq_buf s;
+ struct scx_event_stats events;
unsigned long flags;
char *buf;
int cpu;
@@ -5457,6 +5138,21 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
rq_unlock(rq, &rf);
}
+ dump_newline(&s);
+ dump_line(&s, "Event counters");
+ dump_line(&s, "--------------");
+
+ scx_bpf_events(&events, sizeof(events));
+ scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
+
if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
memcpy(ei->dump + dump_len - sizeof(trunc_marker),
trunc_marker, sizeof(trunc_marker));
@@ -5546,6 +5242,16 @@ static int validate_ops(const struct sched_ext_ops *ops)
return -EINVAL;
}
+ /*
+ * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
+ * selection policy to be enabled.
+ */
+ if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
+ (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
+ scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -5564,6 +5270,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
mutex_lock(&scx_ops_enable_mutex);
+ /*
+ * Clear event counters so a new scx scheduler gets
+ * fresh event counter values.
+ */
+ for_each_possible_cpu(cpu) {
+ struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
+ memset(e, 0, sizeof(*e));
+ }
+
if (!scx_ops_helper) {
WRITE_ONCE(scx_ops_helper,
scx_create_rt_helper("sched_ext_ops_helper"));
@@ -5661,9 +5376,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable_cpuslocked(&scx_has_op[i]);
check_hotplug_seq(ops);
-#ifdef CONFIG_SMP
- update_selcpu_topology();
-#endif
+ scx_idle_update_selcpu_topology(ops);
+
cpus_read_unlock();
ret = validate_ops(ops);
@@ -5702,9 +5416,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (((void (**)(void))ops)[i])
static_branch_enable(&scx_has_op[i]);
+ if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
+ static_branch_enable(&scx_ops_allow_queued_wakeup);
if (ops->flags & SCX_OPS_ENQ_LAST)
static_branch_enable(&scx_ops_enq_last);
-
if (ops->flags & SCX_OPS_ENQ_EXITING)
static_branch_enable(&scx_ops_enq_exiting);
if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
@@ -5712,12 +5427,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (scx_ops.cpu_acquire || scx_ops.cpu_release)
static_branch_enable(&scx_ops_cpu_preempt);
- if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
- reset_idle_masks();
- static_branch_enable(&scx_builtin_idle_enabled);
- } else {
- static_branch_disable(&scx_builtin_idle_enabled);
- }
+ scx_idle_enable(ops);
/*
* Lock out forks, cgroup on/offlining and moves before opening the
@@ -6356,10 +6066,8 @@ void __init init_sched_ext_class(void)
SCX_TG_ONLINE);
BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
-#ifdef CONFIG_SMP
- BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
- BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
-#endif
+ scx_idle_init_masks();
+
scx_kick_cpus_pnt_seqs =
__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
__alignof__(scx_kick_cpus_pnt_seqs[0]));
@@ -6367,15 +6075,16 @@ void __init init_sched_ext_class(void)
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
+ int n = cpu_to_node(cpu);
init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
INIT_LIST_HEAD(&rq->scx.runnable_list);
INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
@@ -6392,65 +6101,6 @@ void __init init_sched_ext_class(void)
/********************************************************************************
* Helpers that can be called from the BPF scheduler.
*/
-#include <linux/btf_ids.h>
-
-__bpf_kfunc_start_defs();
-
-static bool check_builtin_idle_enabled(void)
-{
- if (static_branch_likely(&scx_builtin_idle_enabled))
- return true;
-
- scx_ops_error("built-in idle tracking is disabled");
- return false;
-}
-
-/**
- * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
- * @p: task_struct to select a CPU for
- * @prev_cpu: CPU @p was on previously
- * @wake_flags: %SCX_WAKE_* flags
- * @is_idle: out parameter indicating whether the returned CPU is idle
- *
- * Can only be called from ops.select_cpu() if the built-in CPU selection is
- * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
- * @p, @prev_cpu and @wake_flags match ops.select_cpu().
- *
- * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
- * currently idle and thus a good candidate for direct dispatching.
- */
-__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
- u64 wake_flags, bool *is_idle)
-{
- if (!ops_cpu_valid(prev_cpu, NULL))
- goto prev_cpu;
-
- if (!check_builtin_idle_enabled())
- goto prev_cpu;
-
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
- goto prev_cpu;
-
-#ifdef CONFIG_SMP
- return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
-#endif
-
-prev_cpu:
- *is_idle = false;
- return prev_cpu;
-}
-
-__bpf_kfunc_end_defs();
-
-BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
-BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
-
-static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
- .owner = THIS_MODULE,
- .set = &scx_kfunc_ids_select_cpu,
-};
-
static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
{
if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
@@ -6972,8 +6622,12 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
* CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
* the current local DSQ for running tasks and thus are not
* visible to the BPF scheduler.
+ *
+ * Also skip re-enqueueing tasks that can only run on this
+ * CPU, as they would just be re-added to the same local
+ * DSQ without any benefit.
*/
- if (p->migration_pending)
+ if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1)
continue;
dispatch_dequeue(rq, p);
@@ -7470,6 +7124,16 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
}
/**
+ * scx_bpf_nr_node_ids - Return the number of possible node IDs
+ *
+ * All valid node IDs in the system are smaller than the returned value.
+ */
+__bpf_kfunc u32 scx_bpf_nr_node_ids(void)
+{
+ return nr_node_ids;
+}
+
+/**
* scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
*
* All valid CPU IDs in the system are smaller than the returned value.
@@ -7510,142 +7174,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
}
/**
- * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
- * per-CPU cpumask.
- *
- * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
- */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
-{
- if (!check_builtin_idle_enabled())
- return cpu_none_mask;
-
-#ifdef CONFIG_SMP
- return idle_masks.cpu;
-#else
- return cpu_none_mask;
-#endif
-}
-
-/**
- * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
- * per-physical-core cpumask. Can be used to determine if an entire physical
- * core is free.
- *
- * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
- */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
-{
- if (!check_builtin_idle_enabled())
- return cpu_none_mask;
-
-#ifdef CONFIG_SMP
- if (sched_smt_active())
- return idle_masks.smt;
- else
- return idle_masks.cpu;
-#else
- return cpu_none_mask;
-#endif
-}
-
-/**
- * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
- * either the percpu, or SMT idle-tracking cpumask.
- * @idle_mask: &cpumask to use
- */
-__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
-{
- /*
- * Empty function body because we aren't actually acquiring or releasing
- * a reference to a global idle cpumask, which is read-only in the
- * caller and is never released. The acquire / release semantics here
- * are just used to make the cpumask a trusted pointer in the caller.
- */
-}
-
-/**
- * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
- * @cpu: cpu to test and clear idle for
- *
- * Returns %true if @cpu was idle and its idle state was successfully cleared.
- * %false otherwise.
- *
- * Unavailable if ops.update_idle() is implemented and
- * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
- */
-__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
-{
- if (!check_builtin_idle_enabled())
- return false;
-
- if (ops_cpu_valid(cpu, NULL))
- return test_and_clear_cpu_idle(cpu);
- else
- return false;
-}
-
-/**
- * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
- * @cpus_allowed: Allowed cpumask
- * @flags: %SCX_PICK_IDLE_CPU_* flags
- *
- * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
- * number on success. -%EBUSY if no matching cpu was found.
- *
- * Idle CPU tracking may race against CPU scheduling state transitions. For
- * example, this function may return -%EBUSY as CPUs are transitioning into the
- * idle state. If the caller then assumes that there will be dispatch events on
- * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
- * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
- * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
- * event in the near future.
- *
- * Unavailable if ops.update_idle() is implemented and
- * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
- */
-__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
- u64 flags)
-{
- if (!check_builtin_idle_enabled())
- return -EBUSY;
-
- return scx_pick_idle_cpu(cpus_allowed, flags);
-}
-
-/**
- * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
- * @cpus_allowed: Allowed cpumask
- * @flags: %SCX_PICK_IDLE_CPU_* flags
- *
- * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
- * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
- * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
- * empty.
- *
- * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
- * set, this function can't tell which CPUs are idle and will always pick any
- * CPU.
- */
-__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
- u64 flags)
-{
- s32 cpu;
-
- if (static_branch_likely(&scx_builtin_idle_enabled)) {
- cpu = scx_pick_idle_cpu(cpus_allowed, flags);
- if (cpu >= 0)
- return cpu;
- }
-
- cpu = cpumask_any_distribute(cpus_allowed);
- if (cpu < nr_cpu_ids)
- return cpu;
- else
- return -EBUSY;
-}
-
-/**
* scx_bpf_task_running - Is task currently running?
* @p: task of interest
*/
@@ -7765,6 +7293,43 @@ __bpf_kfunc u64 scx_bpf_now(void)
return clock;
}
+/*
+ * scx_bpf_events - Get a system-wide event counter to
+ * @events: output buffer from a BPF program
+ * @events__sz: @events len, must end in '__sz'' for the verifier
+ */
+__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
+ size_t events__sz)
+{
+ struct scx_event_stats e_sys, *e_cpu;
+ int cpu;
+
+ /* Aggregate per-CPU event counters into the system-wide counters. */
+ memset(&e_sys, 0, sizeof(e_sys));
+ for_each_possible_cpu(cpu) {
+ e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+ }
+
+ /*
+ * We cannot entirely trust a BPF-provided size since a BPF program
+ * might be compiled against a different vmlinux.h, of which
+ * scx_event_stats would be larger (a newer vmlinux.h) or smaller
+ * (an older vmlinux.h). Hence, we use the smaller size to avoid
+ * memory corruption.
+ */
+ events__sz = min(events__sz, sizeof(*events));
+ memcpy(events, &e_sys, events__sz);
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7780,6 +7345,7 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
+BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
@@ -7797,6 +7363,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
BTF_ID_FLAGS(func, scx_bpf_now)
+BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
@@ -7820,8 +7387,6 @@ static int __init scx_init(void)
* check using scx_kf_allowed().
*/
if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
- &scx_kfunc_set_select_cpu)) ||
- (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_enqueue_dispatch)) ||
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_dispatch)) ||
@@ -7841,6 +7406,12 @@ static int __init scx_init(void)
return ret;
}
+ ret = scx_idle_init();
+ if (ret) {
+ pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret);
+ return ret;
+ }
+
ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
if (ret) {
pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index 1079b56b0f7a..1bda96b19a1b 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -8,6 +8,8 @@
*/
#ifdef CONFIG_SCHED_CLASS_EXT
+DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
+
void scx_tick(struct rq *rq);
void init_scx_entity(struct sched_ext_entity *scx);
void scx_pre_fork(struct task_struct *p);
@@ -34,6 +36,13 @@ static inline bool task_on_scx(const struct task_struct *p)
return scx_enabled() && p->sched_class == &ext_sched_class;
}
+static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
+{
+ return !scx_enabled() ||
+ static_branch_likely(&scx_ops_allow_queued_wakeup) ||
+ p->sched_class != &ext_sched_class;
+}
+
#ifdef CONFIG_SCHED_CORE
bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
bool in_fi);
@@ -52,6 +61,7 @@ static inline void scx_rq_activate(struct rq *rq) {}
static inline void scx_rq_deactivate(struct rq *rq) {}
static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
static inline bool task_on_scx(const struct task_struct *p) { return false; }
+static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; }
static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
new file mode 100644
index 000000000000..52c36a70a3d0
--- /dev/null
+++ b/kernel/sched/ext_idle.c
@@ -0,0 +1,1171 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Built-in idle CPU tracking policy.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
+ */
+#include "ext_idle.h"
+
+/* Enable/disable built-in idle CPU selection policy */
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+
+/* Enable/disable per-node idle cpumasks */
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
+
+#ifdef CONFIG_SMP
+/* Enable/disable LLC aware optimizations */
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
+
+/* Enable/disable NUMA aware optimizations */
+static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
+
+/*
+ * cpumasks to track idle CPUs within each NUMA node.
+ *
+ * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask
+ * from is used to track all the idle CPUs in the system.
+ */
+struct scx_idle_cpus {
+ cpumask_var_t cpu;
+ cpumask_var_t smt;
+};
+
+/*
+ * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE
+ * is not enabled).
+ */
+static struct scx_idle_cpus scx_idle_global_masks;
+
+/*
+ * Per-node idle cpumasks.
+ */
+static struct scx_idle_cpus **scx_idle_node_masks;
+
+/*
+ * Return the idle masks associated to a target @node.
+ *
+ * NUMA_NO_NODE identifies the global idle cpumask.
+ */
+static struct scx_idle_cpus *idle_cpumask(int node)
+{
+ return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node];
+}
+
+/*
+ * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if
+ * per-node idle cpumasks are disabled.
+ */
+static int scx_cpu_node_if_enabled(int cpu)
+{
+ if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
+ return NUMA_NO_NODE;
+
+ return cpu_to_node(cpu);
+}
+
+bool scx_idle_test_and_clear_cpu(int cpu)
+{
+ int node = scx_cpu_node_if_enabled(cpu);
+ struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
+
+#ifdef CONFIG_SCHED_SMT
+ /*
+ * SMT mask should be cleared whether we can claim @cpu or not. The SMT
+ * cluster is not wholly idle either way. This also prevents
+ * scx_pick_idle_cpu() from getting caught in an infinite loop.
+ */
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+ struct cpumask *idle_smts = idle_cpumask(node)->smt;
+
+ /*
+ * If offline, @cpu is not its own sibling and
+ * scx_pick_idle_cpu() can get caught in an infinite loop as
+ * @cpu is never cleared from the idle SMT mask. Ensure that
+ * @cpu is eventually cleared.
+ *
+ * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
+ * reduce memory writes, which may help alleviate cache
+ * coherence pressure.
+ */
+ if (cpumask_intersects(smt, idle_smts))
+ cpumask_andnot(idle_smts, idle_smts, smt);
+ else if (cpumask_test_cpu(cpu, idle_smts))
+ __cpumask_clear_cpu(cpu, idle_smts);
+ }
+#endif
+
+ return cpumask_test_and_clear_cpu(cpu, idle_cpus);
+}
+
+/*
+ * Pick an idle CPU in a specific NUMA node.
+ */
+static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ int cpu;
+
+retry:
+ if (sched_smt_active()) {
+ cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ goto found;
+
+ if (flags & SCX_PICK_IDLE_CORE)
+ return -EBUSY;
+ }
+
+ cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed);
+ if (cpu >= nr_cpu_ids)
+ return -EBUSY;
+
+found:
+ if (scx_idle_test_and_clear_cpu(cpu))
+ return cpu;
+ else
+ goto retry;
+}
+
+/*
+ * Tracks nodes that have not yet been visited when searching for an idle
+ * CPU across all available nodes.
+ */
+static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited);
+
+/*
+ * Search for an idle CPU across all nodes, excluding @node.
+ */
+static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ nodemask_t *unvisited;
+ s32 cpu = -EBUSY;
+
+ preempt_disable();
+ unvisited = this_cpu_ptr(&per_cpu_unvisited);
+
+ /*
+ * Restrict the search to the online nodes (excluding the current
+ * node that has been visited already).
+ */
+ nodes_copy(*unvisited, node_states[N_ONLINE]);
+ node_clear(node, *unvisited);
+
+ /*
+ * Traverse all nodes in order of increasing distance, starting
+ * from @node.
+ *
+ * This loop is O(N^2), with N being the amount of NUMA nodes,
+ * which might be quite expensive in large NUMA systems. However,
+ * this complexity comes into play only when a scheduler enables
+ * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU
+ * without specifying a target NUMA node, so it shouldn't be a
+ * bottleneck is most cases.
+ *
+ * As a future optimization we may want to cache the list of nodes
+ * in a per-node array, instead of actually traversing them every
+ * time.
+ */
+ for_each_node_numadist(node, *unvisited) {
+ cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags);
+ if (cpu >= 0)
+ break;
+ }
+ preempt_enable();
+
+ return cpu;
+}
+
+/*
+ * Find an idle CPU in the system, starting from @node.
+ */
+s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ s32 cpu;
+
+ /*
+ * Always search in the starting node first (this is an
+ * optimization that can save some cycles even when the search is
+ * not limited to a single node).
+ */
+ cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags);
+ if (cpu >= 0)
+ return cpu;
+
+ /*
+ * Stop the search if we are using only a single global cpumask
+ * (NUMA_NO_NODE) or if the search is restricted to the first node
+ * only.
+ */
+ if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE)
+ return -EBUSY;
+
+ /*
+ * Extend the search to the other online nodes.
+ */
+ return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags);
+}
+
+/*
+ * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
+ * domain is not defined).
+ */
+static unsigned int llc_weight(s32 cpu)
+{
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return 0;
+
+ return sd->span_weight;
+}
+
+/*
+ * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
+ * domain is not defined).
+ */
+static struct cpumask *llc_span(s32 cpu)
+{
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return 0;
+
+ return sched_domain_span(sd);
+}
+
+/*
+ * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
+ * NUMA domain is not defined).
+ */
+static unsigned int numa_weight(s32 cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ if (!sd)
+ return 0;
+ sg = sd->groups;
+ if (!sg)
+ return 0;
+
+ return sg->group_weight;
+}
+
+/*
+ * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
+ * domain is not defined).
+ */
+static struct cpumask *numa_span(s32 cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ if (!sd)
+ return NULL;
+ sg = sd->groups;
+ if (!sg)
+ return NULL;
+
+ return sched_group_span(sg);
+}
+
+/*
+ * Return true if the LLC domains do not perfectly overlap with the NUMA
+ * domains, false otherwise.
+ */
+static bool llc_numa_mismatch(void)
+{
+ int cpu;
+
+ /*
+ * We need to scan all online CPUs to verify whether their scheduling
+ * domains overlap.
+ *
+ * While it is rare to encounter architectures with asymmetric NUMA
+ * topologies, CPU hotplugging or virtualized environments can result
+ * in asymmetric configurations.
+ *
+ * For example:
+ *
+ * NUMA 0:
+ * - LLC 0: cpu0..cpu7
+ * - LLC 1: cpu8..cpu15 [offline]
+ *
+ * NUMA 1:
+ * - LLC 0: cpu16..cpu23
+ * - LLC 1: cpu24..cpu31
+ *
+ * In this case, if we only check the first online CPU (cpu0), we might
+ * incorrectly assume that the LLC and NUMA domains are fully
+ * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
+ * domains).
+ */
+ for_each_online_cpu(cpu)
+ if (llc_weight(cpu) != numa_weight(cpu))
+ return true;
+
+ return false;
+}
+
+/*
+ * Initialize topology-aware scheduling.
+ *
+ * Detect if the system has multiple LLC or multiple NUMA domains and enable
+ * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
+ * selection policy.
+ *
+ * Assumption: the kernel's internal topology representation assumes that each
+ * CPU belongs to a single LLC domain, and that each LLC domain is entirely
+ * contained within a single NUMA node.
+ */
+void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
+{
+ bool enable_llc = false, enable_numa = false;
+ unsigned int nr_cpus;
+ s32 cpu = cpumask_first(cpu_online_mask);
+
+ /*
+ * Enable LLC domain optimization only when there are multiple LLC
+ * domains among the online CPUs. If all online CPUs are part of a
+ * single LLC domain, the idle CPU selection logic can choose any
+ * online CPU without bias.
+ *
+ * Note that it is sufficient to check the LLC domain of the first
+ * online CPU to determine whether a single LLC domain includes all
+ * CPUs.
+ */
+ rcu_read_lock();
+ nr_cpus = llc_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus())
+ enable_llc = true;
+ pr_debug("sched_ext: LLC=%*pb weight=%u\n",
+ cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
+ }
+
+ /*
+ * Enable NUMA optimization only when there are multiple NUMA domains
+ * among the online CPUs and the NUMA domains don't perfectly overlaps
+ * with the LLC domains.
+ *
+ * If all CPUs belong to the same NUMA node and the same LLC domain,
+ * enabling both NUMA and LLC optimizations is unnecessary, as checking
+ * for an idle CPU in the same domain twice is redundant.
+ *
+ * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled ignore the NUMA
+ * optimization, as we would naturally select idle CPUs within
+ * specific NUMA nodes querying the corresponding per-node cpumask.
+ */
+ if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
+ nr_cpus = numa_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
+ enable_numa = true;
+ pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
+ cpumask_pr_args(numa_span(cpu)), nr_cpus);
+ }
+ }
+ rcu_read_unlock();
+
+ pr_debug("sched_ext: LLC idle selection %s\n",
+ str_enabled_disabled(enable_llc));
+ pr_debug("sched_ext: NUMA idle selection %s\n",
+ str_enabled_disabled(enable_numa));
+
+ if (enable_llc)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
+ if (enable_numa)
+ static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
+ else
+ static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
+}
+
+/*
+ * Built-in CPU idle selection policy:
+ *
+ * 1. Prioritize full-idle cores:
+ * - always prioritize CPUs from fully idle cores (both logical CPUs are
+ * idle) to avoid interference caused by SMT.
+ *
+ * 2. Reuse the same CPU:
+ * - prefer the last used CPU to take advantage of cached data (L1, L2) and
+ * branch prediction optimizations.
+ *
+ * 3. Pick a CPU within the same LLC (Last-Level Cache):
+ * - if the above conditions aren't met, pick a CPU that shares the same LLC
+ * to maintain cache locality.
+ *
+ * 4. Pick a CPU within the same NUMA node, if enabled:
+ * - choose a CPU from the same NUMA node to reduce memory access latency.
+ *
+ * 5. Pick any idle CPU usable by the task.
+ *
+ * Step 3 and 4 are performed only if the system has, respectively,
+ * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
+ * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs.
+ *
+ * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always
+ * begin in @prev_cpu's node and proceed to other nodes in order of
+ * increasing distance.
+ *
+ * Return the picked CPU if idle, or a negative value otherwise.
+ *
+ * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
+ * we never call ops.select_cpu() for them, see select_task_rq().
+ */
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags)
+{
+ const struct cpumask *llc_cpus = NULL;
+ const struct cpumask *numa_cpus = NULL;
+ int node = scx_cpu_node_if_enabled(prev_cpu);
+ s32 cpu;
+
+ /*
+ * This is necessary to protect llc_cpus.
+ */
+ rcu_read_lock();
+
+ /*
+ * Determine the scheduling domain only if the task is allowed to run
+ * on all CPUs.
+ *
+ * This is done primarily for efficiency, as it avoids the overhead of
+ * updating a cpumask every time we need to select an idle CPU (which
+ * can be costly in large SMP systems), but it also aligns logically:
+ * if a task's scheduling domain is restricted by user-space (through
+ * CPU affinity), the task will simply use the flat scheduling domain
+ * defined by user-space.
+ */
+ if (p->nr_cpus_allowed >= num_possible_cpus()) {
+ if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
+ numa_cpus = numa_span(prev_cpu);
+
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
+ llc_cpus = llc_span(prev_cpu);
+ }
+
+ /*
+ * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
+ */
+ if (wake_flags & SCX_WAKE_SYNC) {
+ int waker_node;
+
+ /*
+ * If the waker's CPU is cache affine and prev_cpu is idle,
+ * then avoid a migration.
+ */
+ cpu = smp_processor_id();
+ if (cpus_share_cache(cpu, prev_cpu) &&
+ scx_idle_test_and_clear_cpu(prev_cpu)) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+
+ /*
+ * If the waker's local DSQ is empty, and the system is under
+ * utilized, try to wake up @p to the local DSQ of the waker.
+ *
+ * Checking only for an empty local DSQ is insufficient as it
+ * could give the wakee an unfair advantage when the system is
+ * oversaturated.
+ *
+ * Checking only for the presence of idle CPUs is also
+ * insufficient as the local DSQ of the waker could have tasks
+ * piled up on it even if there is an idle core elsewhere on
+ * the system.
+ */
+ waker_node = cpu_to_node(cpu);
+ if (!(current->flags & PF_EXITING) &&
+ cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
+ (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
+ !cpumask_empty(idle_cpumask(waker_node)->cpu)) {
+ if (cpumask_test_cpu(cpu, p->cpus_ptr))
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * If CPU has SMT, any wholly idle CPU is likely a better pick than
+ * partially idle @prev_cpu.
+ */
+ if (sched_smt_active()) {
+ /*
+ * Keep using @prev_cpu if it's part of a fully idle core.
+ */
+ if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
+ scx_idle_test_and_clear_cpu(prev_cpu)) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any fully idle core in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any fully idle core in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any full-idle core usable by the task.
+ *
+ * If the node-aware idle CPU selection policy is enabled
+ * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always
+ * begin in prev_cpu's node and proceed to other nodes in
+ * order of increasing distance.
+ */
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags | SCX_PICK_IDLE_CORE);
+ if (cpu >= 0)
+ goto out_unlock;
+
+ /*
+ * Give up if we're strictly looking for a full-idle SMT
+ * core.
+ */
+ if (flags & SCX_PICK_IDLE_CORE) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+ }
+
+ /*
+ * Use @prev_cpu if it's idle.
+ */
+ if (scx_idle_test_and_clear_cpu(prev_cpu)) {
+ cpu = prev_cpu;
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any idle CPU in the same LLC domain.
+ */
+ if (llc_cpus) {
+ cpu = pick_idle_cpu_in_node(llc_cpus, node, 0);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any idle CPU in the same NUMA node.
+ */
+ if (numa_cpus) {
+ cpu = pick_idle_cpu_in_node(numa_cpus, node, 0);
+ if (cpu >= 0)
+ goto out_unlock;
+ }
+
+ /*
+ * Search for any idle CPU usable by the task.
+ *
+ * If the node-aware idle CPU selection policy is enabled
+ * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin
+ * in prev_cpu's node and proceed to other nodes in order of
+ * increasing distance.
+ */
+ cpu = scx_pick_idle_cpu(p->cpus_ptr, node, flags);
+ if (cpu >= 0)
+ goto out_unlock;
+
+out_unlock:
+ rcu_read_unlock();
+
+ return cpu;
+}
+
+/*
+ * Initialize global and per-node idle cpumasks.
+ */
+void scx_idle_init_masks(void)
+{
+ int node;
+
+ /* Allocate global idle cpumasks */
+ BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
+ BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL));
+
+ /* Allocate per-node idle cpumasks */
+ scx_idle_node_masks = kcalloc(num_possible_nodes(),
+ sizeof(*scx_idle_node_masks), GFP_KERNEL);
+ BUG_ON(!scx_idle_node_masks);
+
+ for_each_node(node) {
+ scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks),
+ GFP_KERNEL, node);
+ BUG_ON(!scx_idle_node_masks[node]);
+
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node));
+ BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node));
+ }
+}
+
+static void update_builtin_idle(int cpu, bool idle)
+{
+ int node = scx_cpu_node_if_enabled(cpu);
+ struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
+
+ assign_cpu(cpu, idle_cpus, idle);
+
+#ifdef CONFIG_SCHED_SMT
+ if (sched_smt_active()) {
+ const struct cpumask *smt = cpu_smt_mask(cpu);
+ struct cpumask *idle_smts = idle_cpumask(node)->smt;
+
+ if (idle) {
+ /*
+ * idle_smt handling is racy but that's fine as it's
+ * only for optimization and self-correcting.
+ */
+ if (!cpumask_subset(smt, idle_cpus))
+ return;
+ cpumask_or(idle_smts, idle_smts, smt);
+ } else {
+ cpumask_andnot(idle_smts, idle_smts, smt);
+ }
+ }
+#endif
+}
+
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Trigger ops.update_idle() only when transitioning from a task to
+ * the idle thread and vice versa.
+ *
+ * Idle transitions are indicated by do_notify being set to true,
+ * managed by put_prev_task_idle()/set_next_task_idle().
+ */
+ if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+
+ /*
+ * Update the idle masks:
+ * - for real idle transitions (do_notify == true)
+ * - for idle-to-idle transitions (indicated by the previous task
+ * being the idle thread, managed by pick_task_idle())
+ *
+ * Skip updating idle masks if the previous task is not the idle
+ * thread, since set_next_task_idle() has already handled it when
+ * transitioning from a task to the idle thread (calling this
+ * function with do_notify == true).
+ *
+ * In this way we can avoid updating the idle masks twice,
+ * unnecessarily.
+ */
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ if (do_notify || is_idle_task(rq->curr))
+ update_builtin_idle(cpu, idle);
+}
+
+static void reset_idle_masks(struct sched_ext_ops *ops)
+{
+ int node;
+
+ /*
+ * Consider all online cpus idle. Should converge to the actual state
+ * quickly.
+ */
+ if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
+ cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask);
+ cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask);
+ return;
+ }
+
+ for_each_node(node) {
+ const struct cpumask *node_mask = cpumask_of_node(node);
+
+ cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask);
+ cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask);
+ }
+}
+#endif /* CONFIG_SMP */
+
+void scx_idle_enable(struct sched_ext_ops *ops)
+{
+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))
+ static_branch_enable(&scx_builtin_idle_enabled);
+ else
+ static_branch_disable(&scx_builtin_idle_enabled);
+
+ if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
+ static_branch_enable(&scx_builtin_idle_per_node);
+ else
+ static_branch_disable(&scx_builtin_idle_per_node);
+
+#ifdef CONFIG_SMP
+ reset_idle_masks(ops);
+#endif
+}
+
+void scx_idle_disable(void)
+{
+ static_branch_disable(&scx_builtin_idle_enabled);
+ static_branch_disable(&scx_builtin_idle_per_node);
+}
+
+/********************************************************************************
+ * Helpers that can be called from the BPF scheduler.
+ */
+
+static int validate_node(int node)
+{
+ if (!static_branch_likely(&scx_builtin_idle_per_node)) {
+ scx_ops_error("per-node idle tracking is disabled");
+ return -EOPNOTSUPP;
+ }
+
+ /* Return no entry for NUMA_NO_NODE (not a critical scx error) */
+ if (node == NUMA_NO_NODE)
+ return -ENOENT;
+
+ /* Make sure node is in a valid range */
+ if (node < 0 || node >= nr_node_ids) {
+ scx_ops_error("invalid node %d", node);
+ return -EINVAL;
+ }
+
+ /* Make sure the node is part of the set of possible nodes */
+ if (!node_possible(node)) {
+ scx_ops_error("unavailable node %d", node);
+ return -EINVAL;
+ }
+
+ return node;
+}
+
+__bpf_kfunc_start_defs();
+
+static bool check_builtin_idle_enabled(void)
+{
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ return true;
+
+ scx_ops_error("built-in idle tracking is disabled");
+ return false;
+}
+
+/**
+ * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
+ * trigger an error if @cpu is invalid
+ * @cpu: target CPU
+ */
+__bpf_kfunc int scx_bpf_cpu_node(s32 cpu)
+{
+#ifdef CONFIG_NUMA
+ if (!ops_cpu_valid(cpu, NULL))
+ return NUMA_NO_NODE;
+
+ return cpu_to_node(cpu);
+#else
+ return 0;
+#endif
+}
+
+/**
+ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
+ * @p: task_struct to select a CPU for
+ * @prev_cpu: CPU @p was on previously
+ * @wake_flags: %SCX_WAKE_* flags
+ * @is_idle: out parameter indicating whether the returned CPU is idle
+ *
+ * Can only be called from ops.select_cpu() if the built-in CPU selection is
+ * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
+ * @p, @prev_cpu and @wake_flags match ops.select_cpu().
+ *
+ * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
+ * currently idle and thus a good candidate for direct dispatching.
+ */
+__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
+ u64 wake_flags, bool *is_idle)
+{
+#ifdef CONFIG_SMP
+ s32 cpu;
+#endif
+ if (!ops_cpu_valid(prev_cpu, NULL))
+ goto prev_cpu;
+
+ if (!check_builtin_idle_enabled())
+ goto prev_cpu;
+
+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
+ goto prev_cpu;
+
+#ifdef CONFIG_SMP
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ if (cpu >= 0) {
+ *is_idle = true;
+ return cpu;
+ }
+#endif
+
+prev_cpu:
+ *is_idle = false;
+ return prev_cpu;
+}
+
+/**
+ * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
+ * idle-tracking per-CPU cpumask of a target NUMA node.
+ * @node: target NUMA node
+ *
+ * Returns an empty cpumask if idle tracking is not enabled, if @node is
+ * not valid, or running on a UP kernel. In this case the actual error will
+ * be reported to the BPF scheduler via scx_ops_error().
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask_node(int node)
+{
+ node = validate_node(node);
+ if (node < 0)
+ return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+ return idle_cpumask(node)->cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
+ * per-CPU cpumask.
+ *
+ * Returns an empty mask if idle tracking is not enabled, or running on a
+ * UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
+{
+ if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
+ scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ return cpu_none_mask;
+ }
+
+ if (!check_builtin_idle_enabled())
+ return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+ return idle_cpumask(NUMA_NO_NODE)->cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the
+ * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be
+ * used to determine if an entire physical core is free.
+ * @node: target NUMA node
+ *
+ * Returns an empty cpumask if idle tracking is not enabled, if @node is
+ * not valid, or running on a UP kernel. In this case the actual error will
+ * be reported to the BPF scheduler via scx_ops_error().
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask_node(int node)
+{
+ node = validate_node(node);
+ if (node < 0)
+ return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+ if (sched_smt_active())
+ return idle_cpumask(node)->smt;
+ else
+ return idle_cpumask(node)->cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
+ * per-physical-core cpumask. Can be used to determine if an entire physical
+ * core is free.
+ *
+ * Returns an empty mask if idle tracking is not enabled, or running on a
+ * UP kernel.
+ */
+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
+{
+ if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
+ scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
+ return cpu_none_mask;
+ }
+
+ if (!check_builtin_idle_enabled())
+ return cpu_none_mask;
+
+#ifdef CONFIG_SMP
+ if (sched_smt_active())
+ return idle_cpumask(NUMA_NO_NODE)->smt;
+ else
+ return idle_cpumask(NUMA_NO_NODE)->cpu;
+#else
+ return cpu_none_mask;
+#endif
+}
+
+/**
+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
+ * either the percpu, or SMT idle-tracking cpumask.
+ * @idle_mask: &cpumask to use
+ */
+__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
+{
+ /*
+ * Empty function body because we aren't actually acquiring or releasing
+ * a reference to a global idle cpumask, which is read-only in the
+ * caller and is never released. The acquire / release semantics here
+ * are just used to make the cpumask a trusted pointer in the caller.
+ */
+}
+
+/**
+ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
+ * @cpu: cpu to test and clear idle for
+ *
+ * Returns %true if @cpu was idle and its idle state was successfully cleared.
+ * %false otherwise.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ */
+__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
+{
+ if (!check_builtin_idle_enabled())
+ return false;
+
+ if (ops_cpu_valid(cpu, NULL))
+ return scx_idle_test_and_clear_cpu(cpu);
+ else
+ return false;
+}
+
+/**
+ * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node
+ * @cpus_allowed: Allowed cpumask
+ * @node: target NUMA node
+ * @flags: %SCX_PICK_IDLE_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
+ *
+ * Returns the picked idle cpu number on success, or -%EBUSY if no matching
+ * cpu was found.
+ *
+ * The search starts from @node and proceeds to other online NUMA nodes in
+ * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified,
+ * in which case the search is limited to the target @node).
+ *
+ * Always returns an error if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if
+ * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set.
+ */
+__bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
+ int node, u64 flags)
+{
+ node = validate_node(node);
+ if (node < 0)
+ return node;
+
+ return scx_pick_idle_cpu(cpus_allowed, node, flags);
+}
+
+/**
+ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
+ * number on success. -%EBUSY if no matching cpu was found.
+ *
+ * Idle CPU tracking may race against CPU scheduling state transitions. For
+ * example, this function may return -%EBUSY as CPUs are transitioning into the
+ * idle state. If the caller then assumes that there will be dispatch events on
+ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
+ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
+ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
+ * event in the near future.
+ *
+ * Unavailable if ops.update_idle() is implemented and
+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
+ *
+ * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use
+ * scx_bpf_pick_idle_cpu_node() instead.
+ */
+__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
+ u64 flags)
+{
+ if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
+ scx_ops_error("per-node idle tracking is enabled");
+ return -EBUSY;
+ }
+
+ if (!check_builtin_idle_enabled())
+ return -EBUSY;
+
+ return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
+}
+
+/**
+ * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available
+ * or pick any CPU from @node
+ * @cpus_allowed: Allowed cpumask
+ * @node: target NUMA node
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
+ * empty.
+ *
+ * The search starts from @node and proceeds to other online NUMA nodes in
+ * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified,
+ * in which case the search is limited to the target @node, regardless of
+ * the CPU idle state).
+ *
+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
+ * set, this function can't tell which CPUs are idle and will always pick any
+ * CPU.
+ */
+__bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
+ int node, u64 flags)
+{
+ s32 cpu;
+
+ node = validate_node(node);
+ if (node < 0)
+ return node;
+
+ cpu = scx_pick_idle_cpu(cpus_allowed, node, flags);
+ if (cpu >= 0)
+ return cpu;
+
+ if (flags & SCX_PICK_IDLE_IN_NODE)
+ cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed);
+ else
+ cpu = cpumask_any_distribute(cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ else
+ return -EBUSY;
+}
+
+/**
+ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
+ * @cpus_allowed: Allowed cpumask
+ * @flags: %SCX_PICK_IDLE_CPU_* flags
+ *
+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
+ * empty.
+ *
+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
+ * set, this function can't tell which CPUs are idle and will always pick any
+ * CPU.
+ *
+ * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use
+ * scx_bpf_pick_any_cpu_node() instead.
+ */
+__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
+ u64 flags)
+{
+ s32 cpu;
+
+ if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
+ scx_ops_error("per-node idle tracking is enabled");
+ return -EBUSY;
+ }
+
+ if (static_branch_likely(&scx_builtin_idle_enabled)) {
+ cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
+ if (cpu >= 0)
+ return cpu;
+ }
+
+ cpu = cpumask_any_distribute(cpus_allowed);
+ if (cpu < nr_cpu_ids)
+ return cpu;
+ else
+ return -EBUSY;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(scx_kfunc_ids_idle)
+BTF_ID_FLAGS(func, scx_bpf_cpu_node)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_RCU)
+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_idle)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_idle,
+};
+
+BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
+BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
+
+static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
+ .owner = THIS_MODULE,
+ .set = &scx_kfunc_ids_select_cpu,
+};
+
+int scx_idle_init(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ||
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle);
+
+ return ret;
+}
diff --git a/kernel/sched/ext_idle.h b/kernel/sched/ext_idle.h
new file mode 100644
index 000000000000..511cc2221f7a
--- /dev/null
+++ b/kernel/sched/ext_idle.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
+ */
+#ifndef _KERNEL_SCHED_EXT_IDLE_H
+#define _KERNEL_SCHED_EXT_IDLE_H
+
+struct sched_ext_ops;
+
+#ifdef CONFIG_SMP
+void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops);
+void scx_idle_init_masks(void);
+bool scx_idle_test_and_clear_cpu(int cpu);
+s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags);
+#else /* !CONFIG_SMP */
+static inline void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops) {}
+static inline void scx_idle_init_masks(void) {}
+static inline bool scx_idle_test_and_clear_cpu(int cpu) { return false; }
+static inline s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
+{
+ return -EBUSY;
+}
+#endif /* CONFIG_SMP */
+
+s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags);
+void scx_idle_enable(struct sched_ext_ops *ops);
+void scx_idle_disable(void);
+int scx_idle_init(void);
+
+#endif /* _KERNEL_SCHED_EXT_IDLE_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c798d2795243..a0c4cd26ee07 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -74,12 +74,12 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
/*
* Minimal preemption granularity for CPU-bound tasks:
*
- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_base_slice = 750000ULL;
-static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
+unsigned int sysctl_sched_base_slice = 700000ULL;
+static unsigned int normalized_sysctl_sched_base_slice = 700000ULL;
-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+__read_mostly unsigned int sysctl_sched_migration_cost = 500000UL;
static int __init setup_sched_thermal_decay_shift(char *str)
{
@@ -399,7 +399,7 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
- SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
+ WARN_ON_ONCE(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
/* Iterate through all leaf cfs_rq's on a runqueue */
@@ -696,7 +696,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 vlag, limit;
- SCHED_WARN_ON(!se->on_rq);
+ WARN_ON_ONCE(!se->on_rq);
vlag = avg_vruntime(cfs_rq) - se->vruntime;
limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
@@ -884,6 +884,26 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
}
/*
+ * HACK, stash a copy of deadline at the point of pick in vlag,
+ * which isn't used until dequeue.
+ */
+static inline void set_protect_slice(struct sched_entity *se)
+{
+ se->vlag = se->deadline;
+}
+
+static inline bool protect_slice(struct sched_entity *se)
+{
+ return se->vlag == se->deadline;
+}
+
+static inline void cancel_protect_slice(struct sched_entity *se)
+{
+ if (protect_slice(se))
+ se->vlag = se->deadline + 1;
+}
+
+/*
* Earliest Eligible Virtual Deadline First
*
* In order to provide latency guarantees for different request sizes
@@ -919,11 +939,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
curr = NULL;
- /*
- * Once selected, run a task until it either becomes non-eligible or
- * until it gets a new slice. See the HACK in set_next_entity().
- */
- if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline)
+ if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr))
return curr;
/* Pick the leftmost entity if it's eligible */
@@ -967,7 +983,6 @@ found:
return best;
}
-#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
@@ -994,7 +1009,6 @@ int sched_update_scaling(void)
return 0;
}
#endif
-#endif
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
@@ -3301,7 +3315,7 @@ static void task_numa_work(struct callback_head *work)
bool vma_pids_skipped;
bool vma_pids_forced = false;
- SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
+ WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
work->next = work;
/*
@@ -4020,7 +4034,7 @@ static inline bool load_avg_is_decayed(struct sched_avg *sa)
* Make sure that rounding and/or propagation of PELT values never
* break this.
*/
- SCHED_WARN_ON(sa->load_avg ||
+ WARN_ON_ONCE(sa->load_avg ||
sa->util_avg ||
sa->runnable_avg);
@@ -5444,7 +5458,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
clear_buddies(cfs_rq, se);
if (flags & DEQUEUE_DELAYED) {
- SCHED_WARN_ON(!se->sched_delayed);
+ WARN_ON_ONCE(!se->sched_delayed);
} else {
bool delay = sleep;
/*
@@ -5454,7 +5468,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_SPECIAL)
delay = false;
- SCHED_WARN_ON(delay && se->sched_delayed);
+ WARN_ON_ONCE(delay && se->sched_delayed);
if (sched_feat(DELAY_DEQUEUE) && delay &&
!entity_eligible(cfs_rq, se)) {
@@ -5530,15 +5544,12 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_wait_end_fair(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(cfs_rq, se, UPDATE_TG);
- /*
- * HACK, stash a copy of deadline at the point of pick in vlag,
- * which isn't used until dequeue.
- */
- se->vlag = se->deadline;
+
+ set_protect_slice(se);
}
update_stats_curr_start(cfs_rq, se);
- SCHED_WARN_ON(cfs_rq->curr);
+ WARN_ON_ONCE(cfs_rq->curr);
cfs_rq->curr = se;
/*
@@ -5579,7 +5590,7 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
if (sched_feat(PICK_BUDDY) &&
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
/* ->next will never be delayed */
- SCHED_WARN_ON(cfs_rq->next->sched_delayed);
+ WARN_ON_ONCE(cfs_rq->next->sched_delayed);
return cfs_rq->next;
}
@@ -5615,7 +5626,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* in !on_rq case, update occurred at dequeue */
update_load_avg(cfs_rq, prev, 0);
}
- SCHED_WARN_ON(cfs_rq->curr != prev);
+ WARN_ON_ONCE(cfs_rq->curr != prev);
cfs_rq->curr = NULL;
}
@@ -5838,7 +5849,7 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttled_clock_self = 0;
- if (SCHED_WARN_ON((s64)delta < 0))
+ if (WARN_ON_ONCE((s64)delta < 0))
delta = 0;
cfs_rq->throttled_clock_self_time += delta;
@@ -5858,7 +5869,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
- SCHED_WARN_ON(cfs_rq->throttled_clock_self);
+ WARN_ON_ONCE(cfs_rq->throttled_clock_self);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock_self = rq_clock(rq);
}
@@ -5967,7 +5978,7 @@ done:
* throttled-list. rq->lock protects completion.
*/
cfs_rq->throttled = 1;
- SCHED_WARN_ON(cfs_rq->throttled_clock);
+ WARN_ON_ONCE(cfs_rq->throttled_clock);
if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq);
return true;
@@ -6123,7 +6134,7 @@ static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
}
/* Already enqueued */
- if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+ if (WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_csd_list)))
return;
first = list_empty(&rq->cfsb_csd_list);
@@ -6142,7 +6153,7 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
{
lockdep_assert_rq_held(rq_of(cfs_rq));
- if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+ if (WARN_ON_ONCE(!cfs_rq_throttled(cfs_rq) ||
cfs_rq->runtime_remaining <= 0))
return;
@@ -6178,7 +6189,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
goto next;
/* By the above checks, this should never be true */
- SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
+ WARN_ON_ONCE(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock);
runtime = -cfs_rq->runtime_remaining + 1;
@@ -6199,7 +6210,7 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
* We currently only expect to be unthrottling
* a single cfs_rq locally.
*/
- SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ WARN_ON_ONCE(!list_empty(&local_unthrottle));
list_add_tail(&cfs_rq->throttled_csd_list,
&local_unthrottle);
}
@@ -6224,7 +6235,7 @@ next:
rq_unlock_irqrestore(rq, &rf);
}
- SCHED_WARN_ON(!list_empty(&local_unthrottle));
+ WARN_ON_ONCE(!list_empty(&local_unthrottle));
rcu_read_unlock();
@@ -6776,7 +6787,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
struct sched_entity *se = &p->se;
- SCHED_WARN_ON(task_rq(p) != rq);
+ WARN_ON_ONCE(task_rq(p) != rq);
if (rq->cfs.h_nr_queued > 1) {
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
@@ -6887,8 +6898,8 @@ requeue_delayed_entity(struct sched_entity *se)
* Because a delayed entity is one that is still on
* the runqueue competing until elegibility.
*/
- SCHED_WARN_ON(!se->sched_delayed);
- SCHED_WARN_ON(!se->on_rq);
+ WARN_ON_ONCE(!se->sched_delayed);
+ WARN_ON_ONCE(!se->on_rq);
if (sched_feat(DELAY_ZERO)) {
update_entity_lag(cfs_rq, se);
@@ -6991,6 +7002,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_group(se);
se->slice = slice;
+ if (se != cfs_rq->curr)
+ min_vruntime_cb_propagate(&se->run_node, NULL);
slice = cfs_rq_min_slice(cfs_rq);
cfs_rq->h_nr_runnable += h_nr_runnable;
@@ -7120,6 +7133,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
update_cfs_group(se);
se->slice = slice;
+ if (se != cfs_rq->curr)
+ min_vruntime_cb_propagate(&se->run_node, NULL);
slice = cfs_rq_min_slice(cfs_rq);
cfs_rq->h_nr_runnable -= h_nr_runnable;
@@ -7144,8 +7159,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
rq->next_balance = jiffies;
if (p && task_delayed) {
- SCHED_WARN_ON(!task_sleep);
- SCHED_WARN_ON(p->on_rq != 1);
+ WARN_ON_ONCE(!task_sleep);
+ WARN_ON_ONCE(p->on_rq != 1);
/* Fix-up what dequeue_task_fair() skipped */
hrtick_update(rq);
@@ -8723,7 +8738,7 @@ static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
static void set_next_buddy(struct sched_entity *se)
{
for_each_sched_entity(se) {
- if (SCHED_WARN_ON(!se->on_rq))
+ if (WARN_ON_ONCE(!se->on_rq))
return;
if (se_is_idle(se))
return;
@@ -8783,8 +8798,15 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* Preempt an idle entity in favor of a non-idle entity (and don't preempt
* in the inverse case).
*/
- if (cse_is_idle && !pse_is_idle)
+ if (cse_is_idle && !pse_is_idle) {
+ /*
+ * When non-idle entity preempt an idle entity,
+ * don't give idle entity slice protection.
+ */
+ cancel_protect_slice(se);
goto preempt;
+ }
+
if (cse_is_idle != pse_is_idle)
return;
@@ -8803,8 +8825,8 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* Note that even if @p does not turn out to be the most eligible
* task at this moment, current's slice protection will be lost.
*/
- if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline)
- se->vlag = se->deadline + 1;
+ if (do_preempt_short(cfs_rq, pse, se))
+ cancel_protect_slice(se);
/*
* If @p has become the most eligible task, force preemption.
@@ -9417,12 +9439,11 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 0;
/* Prevent to re-select dst_cpu via env's CPUs: */
- for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
- env->flags |= LBF_DST_PINNED;
- env->new_dst_cpu = cpu;
- break;
- }
+ cpu = cpumask_first_and_and(env->dst_grpmask, env->cpus, p->cpus_ptr);
+
+ if (cpu < nr_cpu_ids) {
+ env->flags |= LBF_DST_PINNED;
+ env->new_dst_cpu = cpu;
}
return 0;
@@ -12461,7 +12482,7 @@ unlock:
void nohz_balance_exit_idle(struct rq *rq)
{
- SCHED_WARN_ON(rq != this_rq());
+ WARN_ON_ONCE(rq != this_rq());
if (likely(!rq->nohz_tick_stopped))
return;
@@ -12497,7 +12518,7 @@ void nohz_balance_enter_idle(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- SCHED_WARN_ON(cpu != smp_processor_id());
+ WARN_ON_ONCE(cpu != smp_processor_id());
/* If this CPU is going down, then nothing needs to be done: */
if (!cpu_active(cpu))
@@ -12580,7 +12601,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags)
int balance_cpu;
struct rq *rq;
- SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
+ WARN_ON_ONCE((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
/*
* We assume there will be no idle load after this update and clear
@@ -13020,7 +13041,7 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b,
struct cfs_rq *cfs_rqb;
s64 delta;
- SCHED_WARN_ON(task_rq(b)->core != rq->core);
+ WARN_ON_ONCE(task_rq(b)->core != rq->core);
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -13223,7 +13244,7 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
- SCHED_WARN_ON(p->se.sched_delayed);
+ WARN_ON_ONCE(p->se.sched_delayed);
attach_task_cfs_rq(p);
@@ -13258,7 +13279,7 @@ static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool firs
if (!first)
return;
- SCHED_WARN_ON(se->sched_delayed);
+ WARN_ON_ONCE(se->sched_delayed);
if (hrtick_enabled_fair(rq))
hrtick_start_fair(rq, p);
@@ -13645,7 +13666,6 @@ DEFINE_SCHED_CLASS(fair) = {
#endif
};
-#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
struct cfs_rq *cfs_rq, *pos;
@@ -13679,7 +13699,6 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
rcu_read_unlock();
}
#endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
__init void init_sched_fair_class(void)
{
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 4b8e33c615b1..a4774155ae12 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -169,9 +169,8 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
-#ifdef CONFIG_SCHED_DEBUG
WARN_ON_ONCE(!rt_entity_is_task(rt_se));
-#endif
+
return container_of(rt_se, struct task_struct, rt);
}
@@ -1713,7 +1712,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
BUG_ON(idx >= MAX_RT_PRIO);
queue = array->queue + idx;
- if (SCHED_WARN_ON(list_empty(queue)))
+ if (WARN_ON_ONCE(list_empty(queue)))
return NULL;
next = list_entry(queue->next, struct sched_rt_entity, run_list);
@@ -2910,6 +2909,7 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff
int ret;
mutex_lock(&mutex);
+ sched_domains_mutex_lock();
old_period = sysctl_sched_rt_period;
old_runtime = sysctl_sched_rt_runtime;
@@ -2936,6 +2936,7 @@ undo:
sysctl_sched_rt_period = old_period;
sysctl_sched_rt_runtime = old_runtime;
}
+ sched_domains_mutex_unlock();
mutex_unlock(&mutex);
return ret;
@@ -2967,7 +2968,6 @@ static int sched_rr_handler(const struct ctl_table *table, int write, void *buff
}
#endif /* CONFIG_SYSCTL */
-#ifdef CONFIG_SCHED_DEBUG
void print_rt_stats(struct seq_file *m, int cpu)
{
rt_rq_iter_t iter;
@@ -2978,4 +2978,3 @@ void print_rt_stats(struct seq_file *m, int cpu)
print_rt_rq(m, cpu, rt_rq);
rcu_read_unlock();
}
-#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 023b844159c9..47972f34ea70 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -91,12 +91,6 @@ struct cpuidle_state;
#include "cpupri.h"
#include "cpudeadline.h"
-#ifdef CONFIG_SCHED_DEBUG
-# define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
-#else
-# define SCHED_WARN_ON(x) ({ (void)(x), 0; })
-#endif
-
/* task_struct::on_rq states: */
#define TASK_ON_RQ_QUEUED 1
#define TASK_ON_RQ_MIGRATING 2
@@ -998,7 +992,7 @@ struct root_domain {
* Also, some corner cases, like 'wrap around' is dangerous, but given
* that u64 is 'big enough'. So that shouldn't be a concern.
*/
- u64 visit_gen;
+ u64 visit_cookie;
#ifdef HAVE_RT_PUSH_IPI
/*
@@ -1180,10 +1174,8 @@ struct rq {
atomic_t nr_iowait;
-#ifdef CONFIG_SCHED_DEBUG
u64 last_seen_need_resched_ns;
int ticks_without_resched;
-#endif
#ifdef CONFIG_MEMBARRIER
int membarrier_state;
@@ -1571,7 +1563,7 @@ static inline void update_idle_core(struct rq *rq) { }
static inline struct task_struct *task_of(struct sched_entity *se)
{
- SCHED_WARN_ON(!entity_is_task(se));
+ WARN_ON_ONCE(!entity_is_task(se));
return container_of(se, struct task_struct, se);
}
@@ -1652,7 +1644,7 @@ static inline void assert_clock_updated(struct rq *rq)
* The only reason for not seeing a clock update since the
* last rq_pin_lock() is if we're currently skipping updates.
*/
- SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP);
+ WARN_ON_ONCE(rq->clock_update_flags < RQCF_ACT_SKIP);
}
static inline u64 rq_clock(struct rq *rq)
@@ -1699,7 +1691,7 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq)
static inline void rq_clock_start_loop_update(struct rq *rq)
{
lockdep_assert_rq_held(rq);
- SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
+ WARN_ON_ONCE(rq->clock_update_flags & RQCF_ACT_SKIP);
rq->clock_update_flags |= RQCF_ACT_SKIP;
}
@@ -1712,14 +1704,12 @@ static inline void rq_clock_stop_loop_update(struct rq *rq)
struct rq_flags {
unsigned long flags;
struct pin_cookie cookie;
-#ifdef CONFIG_SCHED_DEBUG
/*
* A copy of (rq::clock_update_flags & RQCF_UPDATED) for the
* current pin context is stashed here in case it needs to be
* restored in rq_repin_lock().
*/
unsigned int clock_update_flags;
-#endif
};
extern struct balance_callback balance_push_callback;
@@ -1770,21 +1760,18 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
{
rf->cookie = lockdep_pin_lock(__rq_lockp(rq));
-#ifdef CONFIG_SCHED_DEBUG
rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
rf->clock_update_flags = 0;
-# ifdef CONFIG_SMP
- SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback);
-# endif
+#ifdef CONFIG_SMP
+ WARN_ON_ONCE(rq->balance_callback && rq->balance_callback != &balance_push_callback);
#endif
}
static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
{
-#ifdef CONFIG_SCHED_DEBUG
if (rq->clock_update_flags > RQCF_ACT_SKIP)
rf->clock_update_flags = RQCF_UPDATED;
-#endif
+
scx_rq_clock_invalidate(rq);
lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
}
@@ -1793,12 +1780,10 @@ static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf)
{
lockdep_repin_lock(__rq_lockp(rq), rf->cookie);
-#ifdef CONFIG_SCHED_DEBUG
/*
* Restore the value we stashed in @rf for this pin context.
*/
rq->clock_update_flags |= rf->clock_update_flags;
-#endif
}
extern
@@ -2072,9 +2057,7 @@ struct sched_group_capacity {
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
-#ifdef CONFIG_SCHED_DEBUG
int id;
-#endif
unsigned long cpumask[]; /* Balance mask */
};
@@ -2114,13 +2097,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
extern int group_balance_cpu(struct sched_group *sg);
-#ifdef CONFIG_SCHED_DEBUG
extern void update_sched_domain_debugfs(void);
extern void dirty_sched_domain_sysctl(int cpu);
-#else
-static inline void update_sched_domain_debugfs(void) { }
-static inline void dirty_sched_domain_sysctl(int cpu) { }
-#endif
extern int sched_update_scaling(void);
@@ -2200,13 +2178,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
}
/*
- * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ * Tunables:
*/
-#ifdef CONFIG_SCHED_DEBUG
-# define const_debug __read_mostly
-#else
-# define const_debug const
-#endif
#define SCHED_FEAT(name, enabled) \
__SCHED_FEAT_##name ,
@@ -2218,13 +2191,11 @@ enum {
#undef SCHED_FEAT
-#ifdef CONFIG_SCHED_DEBUG
-
/*
* To support run-time toggling of sched features, all the translation units
* (but core.c) reference the sysctl_sched_features defined in core.c.
*/
-extern const_debug unsigned int sysctl_sched_features;
+extern __read_mostly unsigned int sysctl_sched_features;
#ifdef CONFIG_JUMP_LABEL
@@ -2246,24 +2217,6 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#endif /* !CONFIG_JUMP_LABEL */
-#else /* !SCHED_DEBUG: */
-
-/*
- * Each translation unit has its own copy of sysctl_sched_features to allow
- * constants propagation at compile time and compiler optimization based on
- * features default.
- */
-#define SCHED_FEAT(name, enabled) \
- (1UL << __SCHED_FEAT_##name) * enabled |
-static const_debug __maybe_unused unsigned int sysctl_sched_features =
-#include "features.h"
- 0;
-#undef SCHED_FEAT
-
-#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
-
-#endif /* !SCHED_DEBUG */
-
extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;
@@ -2685,7 +2638,7 @@ static inline void idle_set_state(struct rq *rq,
static inline struct cpuidle_state *idle_get_state(struct rq *rq)
{
- SCHED_WARN_ON(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held());
return rq->idle_state;
}
@@ -2843,12 +2796,11 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags);
# define SCHED_NR_MIGRATE_BREAK 32
#endif
-extern const_debug unsigned int sysctl_sched_nr_migrate;
-extern const_debug unsigned int sysctl_sched_migration_cost;
+extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_base_slice;
-#ifdef CONFIG_SCHED_DEBUG
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;
@@ -2859,7 +2811,6 @@ extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_hot_threshold;
-#endif
#ifdef CONFIG_SCHED_HRTICK
@@ -2932,7 +2883,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
}
#endif
-#ifdef CONFIG_SCHED_DEBUG
/*
* In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
* acquire rq lock instead of rq_lock(). So at the end of these two functions
@@ -2947,9 +2897,6 @@ static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
#endif
}
-#else
-static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) { }
-#endif
#define DEFINE_LOCK_GUARD_2(name, type, _lock, _unlock, ...) \
__DEFINE_UNLOCK_GUARD(name, type, _unlock, type *lock2; __VA_ARGS__) \
@@ -3162,7 +3109,6 @@ extern struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
-#ifdef CONFIG_SCHED_DEBUG
extern bool sched_debug_verbose;
extern void print_cfs_stats(struct seq_file *m, int cpu);
@@ -3173,15 +3119,12 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
extern void resched_latency_warn(int cpu, u64 latency);
-# ifdef CONFIG_NUMA_BALANCING
+#ifdef CONFIG_NUMA_BALANCING
extern void show_numa_stats(struct task_struct *p, struct seq_file *m);
extern void
print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
unsigned long tpf, unsigned long gsf, unsigned long gpf);
-# endif /* CONFIG_NUMA_BALANCING */
-#else /* !CONFIG_SCHED_DEBUG: */
-static inline void resched_latency_warn(int cpu, u64 latency) { }
-#endif /* !CONFIG_SCHED_DEBUG */
+#endif /* CONFIG_NUMA_BALANCING */
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq);
@@ -3394,6 +3337,31 @@ static inline bool update_other_load_avgs(struct rq *rq) { return false; }
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+/*
+ * When uclamp is compiled in, the aggregation at rq level is 'turned off'
+ * by default in the fast path and only gets turned on once userspace performs
+ * an operation that requires it.
+ *
+ * Returns true if userspace opted-in to use uclamp and aggregation at rq level
+ * hence is active.
+ */
+static inline bool uclamp_is_used(void)
+{
+ return static_branch_likely(&sched_uclamp_used);
+}
+
+/*
+ * Enabling static branches would get the cpus_read_lock(),
+ * check whether uclamp_is_used before enable it to avoid always
+ * calling cpus_read_lock(). Because we never disable this
+ * static key once enable it.
+ */
+static inline void sched_uclamp_enable(void)
+{
+ if (!uclamp_is_used())
+ static_branch_enable(&sched_uclamp_used);
+}
+
static inline unsigned long uclamp_rq_get(struct rq *rq,
enum uclamp_id clamp_id)
{
@@ -3417,7 +3385,7 @@ static inline bool uclamp_rq_is_capped(struct rq *rq)
unsigned long rq_util;
unsigned long max_util;
- if (!static_branch_likely(&sched_uclamp_used))
+ if (!uclamp_is_used())
return false;
rq_util = cpu_util_cfs(cpu_of(rq)) + cpu_util_rt(rq);
@@ -3426,19 +3394,6 @@ static inline bool uclamp_rq_is_capped(struct rq *rq)
return max_util != SCHED_CAPACITY_SCALE && rq_util >= max_util;
}
-/*
- * When uclamp is compiled in, the aggregation at rq level is 'turned off'
- * by default in the fast path and only gets turned on once userspace performs
- * an operation that requires it.
- *
- * Returns true if userspace opted-in to use uclamp and aggregation at rq level
- * hence is active.
- */
-static inline bool uclamp_is_used(void)
-{
- return static_branch_likely(&sched_uclamp_used);
-}
-
#define for_each_clamp_id(clamp_id) \
for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
@@ -3486,6 +3441,8 @@ static inline bool uclamp_is_used(void)
return false;
}
+static inline void sched_uclamp_enable(void) {}
+
static inline unsigned long
uclamp_rq_get(struct rq *rq, enum uclamp_id clamp_id)
{
@@ -3619,6 +3576,7 @@ extern int preempt_dynamic_mode;
extern int sched_dynamic_mode(const char *str);
extern void sched_dynamic_update(int mode);
#endif
+extern const char *preempt_modes[];
#ifdef CONFIG_SCHED_MM_CID
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 19cdbe96f93d..452826df6ae1 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -144,7 +144,7 @@ static inline void psi_enqueue(struct task_struct *p, int flags)
if (p->se.sched_delayed) {
/* CPU migration of "sleeping" task */
- SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
+ WARN_ON_ONCE(!(flags & ENQUEUE_MIGRATED));
if (p->in_memstall)
set |= TSK_MEMSTALL;
if (p->in_iowait)
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 456d339be98f..c326de1344fb 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -368,7 +368,7 @@ static int uclamp_validate(struct task_struct *p,
* blocking operation which obviously cannot be done while holding
* scheduler locks.
*/
- static_branch_enable(&sched_uclamp_used);
+ sched_uclamp_enable();
return 0;
}
@@ -875,7 +875,7 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
struct sched_param lparam;
- if (!param || pid < 0)
+ if (unlikely(!param || pid < 0))
return -EINVAL;
if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
return -EFAULT;
@@ -984,7 +984,7 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
struct sched_attr attr;
int retval;
- if (!uattr || pid < 0 || flags)
+ if (unlikely(!uattr || pid < 0 || flags))
return -EINVAL;
retval = sched_copy_attr(uattr, &attr);
@@ -1049,7 +1049,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
struct task_struct *p;
int retval;
- if (!param || pid < 0)
+ if (unlikely(!param || pid < 0))
return -EINVAL;
scoped_guard (rcu) {
@@ -1085,8 +1085,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
struct task_struct *p;
int retval;
- if (!uattr || pid < 0 || usize > PAGE_SIZE ||
- usize < SCHED_ATTR_SIZE_VER0 || flags)
+ if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE ||
+ usize < SCHED_ATTR_SIZE_VER0 || flags))
return -EINVAL;
scoped_guard (rcu) {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index c49aea8c1025..f1ebc60d967f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -6,13 +6,19 @@
#include <linux/bsearch.h>
DEFINE_MUTEX(sched_domains_mutex);
+void sched_domains_mutex_lock(void)
+{
+ mutex_lock(&sched_domains_mutex);
+}
+void sched_domains_mutex_unlock(void)
+{
+ mutex_unlock(&sched_domains_mutex);
+}
/* Protected by sched_domains_mutex: */
static cpumask_var_t sched_domains_tmpmask;
static cpumask_var_t sched_domains_tmpmask2;
-#ifdef CONFIG_SCHED_DEBUG
-
static int __init sched_debug_setup(char *str)
{
sched_debug_verbose = true;
@@ -151,15 +157,6 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
break;
}
}
-#else /* !CONFIG_SCHED_DEBUG */
-
-# define sched_debug_verbose 0
-# define sched_domain_debug(sd, cpu) do { } while (0)
-static inline bool sched_debug(void)
-{
- return false;
-}
-#endif /* CONFIG_SCHED_DEBUG */
/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
@@ -560,7 +557,7 @@ static int init_rootdomain(struct root_domain *rd)
rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
#endif
- rd->visit_gen = 0;
+ rd->visit_cookie = 0;
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;
@@ -2275,9 +2272,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sgc)
return -ENOMEM;
-#ifdef CONFIG_SCHED_DEBUG
sgc->id = j;
-#endif
*per_cpu_ptr(sdd->sgc, j) = sgc;
}
@@ -2680,7 +2675,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
*
* Call with hotplug lock and sched_domains_mutex held
*/
-void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
+static void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new)
{
bool __maybe_unused has_eas = false;
@@ -2712,21 +2707,8 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
for (i = 0; i < ndoms_cur; i++) {
for (j = 0; j < n && !new_topology; j++) {
if (cpumask_equal(doms_cur[i], doms_new[j]) &&
- dattrs_equal(dattr_cur, i, dattr_new, j)) {
- struct root_domain *rd;
-
- /*
- * This domain won't be destroyed and as such
- * its dl_bw->total_bw needs to be cleared.
- * Tasks contribution will be then recomputed
- * in function dl_update_tasks_root_domain(),
- * dl_servers contribution in function
- * dl_restore_server_root_domain().
- */
- rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
- dl_clear_root_domain(rd);
+ dattrs_equal(dattr_cur, i, dattr_new, j))
goto match1;
- }
}
/* No match - a current sched domain not in new doms_new[] */
detach_destroy_domains(doms_cur[i]);
@@ -2783,6 +2765,7 @@ match3:
ndoms_cur = ndoms_new;
update_sched_domain_debugfs();
+ dl_rebuild_rd_accounting();
}
/*
@@ -2791,7 +2774,7 @@ match3:
void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new)
{
- mutex_lock(&sched_domains_mutex);
+ sched_domains_mutex_lock();
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
- mutex_unlock(&sched_domains_mutex);
+ sched_domains_mutex_unlock();
}