summaryrefslogtreecommitdiff
path: root/kernel/sched/ext.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r--kernel/sched/ext.c1309
1 files changed, 528 insertions, 781 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 19813b387ef9..21575d39c376 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -6,6 +6,9 @@
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
*/
+#include <linux/btf_ids.h>
+#include "ext_idle.h"
+
#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
enum scx_consts {
@@ -93,7 +96,7 @@ enum scx_ops_flags {
/*
* Keep built-in idle tracking even if ops.update_idle() is implemented.
*/
- SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
/*
* By default, if there are no other task to run on the CPU, ext core
@@ -101,7 +104,7 @@ enum scx_ops_flags {
* flag is specified, such tasks are passed to ops.enqueue() with
* %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
*/
- SCX_OPS_ENQ_LAST = 1LLU << 1,
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
/*
* An exiting task may schedule after PF_EXITING is set. In such cases,
@@ -114,13 +117,48 @@ enum scx_ops_flags {
* depend on pid lookups and wants to handle these tasks directly, the
* following flag can be used.
*/
- SCX_OPS_ENQ_EXITING = 1LLU << 2,
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
/*
* If set, only tasks with policy set to SCHED_EXT are attached to
* sched_ext. If clear, SCHED_NORMAL tasks are also included.
*/
- SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+
+ /*
+ * A migration disabled task can only execute on its current CPU. By
+ * default, such tasks are automatically put on the CPU's local DSQ with
+ * the default slice on enqueue. If this ops flag is set, they also go
+ * through ops.enqueue().
+ *
+ * A migration disabled task never invokes ops.select_cpu() as it can
+ * only select the current CPU. Also, p->cpus_ptr will only contain its
+ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+ * and thus may disagree with cpumask_weight(p->cpus_ptr).
+ */
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
+ * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
+ * ops.enqueue() on the ops.select_cpu() selected or the wakee's
+ * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
+ * transfers. When this optimization is enabled, ops.select_cpu() is
+ * skipped in some cases (when racing against the wakee switching out).
+ * As the BPF scheduler may depend on ops.select_cpu() being invoked
+ * during wakeups, queued wakeup is disabled by default.
+ *
+ * If this ops flag is set, queued wakeup optimization is enabled and
+ * the BPF scheduler must be able to handle ops.enqueue() invoked on the
+ * wakee's CPU without preceding ops.select_cpu() even for tasks which
+ * may be executed on multiple CPUs.
+ */
+ SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
+
+ /*
+ * If set, enable per-node idle cpumasks. If clear, use a single global
+ * flat idle cpumask.
+ */
+ SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 6,
/*
* CPU cgroup support flags
@@ -130,7 +168,10 @@ enum scx_ops_flags {
SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
SCX_OPS_ENQ_LAST |
SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
+ SCX_OPS_ALLOW_QUEUED_WAKEUP |
SCX_OPS_SWITCH_PARTIAL |
+ SCX_OPS_BUILTIN_IDLE_PER_NODE |
SCX_OPS_HAS_CGROUP_WEIGHT,
};
@@ -206,7 +247,7 @@ struct scx_dump_ctx {
*/
struct sched_ext_ops {
/**
- * select_cpu - Pick the target CPU for a task which is being woken up
+ * @select_cpu: Pick the target CPU for a task which is being woken up
* @p: task being woken up
* @prev_cpu: the cpu @p was on before sleeping
* @wake_flags: SCX_WAKE_*
@@ -233,7 +274,7 @@ struct sched_ext_ops {
s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
/**
- * enqueue - Enqueue a task on the BPF scheduler
+ * @enqueue: Enqueue a task on the BPF scheduler
* @p: task being enqueued
* @enq_flags: %SCX_ENQ_*
*
@@ -248,7 +289,7 @@ struct sched_ext_ops {
void (*enqueue)(struct task_struct *p, u64 enq_flags);
/**
- * dequeue - Remove a task from the BPF scheduler
+ * @dequeue: Remove a task from the BPF scheduler
* @p: task being dequeued
* @deq_flags: %SCX_DEQ_*
*
@@ -264,7 +305,7 @@ struct sched_ext_ops {
void (*dequeue)(struct task_struct *p, u64 deq_flags);
/**
- * dispatch - Dispatch tasks from the BPF scheduler and/or user DSQs
+ * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
* @cpu: CPU to dispatch tasks for
* @prev: previous task being switched out
*
@@ -287,7 +328,7 @@ struct sched_ext_ops {
void (*dispatch)(s32 cpu, struct task_struct *prev);
/**
- * tick - Periodic tick
+ * @tick: Periodic tick
* @p: task running currently
*
* This operation is called every 1/HZ seconds on CPUs which are
@@ -297,7 +338,7 @@ struct sched_ext_ops {
void (*tick)(struct task_struct *p);
/**
- * runnable - A task is becoming runnable on its associated CPU
+ * @runnable: A task is becoming runnable on its associated CPU
* @p: task becoming runnable
* @enq_flags: %SCX_ENQ_*
*
@@ -324,7 +365,7 @@ struct sched_ext_ops {
void (*runnable)(struct task_struct *p, u64 enq_flags);
/**
- * running - A task is starting to run on its associated CPU
+ * @running: A task is starting to run on its associated CPU
* @p: task starting to run
*
* See ->runnable() for explanation on the task state notifiers.
@@ -332,7 +373,7 @@ struct sched_ext_ops {
void (*running)(struct task_struct *p);
/**
- * stopping - A task is stopping execution
+ * @stopping: A task is stopping execution
* @p: task stopping to run
* @runnable: is task @p still runnable?
*
@@ -343,7 +384,7 @@ struct sched_ext_ops {
void (*stopping)(struct task_struct *p, bool runnable);
/**
- * quiescent - A task is becoming not runnable on its associated CPU
+ * @quiescent: A task is becoming not runnable on its associated CPU
* @p: task becoming not runnable
* @deq_flags: %SCX_DEQ_*
*
@@ -363,7 +404,7 @@ struct sched_ext_ops {
void (*quiescent)(struct task_struct *p, u64 deq_flags);
/**
- * yield - Yield CPU
+ * @yield: Yield CPU
* @from: yielding task
* @to: optional yield target task
*
@@ -378,7 +419,7 @@ struct sched_ext_ops {
bool (*yield)(struct task_struct *from, struct task_struct *to);
/**
- * core_sched_before - Task ordering for core-sched
+ * @core_sched_before: Task ordering for core-sched
* @a: task A
* @b: task B
*
@@ -396,7 +437,7 @@ struct sched_ext_ops {
bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
/**
- * set_weight - Set task weight
+ * @set_weight: Set task weight
* @p: task to set weight for
* @weight: new weight [1..10000]
*
@@ -405,7 +446,7 @@ struct sched_ext_ops {
void (*set_weight)(struct task_struct *p, u32 weight);
/**
- * set_cpumask - Set CPU affinity
+ * @set_cpumask: Set CPU affinity
* @p: task to set CPU affinity for
* @cpumask: cpumask of cpus that @p can run on
*
@@ -415,8 +456,8 @@ struct sched_ext_ops {
const struct cpumask *cpumask);
/**
- * update_idle - Update the idle state of a CPU
- * @cpu: CPU to udpate the idle state for
+ * @update_idle: Update the idle state of a CPU
+ * @cpu: CPU to update the idle state for
* @idle: whether entering or exiting the idle state
*
* This operation is called when @rq's CPU goes or leaves the idle
@@ -436,7 +477,7 @@ struct sched_ext_ops {
void (*update_idle)(s32 cpu, bool idle);
/**
- * cpu_acquire - A CPU is becoming available to the BPF scheduler
+ * @cpu_acquire: A CPU is becoming available to the BPF scheduler
* @cpu: The CPU being acquired by the BPF scheduler.
* @args: Acquire arguments, see the struct definition.
*
@@ -446,7 +487,7 @@ struct sched_ext_ops {
void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
/**
- * cpu_release - A CPU is taken away from the BPF scheduler
+ * @cpu_release: A CPU is taken away from the BPF scheduler
* @cpu: The CPU being released by the BPF scheduler.
* @args: Release arguments, see the struct definition.
*
@@ -458,7 +499,7 @@ struct sched_ext_ops {
void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
/**
- * init_task - Initialize a task to run in a BPF scheduler
+ * @init_task: Initialize a task to run in a BPF scheduler
* @p: task to initialize for BPF scheduling
* @args: init arguments, see the struct definition
*
@@ -473,8 +514,9 @@ struct sched_ext_ops {
s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
/**
- * exit_task - Exit a previously-running task from the system
+ * @exit_task: Exit a previously-running task from the system
* @p: task to exit
+ * @args: exit arguments, see the struct definition
*
* @p is exiting or the BPF scheduler is being unloaded. Perform any
* necessary cleanup for @p.
@@ -482,7 +524,7 @@ struct sched_ext_ops {
void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
/**
- * enable - Enable BPF scheduling for a task
+ * @enable: Enable BPF scheduling for a task
* @p: task to enable BPF scheduling for
*
* Enable @p for BPF scheduling. enable() is called on @p any time it
@@ -491,7 +533,7 @@ struct sched_ext_ops {
void (*enable)(struct task_struct *p);
/**
- * disable - Disable BPF scheduling for a task
+ * @disable: Disable BPF scheduling for a task
* @p: task to disable BPF scheduling for
*
* @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
@@ -501,7 +543,7 @@ struct sched_ext_ops {
void (*disable)(struct task_struct *p);
/**
- * dump - Dump BPF scheduler state on error
+ * @dump: Dump BPF scheduler state on error
* @ctx: debug dump context
*
* Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
@@ -509,7 +551,7 @@ struct sched_ext_ops {
void (*dump)(struct scx_dump_ctx *ctx);
/**
- * dump_cpu - Dump BPF scheduler state for a CPU on error
+ * @dump_cpu: Dump BPF scheduler state for a CPU on error
* @ctx: debug dump context
* @cpu: CPU to generate debug dump for
* @idle: @cpu is currently idle without any runnable tasks
@@ -521,7 +563,7 @@ struct sched_ext_ops {
void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
/**
- * dump_task - Dump BPF scheduler state for a runnable task on error
+ * @dump_task: Dump BPF scheduler state for a runnable task on error
* @ctx: debug dump context
* @p: runnable task to generate debug dump for
*
@@ -532,7 +574,7 @@ struct sched_ext_ops {
#ifdef CONFIG_EXT_GROUP_SCHED
/**
- * cgroup_init - Initialize a cgroup
+ * @cgroup_init: Initialize a cgroup
* @cgrp: cgroup being initialized
* @args: init arguments, see the struct definition
*
@@ -547,7 +589,7 @@ struct sched_ext_ops {
struct scx_cgroup_init_args *args);
/**
- * cgroup_exit - Exit a cgroup
+ * @cgroup_exit: Exit a cgroup
* @cgrp: cgroup being exited
*
* Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
@@ -556,7 +598,7 @@ struct sched_ext_ops {
void (*cgroup_exit)(struct cgroup *cgrp);
/**
- * cgroup_prep_move - Prepare a task to be moved to a different cgroup
+ * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
* @p: task being moved
* @from: cgroup @p is being moved from
* @to: cgroup @p is being moved to
@@ -571,7 +613,7 @@ struct sched_ext_ops {
struct cgroup *from, struct cgroup *to);
/**
- * cgroup_move - Commit cgroup move
+ * @cgroup_move: Commit cgroup move
* @p: task being moved
* @from: cgroup @p is being moved from
* @to: cgroup @p is being moved to
@@ -582,7 +624,7 @@ struct sched_ext_ops {
struct cgroup *from, struct cgroup *to);
/**
- * cgroup_cancel_move - Cancel cgroup move
+ * @cgroup_cancel_move: Cancel cgroup move
* @p: task whose cgroup move is being canceled
* @from: cgroup @p was being moved from
* @to: cgroup @p was being moved to
@@ -594,7 +636,7 @@ struct sched_ext_ops {
struct cgroup *from, struct cgroup *to);
/**
- * cgroup_set_weight - A cgroup's weight is being changed
+ * @cgroup_set_weight: A cgroup's weight is being changed
* @cgrp: cgroup whose weight is being updated
* @weight: new weight [1..10000]
*
@@ -608,7 +650,7 @@ struct sched_ext_ops {
*/
/**
- * cpu_online - A CPU became online
+ * @cpu_online: A CPU became online
* @cpu: CPU which just came up
*
* @cpu just came online. @cpu will not call ops.enqueue() or
@@ -617,7 +659,7 @@ struct sched_ext_ops {
void (*cpu_online)(s32 cpu);
/**
- * cpu_offline - A CPU is going offline
+ * @cpu_offline: A CPU is going offline
* @cpu: CPU which is going offline
*
* @cpu is going offline. @cpu will not call ops.enqueue() or
@@ -630,12 +672,12 @@ struct sched_ext_ops {
*/
/**
- * init - Initialize the BPF scheduler
+ * @init: Initialize the BPF scheduler
*/
s32 (*init)(void);
/**
- * exit - Clean up after the BPF scheduler
+ * @exit: Clean up after the BPF scheduler
* @info: Exit info
*
* ops.exit() is also called on ops.init() failure, which is a bit
@@ -645,17 +687,17 @@ struct sched_ext_ops {
void (*exit)(struct scx_exit_info *info);
/**
- * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch
+ * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
*/
u32 dispatch_max_batch;
/**
- * flags - %SCX_OPS_* flags
+ * @flags: %SCX_OPS_* flags
*/
u64 flags;
/**
- * timeout_ms - The maximum amount of time, in milliseconds, that a
+ * @timeout_ms: The maximum amount of time, in milliseconds, that a
* runnable task should be able to wait before being scheduled. The
* maximum timeout may not exceed the default timeout of 30 seconds.
*
@@ -664,13 +706,13 @@ struct sched_ext_ops {
u32 timeout_ms;
/**
- * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default
+ * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
* value of 32768 is used.
*/
u32 exit_dump_len;
/**
- * hotplug_seq - A sequence number that may be set by the scheduler to
+ * @hotplug_seq: A sequence number that may be set by the scheduler to
* detect when a hotplug event has occurred during the loading process.
* If 0, no detection occurs. Otherwise, the scheduler will fail to
* load if the sequence number does not match @scx_hotplug_seq on the
@@ -679,7 +721,7 @@ struct sched_ext_ops {
u64 hotplug_seq;
/**
- * name - BPF scheduler's name
+ * @name: BPF scheduler's name
*
* Must be a non-zero valid BPF object name including only isalnum(),
* '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
@@ -764,6 +806,7 @@ enum scx_deq_flags {
enum scx_pick_idle_cpu_flags {
SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+ SCX_PICK_IDLE_IN_NODE = 1LLU << 1, /* pick a CPU in the same target NUMA node */
};
enum scx_kick_flags {
@@ -879,15 +922,11 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
static struct sched_ext_ops scx_ops;
static bool scx_warned_zero_slice;
+DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
-static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
-
-#ifdef CONFIG_SMP
-static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
-static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
-#endif
static struct static_key_false scx_has_op[SCX_OPI_END] =
{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
@@ -922,21 +961,6 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
static struct delayed_work scx_watchdog_work;
-/* idle tracking */
-#ifdef CONFIG_SMP
-#ifdef CONFIG_CPUMASK_OFFSTACK
-#define CL_ALIGNED_IF_ONSTACK
-#else
-#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
-#endif
-
-static struct {
- cpumask_var_t cpu;
- cpumask_var_t smt;
-} idle_masks CL_ALIGNED_IF_ONSTACK;
-
-#endif /* CONFIG_SMP */
-
/* for %SCX_KICK_WAIT */
static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
@@ -960,7 +984,7 @@ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
static struct scx_dispatch_q **global_dsqs;
static const struct rhashtable_params dsq_hash_params = {
- .key_len = 8,
+ .key_len = sizeof_field(struct scx_dispatch_q, id),
.key_offset = offsetof(struct scx_dispatch_q, id),
.head_offset = offsetof(struct scx_dispatch_q, hash_node),
};
@@ -1213,7 +1237,7 @@ static bool scx_kf_allowed_if_unlocked(void)
/**
* nldsq_next_task - Iterate to the next task in a non-local DSQ
- * @dsq: user dsq being interated
+ * @dsq: user dsq being iterated
* @cur: current position, %NULL to start iteration
* @rev: walk backwards
*
@@ -1408,7 +1432,6 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
/**
* scx_task_iter_next_locked - Next non-idle task with its rq locked
* @iter: iterator to walk
- * @include_dead: Whether we should include dead tasks in the iteration
*
* Visit the non-idle task with its rq lock held. Allows callers to specify
* whether they would like to filter out dead tasks. See scx_task_iter_start()
@@ -1458,6 +1481,117 @@ static struct task_struct *scx_task_iter_next_locked(struct scx_task_iter *iter)
return p;
}
+/*
+ * Collection of event counters. Event types are placed in descending order.
+ */
+struct scx_event_stats {
+ /*
+ * If ops.select_cpu() returns a CPU which can't be used by the task,
+ * the core scheduler code silently picks a fallback CPU.
+ */
+ s64 SCX_EV_SELECT_CPU_FALLBACK;
+
+ /*
+ * When dispatching to a local DSQ, the CPU may have gone offline in
+ * the meantime. In this case, the task is bounced to the global DSQ.
+ */
+ s64 SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE;
+
+ /*
+ * If SCX_OPS_ENQ_LAST is not set, the number of times that a task
+ * continued to run because there were no other tasks on the CPU.
+ */
+ s64 SCX_EV_DISPATCH_KEEP_LAST;
+
+ /*
+ * If SCX_OPS_ENQ_EXITING is not set, the number of times that a task
+ * is dispatched to a local DSQ when exiting.
+ */
+ s64 SCX_EV_ENQ_SKIP_EXITING;
+
+ /*
+ * If SCX_OPS_ENQ_MIGRATION_DISABLED is not set, the number of times a
+ * migration disabled task skips ops.enqueue() and is dispatched to its
+ * local DSQ.
+ */
+ s64 SCX_EV_ENQ_SKIP_MIGRATION_DISABLED;
+
+ /*
+ * The total number of tasks enqueued (or pick_task-ed) with a
+ * default time slice (SCX_SLICE_DFL).
+ */
+ s64 SCX_EV_ENQ_SLICE_DFL;
+
+ /*
+ * The total duration of bypass modes in nanoseconds.
+ */
+ s64 SCX_EV_BYPASS_DURATION;
+
+ /*
+ * The number of tasks dispatched in the bypassing mode.
+ */
+ s64 SCX_EV_BYPASS_DISPATCH;
+
+ /*
+ * The number of times the bypassing mode has been activated.
+ */
+ s64 SCX_EV_BYPASS_ACTIVATE;
+};
+
+/*
+ * The event counter is organized by a per-CPU variable to minimize the
+ * accounting overhead without synchronization. A system-wide view on the
+ * event counter is constructed when requested by scx_bpf_get_event_stat().
+ */
+static DEFINE_PER_CPU(struct scx_event_stats, event_stats_cpu);
+
+/**
+ * scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of the event occured
+ *
+ * This can be used when preemption is not disabled.
+ */
+#define scx_add_event(name, cnt) do { \
+ this_cpu_add(event_stats_cpu.name, cnt); \
+ trace_sched_ext_event(#name, cnt); \
+} while(0)
+
+/**
+ * __scx_add_event - Increase an event counter for 'name' by 'cnt'
+ * @name: an event name defined in struct scx_event_stats
+ * @cnt: the number of the event occured
+ *
+ * This should be used only when preemption is disabled.
+ */
+#define __scx_add_event(name, cnt) do { \
+ __this_cpu_add(event_stats_cpu.name, cnt); \
+ trace_sched_ext_event(#name, cnt); \
+} while(0)
+
+/**
+ * scx_agg_event - Aggregate an event counter 'kind' from 'src_e' to 'dst_e'
+ * @dst_e: destination event stats
+ * @src_e: source event stats
+ * @kind: a kind of event to be aggregated
+ */
+#define scx_agg_event(dst_e, src_e, kind) do { \
+ (dst_e)->kind += READ_ONCE((src_e)->kind); \
+} while(0)
+
+/**
+ * scx_dump_event - Dump an event 'kind' in 'events' to 's'
+ * @s: output seq_buf
+ * @events: event stats
+ * @kind: a kind of event to dump
+ */
+#define scx_dump_event(s, events, kind) do { \
+ dump_line(&(s), "%40s: %16lld", #kind, (events)->kind); \
+} while (0)
+
+
+static void scx_bpf_events(struct scx_event_stats *events, size_t events__sz);
+
static enum scx_ops_enable_state scx_ops_enable_state(void)
{
return atomic_read(&scx_ops_enable_state_var);
@@ -2003,16 +2137,27 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
if (!scx_rq_online(rq))
goto local;
- if (scx_rq_bypassing(rq))
+ if (scx_rq_bypassing(rq)) {
+ __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
goto global;
+ }
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
goto direct;
/* see %SCX_OPS_ENQ_EXITING */
if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
- unlikely(p->flags & PF_EXITING))
+ unlikely(p->flags & PF_EXITING)) {
+ __scx_add_event(SCX_EV_ENQ_SKIP_EXITING, 1);
+ goto local;
+ }
+
+ /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
+ if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+ is_migration_disabled(p)) {
+ __scx_add_event(SCX_EV_ENQ_SKIP_MIGRATION_DISABLED, 1);
goto local;
+ }
if (!SCX_HAS_OP(enqueue))
goto global;
@@ -2052,6 +2197,7 @@ local:
*/
touch_core_sched(rq, p);
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
local_norefill:
dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
return;
@@ -2059,6 +2205,7 @@ local_norefill:
global:
touch_core_sched(rq, p); /* see the comment in local: */
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
dispatch_enqueue(find_global_dsq(p), p, enq_flags);
}
@@ -2078,7 +2225,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
/*
* list_add_tail() must be used. scx_ops_bypass() depends on tasks being
- * appened to the runnable_list.
+ * appended to the runnable_list.
*/
list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
}
@@ -2130,6 +2277,10 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
do_enqueue_task(rq, p, enq_flags, sticky_cpu);
out:
rq->scx.flags &= ~SCX_RQ_IN_WAKEUP;
+
+ if ((enq_flags & SCX_ENQ_CPU_SELECTED) &&
+ unlikely(cpu_of(rq) != p->scx.selected_cpu))
+ __scx_add_event(SCX_EV_SELECT_CPU_FALLBACK, 1);
}
static void ops_dequeue(struct task_struct *p, u64 deq_flags)
@@ -2313,12 +2464,35 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
*
* - The BPF scheduler is bypassed while the rq is offline and we can always say
* no to the BPF scheduler initiated migrations while offline.
+ *
+ * The caller must ensure that @p and @rq are on different CPUs.
*/
static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
- bool trigger_error)
+ bool enforce)
{
int cpu = cpu_of(rq);
+ WARN_ON_ONCE(task_cpu(p) == cpu);
+
+ /*
+ * If @p has migration disabled, @p->cpus_ptr is updated to contain only
+ * the pinned CPU in migrate_disable_switch() while @p is being switched
+ * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
+ * updated and thus another CPU may see @p on a DSQ inbetween leading to
+ * @p passing the below task_allowed_on_cpu() check while migration is
+ * disabled.
+ *
+ * Test the migration disabled state first as the race window is narrow
+ * and the BPF scheduler failing to check migration disabled state can
+ * easily be masked if task_allowed_on_cpu() is done first.
+ */
+ if (unlikely(is_migration_disabled(p))) {
+ if (enforce)
+ scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+ p->comm, p->pid, task_cpu(p), cpu);
+ return false;
+ }
+
/*
* We don't require the BPF scheduler to avoid dispatching to offline
* CPUs mostly for convenience but also because CPUs can go offline
@@ -2326,17 +2500,17 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
* picked CPU is outside the allowed mask.
*/
if (!task_allowed_on_cpu(p, cpu)) {
- if (trigger_error)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
- cpu_of(rq), p->comm, p->pid);
+ if (enforce)
+ scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+ cpu, p->comm, p->pid);
return false;
}
- if (unlikely(is_migration_disabled(p)))
- return false;
-
- if (!scx_rq_online(rq))
+ if (!scx_rq_online(rq)) {
+ if (enforce)
+ __scx_add_event(SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE, 1);
return false;
+ }
return true;
}
@@ -2406,7 +2580,7 @@ static bool consume_remote_task(struct rq *this_rq, struct task_struct *p,
}
#else /* CONFIG_SMP */
static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); }
-static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; }
+static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool enforce) { return false; }
static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; }
#endif /* CONFIG_SMP */
@@ -2437,7 +2611,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
if (dst_dsq->id == SCX_DSQ_LOCAL) {
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
- if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
+ if (src_rq != dst_rq &&
+ unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
dst_dsq = find_global_dsq(p);
dst_rq = src_rq;
}
@@ -2480,7 +2655,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
/*
* A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
* banging on the same DSQ on a large NUMA system to the point where switching
- * to the bypass mode can take a long time. Inject artifical delays while the
+ * to the bypass mode can take a long time. Inject artificial delays while the
* bypass mode is switching to guarantee timely completion.
*/
static void scx_ops_breather(struct rq *rq)
@@ -2575,6 +2750,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
{
struct rq *src_rq = task_rq(p);
struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+#ifdef CONFIG_SMP
+ struct rq *locked_rq = rq;
+#endif
/*
* We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2588,7 +2766,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
}
#ifdef CONFIG_SMP
- if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+ if (src_rq != dst_rq &&
+ unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
dispatch_enqueue(find_global_dsq(p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
@@ -2611,8 +2790,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
/* switch to @src_rq lock */
- if (rq != src_rq) {
- raw_spin_rq_unlock(rq);
+ if (locked_rq != src_rq) {
+ raw_spin_rq_unlock(locked_rq);
+ locked_rq = src_rq;
raw_spin_rq_lock(src_rq);
}
@@ -2630,6 +2810,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
} else {
move_remote_task_to_local_dsq(p, enq_flags,
src_rq, dst_rq);
+ /* task has been moved to dst_rq, which is now locked */
+ locked_rq = dst_rq;
}
/* if the destination CPU is idle, wake it up */
@@ -2638,8 +2820,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
}
/* switch back to @rq lock */
- if (rq != dst_rq) {
- raw_spin_rq_unlock(dst_rq);
+ if (locked_rq != rq) {
+ raw_spin_rq_unlock(locked_rq);
raw_spin_rq_lock(rq);
}
#else /* CONFIG_SMP */
@@ -2845,6 +3027,7 @@ no_tasks:
if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ __scx_add_event(SCX_EV_DISPATCH_KEEP_LAST, 1);
goto has_tasks;
}
rq->scx.flags &= ~SCX_RQ_IN_BALANCE;
@@ -3069,7 +3252,6 @@ static struct task_struct *pick_task_scx(struct rq *rq)
{
struct task_struct *prev = rq->curr;
struct task_struct *p;
- bool prev_on_scx = prev->sched_class == &ext_sched_class;
bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
bool kick_idle = false;
@@ -3089,14 +3271,18 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* if pick_task_scx() is called without preceding balance_scx().
*/
if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
- if (prev_on_scx) {
+ if (prev->scx.flags & SCX_TASK_QUEUED) {
keep_prev = true;
} else {
keep_prev = false;
kick_idle = true;
}
- } else if (unlikely(keep_prev && !prev_on_scx)) {
- /* only allowed during transitions */
+ } else if (unlikely(keep_prev &&
+ prev->sched_class != &ext_sched_class)) {
+ /*
+ * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
+ * conditional on scx_enabled() and may have been skipped.
+ */
WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
keep_prev = false;
}
@@ -3108,8 +3294,10 @@ static struct task_struct *pick_task_scx(struct rq *rq)
*/
if (keep_prev) {
p = prev;
- if (!p->scx.slice)
+ if (!p->scx.slice) {
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ }
} else {
p = first_local_task(rq);
if (!p) {
@@ -3125,6 +3313,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
scx_warned_zero_slice = true;
}
p->scx.slice = SCX_SLICE_DFL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
}
}
@@ -3136,6 +3325,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* scx_prio_less - Task ordering for core-sched
* @a: task A
* @b: task B
+ * @in_fi: in forced idle state
*
* Core-sched is implemented as an additional scheduling layer on top of the
* usual sched_class'es and needs to find out the expected task ordering. For
@@ -3143,7 +3333,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
*
* Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
* to implement the default task ordering. The older the timestamp, the higher
- * prority the task - the global FIFO ordering matching the default scheduling
+ * priority the task - the global FIFO ordering matching the default scheduling
* behavior.
*
* When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
@@ -3168,354 +3358,10 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
#ifdef CONFIG_SMP
-static bool test_and_clear_cpu_idle(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- /*
- * SMT mask should be cleared whether we can claim @cpu or not. The SMT
- * cluster is not wholly idle either way. This also prevents
- * scx_pick_idle_cpu() from getting caught in an infinite loop.
- */
- if (sched_smt_active()) {
- const struct cpumask *smt = cpu_smt_mask(cpu);
-
- /*
- * If offline, @cpu is not its own sibling and
- * scx_pick_idle_cpu() can get caught in an infinite loop as
- * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
- * is eventually cleared.
- */
- if (cpumask_intersects(smt, idle_masks.smt))
- cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
- else if (cpumask_test_cpu(cpu, idle_masks.smt))
- __cpumask_clear_cpu(cpu, idle_masks.smt);
- }
-#endif
- return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
-}
-
-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
-{
- int cpu;
-
-retry:
- if (sched_smt_active()) {
- cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
- if (cpu < nr_cpu_ids)
- goto found;
-
- if (flags & SCX_PICK_IDLE_CORE)
- return -EBUSY;
- }
-
- cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
- if (cpu >= nr_cpu_ids)
- return -EBUSY;
-
-found:
- if (test_and_clear_cpu_idle(cpu))
- return cpu;
- else
- goto retry;
-}
-
-/*
- * Return true if the LLC domains do not perfectly overlap with the NUMA
- * domains, false otherwise.
- */
-static bool llc_numa_mismatch(void)
-{
- int cpu;
-
- /*
- * We need to scan all online CPUs to verify whether their scheduling
- * domains overlap.
- *
- * While it is rare to encounter architectures with asymmetric NUMA
- * topologies, CPU hotplugging or virtualized environments can result
- * in asymmetric configurations.
- *
- * For example:
- *
- * NUMA 0:
- * - LLC 0: cpu0..cpu7
- * - LLC 1: cpu8..cpu15 [offline]
- *
- * NUMA 1:
- * - LLC 0: cpu16..cpu23
- * - LLC 1: cpu24..cpu31
- *
- * In this case, if we only check the first online CPU (cpu0), we might
- * incorrectly assume that the LLC and NUMA domains are fully
- * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
- * domains).
- */
- for_each_online_cpu(cpu) {
- const struct cpumask *numa_cpus;
- struct sched_domain *sd;
-
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd)
- return true;
-
- numa_cpus = cpumask_of_node(cpu_to_node(cpu));
- if (sd->span_weight != cpumask_weight(numa_cpus))
- return true;
- }
-
- return false;
-}
-
-/*
- * Initialize topology-aware scheduling.
- *
- * Detect if the system has multiple LLC or multiple NUMA domains and enable
- * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
- * selection policy.
- *
- * Assumption: the kernel's internal topology representation assumes that each
- * CPU belongs to a single LLC domain, and that each LLC domain is entirely
- * contained within a single NUMA node.
- */
-static void update_selcpu_topology(void)
-{
- bool enable_llc = false, enable_numa = false;
- struct sched_domain *sd;
- const struct cpumask *cpus;
- s32 cpu = cpumask_first(cpu_online_mask);
-
- /*
- * Enable LLC domain optimization only when there are multiple LLC
- * domains among the online CPUs. If all online CPUs are part of a
- * single LLC domain, the idle CPU selection logic can choose any
- * online CPU without bias.
- *
- * Note that it is sufficient to check the LLC domain of the first
- * online CPU to determine whether a single LLC domain includes all
- * CPUs.
- */
- rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (sd) {
- if (sd->span_weight < num_online_cpus())
- enable_llc = true;
- }
-
- /*
- * Enable NUMA optimization only when there are multiple NUMA domains
- * among the online CPUs and the NUMA domains don't perfectly overlaps
- * with the LLC domains.
- *
- * If all CPUs belong to the same NUMA node and the same LLC domain,
- * enabling both NUMA and LLC optimizations is unnecessary, as checking
- * for an idle CPU in the same domain twice is redundant.
- */
- cpus = cpumask_of_node(cpu_to_node(cpu));
- if ((cpumask_weight(cpus) < num_online_cpus()) && llc_numa_mismatch())
- enable_numa = true;
- rcu_read_unlock();
-
- pr_debug("sched_ext: LLC idle selection %s\n",
- enable_llc ? "enabled" : "disabled");
- pr_debug("sched_ext: NUMA idle selection %s\n",
- enable_numa ? "enabled" : "disabled");
-
- if (enable_llc)
- static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
- else
- static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
- if (enable_numa)
- static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
- else
- static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
-}
-
-/*
- * Built-in CPU idle selection policy:
- *
- * 1. Prioritize full-idle cores:
- * - always prioritize CPUs from fully idle cores (both logical CPUs are
- * idle) to avoid interference caused by SMT.
- *
- * 2. Reuse the same CPU:
- * - prefer the last used CPU to take advantage of cached data (L1, L2) and
- * branch prediction optimizations.
- *
- * 3. Pick a CPU within the same LLC (Last-Level Cache):
- * - if the above conditions aren't met, pick a CPU that shares the same LLC
- * to maintain cache locality.
- *
- * 4. Pick a CPU within the same NUMA node, if enabled:
- * - choose a CPU from the same NUMA node to reduce memory access latency.
- *
- * Step 3 and 4 are performed only if the system has, respectively, multiple
- * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
- * scx_selcpu_topo_numa).
- *
- * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
- * we never call ops.select_cpu() for them, see select_task_rq().
- */
-static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
- u64 wake_flags, bool *found)
-{
- const struct cpumask *llc_cpus = NULL;
- const struct cpumask *numa_cpus = NULL;
- s32 cpu;
-
- *found = false;
-
-
- /*
- * This is necessary to protect llc_cpus.
- */
- rcu_read_lock();
-
- /*
- * Determine the scheduling domain only if the task is allowed to run
- * on all CPUs.
- *
- * This is done primarily for efficiency, as it avoids the overhead of
- * updating a cpumask every time we need to select an idle CPU (which
- * can be costly in large SMP systems), but it also aligns logically:
- * if a task's scheduling domain is restricted by user-space (through
- * CPU affinity), the task will simply use the flat scheduling domain
- * defined by user-space.
- */
- if (p->nr_cpus_allowed >= num_possible_cpus()) {
- if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));
-
- if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
- struct sched_domain *sd;
-
- sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
- if (sd)
- llc_cpus = sched_domain_span(sd);
- }
- }
-
- /*
- * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
- */
- if (wake_flags & SCX_WAKE_SYNC) {
- cpu = smp_processor_id();
-
- /*
- * If the waker's CPU is cache affine and prev_cpu is idle,
- * then avoid a migration.
- */
- if (cpus_share_cache(cpu, prev_cpu) &&
- test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- }
-
- /*
- * If the waker's local DSQ is empty, and the system is under
- * utilized, try to wake up @p to the local DSQ of the waker.
- *
- * Checking only for an empty local DSQ is insufficient as it
- * could give the wakee an unfair advantage when the system is
- * oversaturated.
- *
- * Checking only for the presence of idle CPUs is also
- * insufficient as the local DSQ of the waker could have tasks
- * piled up on it even if there is an idle core elsewhere on
- * the system.
- */
- if (!cpumask_empty(idle_masks.cpu) &&
- !(current->flags & PF_EXITING) &&
- cpu_rq(cpu)->scx.local_dsq.nr == 0) {
- if (cpumask_test_cpu(cpu, p->cpus_ptr))
- goto cpu_found;
- }
- }
-
- /*
- * If CPU has SMT, any wholly idle CPU is likely a better pick than
- * partially idle @prev_cpu.
- */
- if (sched_smt_active()) {
- /*
- * Keep using @prev_cpu if it's part of a fully idle core.
- */
- if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
- test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- }
-
- /*
- * Search for any fully idle core in the same LLC domain.
- */
- if (llc_cpus) {
- cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any fully idle core in the same NUMA node.
- */
- if (numa_cpus) {
- cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any full idle core usable by the task.
- */
- cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Use @prev_cpu if it's idle.
- */
- if (test_and_clear_cpu_idle(prev_cpu)) {
- cpu = prev_cpu;
- goto cpu_found;
- }
-
- /*
- * Search for any idle CPU in the same LLC domain.
- */
- if (llc_cpus) {
- cpu = scx_pick_idle_cpu(llc_cpus, 0);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any idle CPU in the same NUMA node.
- */
- if (numa_cpus) {
- cpu = scx_pick_idle_cpu(numa_cpus, 0);
- if (cpu >= 0)
- goto cpu_found;
- }
-
- /*
- * Search for any idle CPU usable by the task.
- */
- cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
- if (cpu >= 0)
- goto cpu_found;
-
- rcu_read_unlock();
- return prev_cpu;
-
-cpu_found:
- rcu_read_unlock();
-
- *found = true;
- return cpu;
-}
-
static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
{
+ bool rq_bypass;
+
/*
* sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
* can be a good migration opportunity with low cache and memory
@@ -3529,7 +3375,8 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
if (unlikely(wake_flags & WF_EXEC))
return prev_cpu;
- if (SCX_HAS_OP(select_cpu) && !scx_rq_bypassing(task_rq(p))) {
+ rq_bypass = scx_rq_bypassing(task_rq(p));
+ if (SCX_HAS_OP(select_cpu) && !rq_bypass) {
s32 cpu;
struct task_struct **ddsp_taskp;
@@ -3539,20 +3386,27 @@ static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flag
cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
select_cpu, p, prev_cpu, wake_flags);
+ p->scx.selected_cpu = cpu;
*ddsp_taskp = NULL;
if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
return cpu;
else
return prev_cpu;
} else {
- bool found;
s32 cpu;
- cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
- if (found) {
+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, 0);
+ if (cpu >= 0) {
p->scx.slice = SCX_SLICE_DFL;
p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
+ __scx_add_event(SCX_EV_ENQ_SLICE_DFL, 1);
+ } else {
+ cpu = prev_cpu;
}
+ p->scx.selected_cpu = cpu;
+
+ if (rq_bypass)
+ __scx_add_event(SCX_EV_BYPASS_DISPATCH, 1);
return cpu;
}
}
@@ -3580,95 +3434,6 @@ static void set_cpus_allowed_scx(struct task_struct *p,
(struct cpumask *)p->cpus_ptr);
}
-static void reset_idle_masks(void)
-{
- /*
- * Consider all online cpus idle. Should converge to the actual state
- * quickly.
- */
- cpumask_copy(idle_masks.cpu, cpu_online_mask);
- cpumask_copy(idle_masks.smt, cpu_online_mask);
-}
-
-static void update_builtin_idle(int cpu, bool idle)
-{
- if (idle)
- cpumask_set_cpu(cpu, idle_masks.cpu);
- else
- cpumask_clear_cpu(cpu, idle_masks.cpu);
-
-#ifdef CONFIG_SCHED_SMT
- if (sched_smt_active()) {
- const struct cpumask *smt = cpu_smt_mask(cpu);
-
- if (idle) {
- /*
- * idle_masks.smt handling is racy but that's fine as
- * it's only for optimization and self-correcting.
- */
- for_each_cpu(cpu, smt) {
- if (!cpumask_test_cpu(cpu, idle_masks.cpu))
- return;
- }
- cpumask_or(idle_masks.smt, idle_masks.smt, smt);
- } else {
- cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
- }
- }
-#endif
-}
-
-/*
- * Update the idle state of a CPU to @idle.
- *
- * If @do_notify is true, ops.update_idle() is invoked to notify the scx
- * scheduler of an actual idle state transition (idle to busy or vice
- * versa). If @do_notify is false, only the idle state in the idle masks is
- * refreshed without invoking ops.update_idle().
- *
- * This distinction is necessary, because an idle CPU can be "reserved" and
- * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
- * busy even if no tasks are dispatched. In this case, the CPU may return
- * to idle without a true state transition. Refreshing the idle masks
- * without invoking ops.update_idle() ensures accurate idle state tracking
- * while avoiding unnecessary updates and maintaining balanced state
- * transitions.
- */
-void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
-{
- int cpu = cpu_of(rq);
-
- lockdep_assert_rq_held(rq);
-
- /*
- * Trigger ops.update_idle() only when transitioning from a task to
- * the idle thread and vice versa.
- *
- * Idle transitions are indicated by do_notify being set to true,
- * managed by put_prev_task_idle()/set_next_task_idle().
- */
- if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
-
- /*
- * Update the idle masks:
- * - for real idle transitions (do_notify == true)
- * - for idle-to-idle transitions (indicated by the previous task
- * being the idle thread, managed by pick_task_idle())
- *
- * Skip updating idle masks if the previous task is not the idle
- * thread, since set_next_task_idle() has already handled it when
- * transitioning from a task to the idle thread (calling this
- * function with do_notify == true).
- *
- * In this way we can avoid updating the idle masks twice,
- * unnecessarily.
- */
- if (static_branch_likely(&scx_builtin_idle_enabled))
- if (do_notify || is_idle_task(rq->curr))
- update_builtin_idle(cpu, idle);
-}
-
static void handle_hotplug(struct rq *rq, bool online)
{
int cpu = cpu_of(rq);
@@ -3676,7 +3441,7 @@ static void handle_hotplug(struct rq *rq, bool online)
atomic_long_inc(&scx_hotplug_seq);
if (scx_enabled())
- update_selcpu_topology();
+ scx_idle_update_selcpu_topology(&scx_ops);
if (online && SCX_HAS_OP(cpu_online))
SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
@@ -3708,12 +3473,6 @@ static void rq_offline_scx(struct rq *rq)
rq->scx.flags &= ~SCX_RQ_ONLINE;
}
-#else /* CONFIG_SMP */
-
-static bool test_and_clear_cpu_idle(int cpu) { return false; }
-static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; }
-static void reset_idle_masks(void) {}
-
#endif /* CONFIG_SMP */
static bool check_rq_for_timeouts(struct rq *rq)
@@ -3791,7 +3550,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
curr->scx.slice = 0;
touch_core_sched(rq, curr);
} else if (SCX_HAS_OP(tick)) {
- SCX_CALL_OP(SCX_KF_REST, tick, curr);
+ SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
}
if (!curr->scx.slice)
@@ -3938,7 +3697,7 @@ static void scx_ops_disable_task(struct task_struct *p)
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
if (SCX_HAS_OP(disable))
- SCX_CALL_OP(SCX_KF_REST, disable, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
scx_set_task_state(p, SCX_TASK_READY);
}
@@ -3967,7 +3726,7 @@ static void scx_ops_exit_task(struct task_struct *p)
}
if (SCX_HAS_OP(exit_task))
- SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
+ SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -4263,25 +4022,12 @@ err:
return ops_sanitize_err("cgroup_prep_move", ret);
}
-void scx_move_task(struct task_struct *p)
+void scx_cgroup_move_task(struct task_struct *p)
{
if (!scx_cgroup_enabled)
return;
/*
- * We're called from sched_move_task() which handles both cgroup and
- * autogroup moves. Ignore the latter.
- *
- * Also ignore exiting tasks, because in the exit path tasks transition
- * from the autogroup to the root group, so task_group_is_autogroup()
- * alone isn't able to catch exiting autogroup tasks. This is safe for
- * cgroup_move(), because cgroup migrations never happen for PF_EXITING
- * tasks.
- */
- if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
- return;
-
- /*
* @p must have ops.cgroup_prep_move() called on it and thus
* cgrp_moving_from set.
*/
@@ -4530,7 +4276,7 @@ static int scx_cgroup_init(void)
cgroup_warned_missing_idle = false;
/*
- * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
* cgroups and init, all online cgroups are initialized.
*/
rcu_read_lock();
@@ -4651,8 +4397,33 @@ static ssize_t scx_attr_ops_show(struct kobject *kobj,
}
SCX_ATTR(ops);
+#define scx_attr_event_show(buf, at, events, kind) ({ \
+ sysfs_emit_at(buf, at, "%s %llu\n", #kind, (events)->kind); \
+})
+
+static ssize_t scx_attr_events_show(struct kobject *kobj,
+ struct kobj_attribute *ka, char *buf)
+{
+ struct scx_event_stats events;
+ int at = 0;
+
+ scx_bpf_events(&events, sizeof(events));
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_SELECT_CPU_FALLBACK);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_DISPATCH_KEEP_LAST);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_EXITING);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_ENQ_SLICE_DFL);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DURATION);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_DISPATCH);
+ at += scx_attr_event_show(buf, at, &events, SCX_EV_BYPASS_ACTIVATE);
+ return at;
+}
+SCX_ATTR(events);
+
static struct attribute *scx_sched_attrs[] = {
&scx_attr_ops.attr,
+ &scx_attr_events.attr,
NULL,
};
ATTRIBUTE_GROUPS(scx_sched);
@@ -4688,6 +4459,7 @@ bool task_should_scx(int policy)
/**
* scx_softlockup - sched_ext softlockup handler
+ * @dur_s: number of seconds of CPU stuck due to soft lockup
*
* On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
* live-lock the system by making many CPUs target the same DSQ to the point
@@ -4731,6 +4503,7 @@ static void scx_clear_softlockup(void)
/**
* scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
+ * @bypass: true for bypass, false for unbypass
*
* Bypassing guarantees that all runnable tasks make forward progress without
* trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might
@@ -4762,6 +4535,8 @@ static void scx_clear_softlockup(void)
static void scx_ops_bypass(bool bypass)
{
static DEFINE_RAW_SPINLOCK(bypass_lock);
+ static unsigned long bypass_timestamp;
+
int cpu;
unsigned long flags;
@@ -4771,11 +4546,15 @@ static void scx_ops_bypass(bool bypass)
WARN_ON_ONCE(scx_ops_bypass_depth <= 0);
if (scx_ops_bypass_depth != 1)
goto unlock;
+ bypass_timestamp = ktime_get_ns();
+ scx_add_event(SCX_EV_BYPASS_ACTIVATE, 1);
} else {
scx_ops_bypass_depth--;
WARN_ON_ONCE(scx_ops_bypass_depth < 0);
if (scx_ops_bypass_depth != 0)
goto unlock;
+ scx_add_event(SCX_EV_BYPASS_DURATION,
+ ktime_get_ns() - bypass_timestamp);
}
atomic_inc(&scx_ops_breather_depth);
@@ -4899,7 +4678,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
struct task_struct *p;
struct rhashtable_iter rht_iter;
struct scx_dispatch_q *dsq;
- int i, kind;
+ int i, kind, cpu;
kind = atomic_read(&scx_exit_kind);
while (true) {
@@ -4982,14 +4761,25 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
+ /*
+ * Invalidate all the rq clocks to prevent getting outdated
+ * rq clocks from a previous scx scheduler.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ scx_rq_clock_invalidate(rq);
+ }
+
/* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_ops_enabled);
for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
static_branch_disable(&scx_has_op[i]);
+ static_branch_disable(&scx_ops_allow_queued_wakeup);
static_branch_disable(&scx_ops_enq_last);
static_branch_disable(&scx_ops_enq_exiting);
+ static_branch_disable(&scx_ops_enq_migration_disabled);
static_branch_disable(&scx_ops_cpu_preempt);
- static_branch_disable(&scx_builtin_idle_enabled);
+ scx_idle_disable();
synchronize_rcu();
if (ei->kind >= SCX_EXIT_ERROR) {
@@ -5206,9 +4996,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
ops_state >> SCX_OPSS_QSEQ_SHIFT);
- dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu",
- p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
- p->scx.dsq_vtime);
+ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
+ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+ dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
+ p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
if (SCX_HAS_OP(dump_task)) {
@@ -5238,6 +5029,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
.at_jiffies = jiffies,
};
struct seq_buf s;
+ struct scx_event_stats events;
unsigned long flags;
char *buf;
int cpu;
@@ -5346,6 +5138,21 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
rq_unlock(rq, &rf);
}
+ dump_newline(&s);
+ dump_line(&s, "Event counters");
+ dump_line(&s, "--------------");
+
+ scx_bpf_events(&events, sizeof(events));
+ scx_dump_event(s, &events, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_dump_event(s, &events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_dump_event(s, &events, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_EXITING);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_dump_event(s, &events, SCX_EV_ENQ_SLICE_DFL);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_DURATION);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_DISPATCH);
+ scx_dump_event(s, &events, SCX_EV_BYPASS_ACTIVATE);
+
if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
memcpy(ei->dump + dump_len - sizeof(trunc_marker),
trunc_marker, sizeof(trunc_marker));
@@ -5399,7 +5206,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name)
{
struct kthread_worker *helper;
- helper = kthread_create_worker(0, name);
+ helper = kthread_run_worker(0, name);
if (helper)
sched_set_fifo(helper->task);
return helper;
@@ -5435,6 +5242,16 @@ static int validate_ops(const struct sched_ext_ops *ops)
return -EINVAL;
}
+ /*
+ * SCX_OPS_BUILTIN_IDLE_PER_NODE requires built-in CPU idle
+ * selection policy to be enabled.
+ */
+ if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
+ (ops->update_idle && !(ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))) {
+ scx_ops_error("SCX_OPS_BUILTIN_IDLE_PER_NODE requires CPU idle selection enabled");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -5453,6 +5270,15 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
mutex_lock(&scx_ops_enable_mutex);
+ /*
+ * Clear event counters so a new scx scheduler gets
+ * fresh event counter values.
+ */
+ for_each_possible_cpu(cpu) {
+ struct scx_event_stats *e = per_cpu_ptr(&event_stats_cpu, cpu);
+ memset(e, 0, sizeof(*e));
+ }
+
if (!scx_ops_helper) {
WRITE_ONCE(scx_ops_helper,
scx_create_rt_helper("sched_ext_ops_helper"));
@@ -5550,9 +5376,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
static_branch_enable_cpuslocked(&scx_has_op[i]);
check_hotplug_seq(ops);
-#ifdef CONFIG_SMP
- update_selcpu_topology();
-#endif
+ scx_idle_update_selcpu_topology(ops);
+
cpus_read_unlock();
ret = validate_ops(ops);
@@ -5591,20 +5416,18 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (((void (**)(void))ops)[i])
static_branch_enable(&scx_has_op[i]);
+ if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
+ static_branch_enable(&scx_ops_allow_queued_wakeup);
if (ops->flags & SCX_OPS_ENQ_LAST)
static_branch_enable(&scx_ops_enq_last);
-
if (ops->flags & SCX_OPS_ENQ_EXITING)
static_branch_enable(&scx_ops_enq_exiting);
+ if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
+ static_branch_enable(&scx_ops_enq_migration_disabled);
if (scx_ops.cpu_acquire || scx_ops.cpu_release)
static_branch_enable(&scx_ops_cpu_preempt);
- if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
- reset_idle_masks();
- static_branch_enable(&scx_builtin_idle_enabled);
- } else {
- static_branch_disable(&scx_builtin_idle_enabled);
- }
+ scx_idle_enable(ops);
/*
* Lock out forks, cgroup on/offlining and moves before opening the
@@ -6243,10 +6066,8 @@ void __init init_sched_ext_class(void)
SCX_TG_ONLINE);
BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
-#ifdef CONFIG_SMP
- BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
- BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
-#endif
+ scx_idle_init_masks();
+
scx_kick_cpus_pnt_seqs =
__alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
__alignof__(scx_kick_cpus_pnt_seqs[0]));
@@ -6254,15 +6075,16 @@ void __init init_sched_ext_class(void)
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
+ int n = cpu_to_node(cpu);
init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
INIT_LIST_HEAD(&rq->scx.runnable_list);
INIT_LIST_HEAD(&rq->scx.ddsp_deferred_locals);
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
- BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n));
+ BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n));
init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn);
init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
@@ -6279,55 +6101,6 @@ void __init init_sched_ext_class(void)
/********************************************************************************
* Helpers that can be called from the BPF scheduler.
*/
-#include <linux/btf_ids.h>
-
-__bpf_kfunc_start_defs();
-
-/**
- * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
- * @p: task_struct to select a CPU for
- * @prev_cpu: CPU @p was on previously
- * @wake_flags: %SCX_WAKE_* flags
- * @is_idle: out parameter indicating whether the returned CPU is idle
- *
- * Can only be called from ops.select_cpu() if the built-in CPU selection is
- * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
- * @p, @prev_cpu and @wake_flags match ops.select_cpu().
- *
- * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
- * currently idle and thus a good candidate for direct dispatching.
- */
-__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
- u64 wake_flags, bool *is_idle)
-{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
- goto prev_cpu;
- }
-
- if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
- goto prev_cpu;
-
-#ifdef CONFIG_SMP
- return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
-#endif
-
-prev_cpu:
- *is_idle = false;
- return prev_cpu;
-}
-
-__bpf_kfunc_end_defs();
-
-BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
-BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
-BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
-
-static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
- .owner = THIS_MODULE,
- .set = &scx_kfunc_ids_select_cpu,
-};
-
static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags)
{
if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
@@ -6387,9 +6160,7 @@ __bpf_kfunc_start_defs();
* ops.select_cpu(), and ops.dispatch().
*
* When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
- * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be
- * used to target the local DSQ of a CPU other than the enqueueing one. Use
- * ops.select_cpu() to be on the target CPU in the first place.
+ * and @p must match the task being enqueued.
*
* When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p
* will be directly inserted into the corresponding dispatch queue after
@@ -6851,8 +6622,12 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
* CPUs disagree, they use %ENQUEUE_RESTORE which is bypassed to
* the current local DSQ for running tasks and thus are not
* visible to the BPF scheduler.
+ *
+ * Also skip re-enqueueing tasks that can only run on this
+ * CPU, as they would just be re-added to the same local
+ * DSQ without any benefit.
*/
- if (p->migration_pending)
+ if (p->migration_pending || is_migration_disabled(p) || p->nr_cpus_allowed == 1)
continue;
dispatch_dequeue(rq, p);
@@ -7228,7 +7003,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
}
/**
- * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler
+ * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
* @fmt: format string
* @data: format string parameters packaged using ___bpf_fill() macro
* @data__sz: @data len, must end in '__sz' for the verifier
@@ -7320,7 +7095,6 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
* scx_bpf_cpuperf_set - Set the relative performance target of a CPU
* @cpu: CPU of interest
* @perf: target performance level [0, %SCX_CPUPERF_ONE]
- * @flags: %SCX_CPUPERF_* flags
*
* Set the target performance level of @cpu to @perf. @perf is in linear
* relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
@@ -7350,6 +7124,16 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf)
}
/**
+ * scx_bpf_nr_node_ids - Return the number of possible node IDs
+ *
+ * All valid node IDs in the system are smaller than the returned value.
+ */
+__bpf_kfunc u32 scx_bpf_nr_node_ids(void)
+{
+ return nr_node_ids;
+}
+
+/**
* scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
*
* All valid CPU IDs in the system are smaller than the returned value.
@@ -7390,149 +7174,6 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
}
/**
- * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
- * per-CPU cpumask.
- *
- * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
- */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
-{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
- return cpu_none_mask;
- }
-
-#ifdef CONFIG_SMP
- return idle_masks.cpu;
-#else
- return cpu_none_mask;
-#endif
-}
-
-/**
- * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
- * per-physical-core cpumask. Can be used to determine if an entire physical
- * core is free.
- *
- * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
- */
-__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
-{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
- return cpu_none_mask;
- }
-
-#ifdef CONFIG_SMP
- if (sched_smt_active())
- return idle_masks.smt;
- else
- return idle_masks.cpu;
-#else
- return cpu_none_mask;
-#endif
-}
-
-/**
- * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
- * either the percpu, or SMT idle-tracking cpumask.
- */
-__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
-{
- /*
- * Empty function body because we aren't actually acquiring or releasing
- * a reference to a global idle cpumask, which is read-only in the
- * caller and is never released. The acquire / release semantics here
- * are just used to make the cpumask a trusted pointer in the caller.
- */
-}
-
-/**
- * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
- * @cpu: cpu to test and clear idle for
- *
- * Returns %true if @cpu was idle and its idle state was successfully cleared.
- * %false otherwise.
- *
- * Unavailable if ops.update_idle() is implemented and
- * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
- */
-__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
-{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
- return false;
- }
-
- if (ops_cpu_valid(cpu, NULL))
- return test_and_clear_cpu_idle(cpu);
- else
- return false;
-}
-
-/**
- * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
- * @cpus_allowed: Allowed cpumask
- * @flags: %SCX_PICK_IDLE_CPU_* flags
- *
- * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
- * number on success. -%EBUSY if no matching cpu was found.
- *
- * Idle CPU tracking may race against CPU scheduling state transitions. For
- * example, this function may return -%EBUSY as CPUs are transitioning into the
- * idle state. If the caller then assumes that there will be dispatch events on
- * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
- * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
- * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
- * event in the near future.
- *
- * Unavailable if ops.update_idle() is implemented and
- * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
- */
-__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
- u64 flags)
-{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
- return -EBUSY;
- }
-
- return scx_pick_idle_cpu(cpus_allowed, flags);
-}
-
-/**
- * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
- * @cpus_allowed: Allowed cpumask
- * @flags: %SCX_PICK_IDLE_CPU_* flags
- *
- * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
- * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
- * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
- * empty.
- *
- * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
- * set, this function can't tell which CPUs are idle and will always pick any
- * CPU.
- */
-__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
- u64 flags)
-{
- s32 cpu;
-
- if (static_branch_likely(&scx_builtin_idle_enabled)) {
- cpu = scx_pick_idle_cpu(cpus_allowed, flags);
- if (cpu >= 0)
- return cpu;
- }
-
- cpu = cpumask_any_distribute(cpus_allowed);
- if (cpu < nr_cpu_ids)
- return cpu;
- else
- return -EBUSY;
-}
-
-/**
* scx_bpf_task_running - Is task currently running?
* @p: task of interest
*/
@@ -7590,6 +7231,105 @@ out:
}
#endif
+/**
+ * scx_bpf_now - Returns a high-performance monotonically non-decreasing
+ * clock for the current CPU. The clock returned is in nanoseconds.
+ *
+ * It provides the following properties:
+ *
+ * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently
+ * to account for execution time and track tasks' runtime properties.
+ * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which
+ * eventually reads a hardware timestamp counter -- is neither performant nor
+ * scalable. scx_bpf_now() aims to provide a high-performance clock by
+ * using the rq clock in the scheduler core whenever possible.
+ *
+ * 2) High enough resolution for the BPF scheduler use cases: In most BPF
+ * scheduler use cases, the required clock resolution is lower than the most
+ * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically
+ * uses the rq clock in the scheduler core whenever it is valid. It considers
+ * that the rq clock is valid from the time the rq clock is updated
+ * (update_rq_clock) until the rq is unlocked (rq_unpin_lock).
+ *
+ * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now()
+ * guarantees the clock never goes backward when comparing them in the same
+ * CPU. On the other hand, when comparing clocks in different CPUs, there
+ * is no such guarantee -- the clock can go backward. It provides a
+ * monotonically *non-decreasing* clock so that it would provide the same
+ * clock values in two different scx_bpf_now() calls in the same CPU
+ * during the same period of when the rq clock is valid.
+ */
+__bpf_kfunc u64 scx_bpf_now(void)
+{
+ struct rq *rq;
+ u64 clock;
+
+ preempt_disable();
+
+ rq = this_rq();
+ if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
+ /*
+ * If the rq clock is valid, use the cached rq clock.
+ *
+ * Note that scx_bpf_now() is re-entrant between a process
+ * context and an interrupt context (e.g., timer interrupt).
+ * However, we don't need to consider the race between them
+ * because such race is not observable from a caller.
+ */
+ clock = READ_ONCE(rq->scx.clock);
+ } else {
+ /*
+ * Otherwise, return a fresh rq clock.
+ *
+ * The rq clock is updated outside of the rq lock.
+ * In this case, keep the updated rq clock invalid so the next
+ * kfunc call outside the rq lock gets a fresh rq clock.
+ */
+ clock = sched_clock_cpu(cpu_of(rq));
+ }
+
+ preempt_enable();
+
+ return clock;
+}
+
+/*
+ * scx_bpf_events - Get a system-wide event counter to
+ * @events: output buffer from a BPF program
+ * @events__sz: @events len, must end in '__sz'' for the verifier
+ */
+__bpf_kfunc void scx_bpf_events(struct scx_event_stats *events,
+ size_t events__sz)
+{
+ struct scx_event_stats e_sys, *e_cpu;
+ int cpu;
+
+ /* Aggregate per-CPU event counters into the system-wide counters. */
+ memset(&e_sys, 0, sizeof(e_sys));
+ for_each_possible_cpu(cpu) {
+ e_cpu = per_cpu_ptr(&event_stats_cpu, cpu);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_SELECT_CPU_FALLBACK);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_DISPATCH_KEEP_LAST);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_EXITING);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SKIP_MIGRATION_DISABLED);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_ENQ_SLICE_DFL);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DURATION);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_DISPATCH);
+ scx_agg_event(&e_sys, e_cpu, SCX_EV_BYPASS_ACTIVATE);
+ }
+
+ /*
+ * We cannot entirely trust a BPF-provided size since a BPF program
+ * might be compiled against a different vmlinux.h, of which
+ * scx_event_stats would be larger (a newer vmlinux.h) or smaller
+ * (an older vmlinux.h). Hence, we use the smaller size to avoid
+ * memory corruption.
+ */
+ events__sz = min(events__sz, sizeof(*events));
+ memcpy(events, &e_sys, events__sz);
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7605,6 +7345,7 @@ BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur)
BTF_ID_FLAGS(func, scx_bpf_cpuperf_set)
+BTF_ID_FLAGS(func, scx_bpf_nr_node_ids)
BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
@@ -7621,6 +7362,8 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
#ifdef CONFIG_CGROUP_SCHED
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
+BTF_ID_FLAGS(func, scx_bpf_now)
+BTF_ID_FLAGS(func, scx_bpf_events, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
@@ -7644,8 +7387,6 @@ static int __init scx_init(void)
* check using scx_kf_allowed().
*/
if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
- &scx_kfunc_set_select_cpu)) ||
- (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_enqueue_dispatch)) ||
(ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
&scx_kfunc_set_dispatch)) ||
@@ -7665,6 +7406,12 @@ static int __init scx_init(void)
return ret;
}
+ ret = scx_idle_init();
+ if (ret) {
+ pr_err("sched_ext: Failed to initialize idle tracking (%d)\n", ret);
+ return ret;
+ }
+
ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
if (ret) {
pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);