Diffstat (limited to 'tools/testing/selftests/sched_ext')
-rw-r--r--  tools/testing/selftests/sched_ext/Makefile                      11
-rw-r--r--  tools/testing/selftests/sched_ext/allowed_cpus.bpf.c           144
-rw-r--r--  tools/testing/selftests/sched_ext/allowed_cpus.c                84
-rw-r--r--  tools/testing/selftests/sched_ext/config                         1
-rw-r--r--  tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c        68
-rw-r--r--  tools/testing/selftests/sched_ext/cyclic_kick_wait.c           194
-rw-r--r--  tools/testing/selftests/sched_ext/dequeue.bpf.c                389
-rw-r--r--  tools/testing/selftests/sched_ext/dequeue.c                    274
-rw-r--r--  tools/testing/selftests/sched_ext/enq_select_cpu.bpf.c          74
-rw-r--r--  tools/testing/selftests/sched_ext/enq_select_cpu.c              88
-rw-r--r--  tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c    43
-rw-r--r--  tools/testing/selftests/sched_ext/enq_select_cpu_fails.c        61
-rw-r--r--  tools/testing/selftests/sched_ext/exit.bpf.c                     2
-rw-r--r--  tools/testing/selftests/sched_ext/exit.c                        10
-rw-r--r--  tools/testing/selftests/sched_ext/exit_test.h                    2
-rw-r--r--  tools/testing/selftests/sched_ext/hotplug.c                      1
-rw-r--r--  tools/testing/selftests/sched_ext/init_enable_count.c           35
-rw-r--r--  tools/testing/selftests/sched_ext/maximal.bpf.c                 22
-rw-r--r--  tools/testing/selftests/sched_ext/maximal.c                      3
-rw-r--r--  tools/testing/selftests/sched_ext/numa.bpf.c                   100
-rw-r--r--  tools/testing/selftests/sched_ext/numa.c                        59
-rw-r--r--  tools/testing/selftests/sched_ext/peek_dsq.bpf.c               251
-rw-r--r--  tools/testing/selftests/sched_ext/peek_dsq.c                   224
-rw-r--r--  tools/testing/selftests/sched_ext/reload_loop.c                  3
-rw-r--r--  tools/testing/selftests/sched_ext/rt_stall.bpf.c                23
-rw-r--r--  tools/testing/selftests/sched_ext/rt_stall.c                   293
-rw-r--r--  tools/testing/selftests/sched_ext/runner.c                      51
-rw-r--r--  tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c         8
-rw-r--r--  tools/testing/selftests/sched_ext/total_bw.c                   281
-rw-r--r--  tools/testing/selftests/sched_ext/util.c                         4
-rw-r--r--  tools/testing/selftests/sched_ext/util.h                         2
31 files changed, 2664 insertions(+), 141 deletions(-)
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile
index 011762224600..789037be44c7 100644
--- a/tools/testing/selftests/sched_ext/Makefile
+++ b/tools/testing/selftests/sched_ext/Makefile
@@ -93,6 +93,8 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \
$(CLANG_SYS_INCLUDES) \
-Wall -Wno-compare-distinct-pointer-types \
-Wno-incompatible-function-pointer-types \
+ -Wno-microsoft-anon-tag \
+ -fms-extensions \
-O2 -mcpu=v3
# sort removes libbpf duplicates when not cross-building
@@ -161,17 +163,21 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs
auto-test-targets := \
create_dsq \
+ dequeue \
enq_last_no_enq_fails \
- enq_select_cpu_fails \
ddsp_bogus_dsq_fail \
ddsp_vtimelocal_fail \
dsp_local_on \
+ enq_select_cpu \
exit \
hotplug \
init_enable_count \
maximal \
maybe_null \
minimal \
+ numa \
+ allowed_cpus \
+ peek_dsq \
prog_run \
reload_loop \
select_cpu_dfl \
@@ -180,7 +186,10 @@ auto-test-targets := \
select_cpu_dispatch_bad_dsq \
select_cpu_dispatch_dbl_dsp \
select_cpu_vtime \
+ rt_stall \
test_example \
+ total_bw \
+ cyclic_kick_wait \
testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets)))
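
The two flags added to BPF_CFLAGS above go together: -fms-extensions lets clang accept Microsoft-style anonymous tagged struct members, and -Wno-microsoft-anon-tag silences the warning clang would otherwise emit for that construct. A minimal sketch of what the flags permit (illustrative, not taken from this series):

struct base {
	int x;
};

struct derived {
	struct base;	/* anonymous tagged member, pulls in base's fields */
	int y;
};

static int use_derived(struct derived *d)
{
	/* d->x resolves through the anonymous member. */
	return d->x + d->y;
}
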
diff --git a/tools/testing/selftests/sched_ext/allowed_cpus.bpf.c b/tools/testing/selftests/sched_ext/allowed_cpus.bpf.c
new file mode 100644
index 000000000000..35923e74a2ec
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/allowed_cpus.bpf.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates the behavior of scx_bpf_select_cpu_and() by
+ * selecting idle CPUs strictly within a subset of allowed CPUs.
+ *
+ * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+private(PREF_CPUS) struct bpf_cpumask __kptr * allowed_cpumask;
+
+static void
+validate_idle_cpu(const struct task_struct *p, const struct cpumask *allowed, s32 cpu)
+{
+ if (scx_bpf_test_and_clear_cpu_idle(cpu))
+ scx_bpf_error("CPU %d should be marked as busy", cpu);
+
+ if (bpf_cpumask_subset(allowed, p->cpus_ptr) &&
+ !bpf_cpumask_test_cpu(cpu, allowed))
+ scx_bpf_error("CPU %d not in the allowed domain for %d (%s)",
+ cpu, p->pid, p->comm);
+}
+
+s32 BPF_STRUCT_OPS(allowed_cpus_select_cpu,
+ struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+ const struct cpumask *allowed;
+ s32 cpu;
+
+ allowed = cast_mask(allowed_cpumask);
+ if (!allowed) {
+ scx_bpf_error("allowed domain not initialized");
+ return -EINVAL;
+ }
+
+ /*
+ * Select an idle CPU strictly within the allowed domain.
+ */
+ cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, allowed, 0);
+ if (cpu >= 0) {
+ validate_idle_cpu(p, allowed, cpu);
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+
+ return cpu;
+ }
+
+ return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(allowed_cpus_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ const struct cpumask *allowed;
+ s32 prev_cpu = scx_bpf_task_cpu(p), cpu;
+
+ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+
+ allowed = cast_mask(allowed_cpumask);
+ if (!allowed) {
+ scx_bpf_error("allowed domain not initialized");
+ return;
+ }
+
+ /*
+ * Use scx_bpf_select_cpu_and() to proactively kick an idle CPU
+ * within @allowed_cpumask, usable by @p.
+ */
+ cpu = scx_bpf_select_cpu_and(p, prev_cpu, 0, allowed, 0);
+ if (cpu >= 0) {
+ validate_idle_cpu(p, allowed, cpu);
+ scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
+ }
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(allowed_cpus_init)
+{
+ struct bpf_cpumask *mask;
+
+ mask = bpf_cpumask_create();
+ if (!mask)
+ return -ENOMEM;
+
+ mask = bpf_kptr_xchg(&allowed_cpumask, mask);
+ if (mask)
+ bpf_cpumask_release(mask);
+
+ bpf_rcu_read_lock();
+
+ /*
+ * Assign the first online CPU to the allowed domain.
+ */
+ mask = allowed_cpumask;
+ if (mask) {
+ const struct cpumask *online = scx_bpf_get_online_cpumask();
+
+ bpf_cpumask_set_cpu(bpf_cpumask_first(online), mask);
+ scx_bpf_put_cpumask(online);
+ }
+
+ bpf_rcu_read_unlock();
+
+ return 0;
+}
+
+void BPF_STRUCT_OPS(allowed_cpus_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+struct task_cpu_arg {
+ pid_t pid;
+};
+
+SEC("syscall")
+int select_cpu_from_user(struct task_cpu_arg *input)
+{
+ struct task_struct *p;
+ int cpu;
+
+ p = bpf_task_from_pid(input->pid);
+ if (!p)
+ return -EINVAL;
+
+ bpf_rcu_read_lock();
+ cpu = scx_bpf_select_cpu_and(p, bpf_get_smp_processor_id(), 0, p->cpus_ptr, 0);
+ bpf_rcu_read_unlock();
+
+ bpf_task_release(p);
+
+ return cpu;
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops allowed_cpus_ops = {
+ .select_cpu = (void *)allowed_cpus_select_cpu,
+ .enqueue = (void *)allowed_cpus_enqueue,
+ .init = (void *)allowed_cpus_init,
+ .exit = (void *)allowed_cpus_exit,
+ .name = "allowed_cpus",
+};
diff --git a/tools/testing/selftests/sched_ext/allowed_cpus.c b/tools/testing/selftests/sched_ext/allowed_cpus.c
new file mode 100644
index 000000000000..093f285ab4ba
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/allowed_cpus.c
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "allowed_cpus.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct allowed_cpus *skel;
+
+ skel = allowed_cpus__open();
+ SCX_FAIL_IF(!skel, "Failed to open");
+ SCX_ENUM_INIT(skel);
+ SCX_FAIL_IF(allowed_cpus__load(skel), "Failed to load skel");
+
+ *ctx = skel;
+
+ return SCX_TEST_PASS;
+}
+
+static int test_select_cpu_from_user(const struct allowed_cpus *skel)
+{
+ int fd, ret;
+ __u64 args[1];
+
+ LIBBPF_OPTS(bpf_test_run_opts, attr,
+ .ctx_in = args,
+ .ctx_size_in = sizeof(args),
+ );
+
+ args[0] = getpid();
+ fd = bpf_program__fd(skel->progs.select_cpu_from_user);
+ if (fd < 0)
+ return fd;
+
+ ret = bpf_prog_test_run_opts(fd, &attr);
+ if (ret < 0)
+ return ret;
+
+ fprintf(stderr, "%s: CPU %d\n", __func__, attr.retval);
+
+ return 0;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct allowed_cpus *skel = ctx;
+ struct bpf_link *link;
+
+ link = bpf_map__attach_struct_ops(skel->maps.allowed_cpus_ops);
+ SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+ /* Pick an idle CPU from user-space */
+ SCX_FAIL_IF(test_select_cpu_from_user(skel), "Failed to pick idle CPU");
+
+ /* Just sleeping is fine, plenty of scheduling events happening */
+ sleep(1);
+
+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+ bpf_link__destroy(link);
+
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct allowed_cpus *skel = ctx;
+
+ allowed_cpus__destroy(skel);
+}
+
+struct scx_test allowed_cpus = {
+ .name = "allowed_cpus",
+ .description = "Verify scx_bpf_select_cpu_and()",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&allowed_cpus)
diff --git a/tools/testing/selftests/sched_ext/config b/tools/testing/selftests/sched_ext/config
index 0de9b4ee249d..aa901b05c8ad 100644
--- a/tools/testing/selftests/sched_ext/config
+++ b/tools/testing/selftests/sched_ext/config
@@ -1,4 +1,3 @@
-CONFIG_SCHED_DEBUG=y
CONFIG_SCHED_CLASS_EXT=y
CONFIG_CGROUPS=y
CONFIG_CGROUP_SCHED=y
diff --git a/tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c b/tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c
new file mode 100644
index 000000000000..cb34d3335917
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c
@@ -0,0 +1,68 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Stress concurrent SCX_KICK_WAIT calls to reproduce wait-cycle deadlock.
+ *
+ * Three CPUs are designated from userspace. Every enqueue from one of the
+ * three CPUs kicks the next CPU in the ring with SCX_KICK_WAIT, creating a
+ * persistent A -> B -> C -> A wait cycle pressure.
+ */
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+const volatile s32 test_cpu_a;
+const volatile s32 test_cpu_b;
+const volatile s32 test_cpu_c;
+
+u64 nr_enqueues;
+u64 nr_wait_kicks;
+
+UEI_DEFINE(uei);
+
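+/*
+ * Map a CPU to the next CPU in the A -> B -> C -> A ring. Return -1 if
+ * @cpu is not one of the three designated test CPUs.
+ */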
+static s32 target_cpu(s32 cpu)
+{
+ if (cpu == test_cpu_a)
+ return test_cpu_b;
+ if (cpu == test_cpu_b)
+ return test_cpu_c;
+ if (cpu == test_cpu_c)
+ return test_cpu_a;
+ return -1;
+}
+
+void BPF_STRUCT_OPS(cyclic_kick_wait_enqueue, struct task_struct *p,
+ u64 enq_flags)
+{
+ s32 this_cpu = bpf_get_smp_processor_id();
+ s32 tgt;
+
+ __sync_fetch_and_add(&nr_enqueues, 1);
+
+ if (p->flags & PF_KTHREAD) {
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_INF,
+ enq_flags | SCX_ENQ_PREEMPT);
+ return;
+ }
+
+ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+
+ tgt = target_cpu(this_cpu);
+ if (tgt < 0 || tgt == this_cpu)
+ return;
+
+ __sync_fetch_and_add(&nr_wait_kicks, 1);
+ scx_bpf_kick_cpu(tgt, SCX_KICK_WAIT);
+}
+
+void BPF_STRUCT_OPS(cyclic_kick_wait_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops cyclic_kick_wait_ops = {
+ .enqueue = cyclic_kick_wait_enqueue,
+ .exit = cyclic_kick_wait_exit,
+ .name = "cyclic_kick_wait",
+ .timeout_ms = 1000U,
+};
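
The ring above is fixed at three CPUs, which keeps target_cpu() trivial. If the test were ever extended to a larger ring, the same lookup could be driven by an array instead; a sketch under that assumption (not part of this test):

#define NR_RING_CPUS 3

const volatile s32 ring_cpus[NR_RING_CPUS];

static s32 target_cpu_ring(s32 cpu)
{
	int i;

	/* Find @cpu in the ring and return its successor, wrapping around. */
	bpf_for(i, 0, NR_RING_CPUS) {
		if (ring_cpus[i] == cpu)
			return ring_cpus[(i + 1) % NR_RING_CPUS];
	}
	return -1;
}
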
diff --git a/tools/testing/selftests/sched_ext/cyclic_kick_wait.c b/tools/testing/selftests/sched_ext/cyclic_kick_wait.c
new file mode 100644
index 000000000000..c2e5aa9de715
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/cyclic_kick_wait.c
@@ -0,0 +1,194 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Test SCX_KICK_WAIT forward progress under cyclic wait pressure.
+ *
+ * SCX_KICK_WAIT busy-waits until the target CPU enters the scheduling path.
+ * If multiple CPUs form a wait cycle (A waits for B, B waits for C, C waits
+ * for A), all CPUs deadlock unless the implementation breaks the cycle.
+ *
+ * This test creates that scenario: three CPUs are arranged in a ring. The BPF
+ * scheduler's ops.enqueue() kicks the next CPU in the ring with SCX_KICK_WAIT
+ * on every enqueue. Userspace pins 4 worker threads per CPU that loop calling
+ * sched_yield(), generating a steady stream of enqueues and thus sustained
+ * A->B->C->A kick_wait cycle pressure. The test passes if the system remains
+ * responsive for 5 seconds without the scheduler being killed by the watchdog.
+ */
+#define _GNU_SOURCE
+
+#include <bpf/bpf.h>
+#include <errno.h>
+#include <pthread.h>
+#include <sched.h>
+#include <scx/common.h>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "scx_test.h"
+#include "cyclic_kick_wait.bpf.skel.h"
+
+#define WORKERS_PER_CPU 4
+#define NR_TEST_CPUS 3
+#define NR_WORKERS (NR_TEST_CPUS * WORKERS_PER_CPU)
+
+struct worker_ctx {
+ pthread_t tid;
+ int cpu;
+ volatile bool stop;
+ volatile __u64 iters;
+ bool started;
+};
+
+static void *worker_fn(void *arg)
+{
+ struct worker_ctx *worker = arg;
+ cpu_set_t mask;
+
+ CPU_ZERO(&mask);
+ CPU_SET(worker->cpu, &mask);
+
+ if (sched_setaffinity(0, sizeof(mask), &mask))
+ return (void *)(uintptr_t)errno;
+
+ while (!worker->stop) {
+ sched_yield();
+ worker->iters++;
+ }
+
+ return NULL;
+}
+
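+/*
+ * Join a worker with a 2-second timeout. If the join times out, detach the
+ * thread so its resources are reclaimed when it eventually exits.
+ */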
+static int join_worker(struct worker_ctx *worker)
+{
+ void *ret;
+ struct timespec ts;
+ int err;
+
+ if (!worker->started)
+ return 0;
+
+ if (clock_gettime(CLOCK_REALTIME, &ts))
+ return -errno;
+
+ ts.tv_sec += 2;
+ err = pthread_timedjoin_np(worker->tid, &ret, &ts);
+ if (err == ETIMEDOUT)
+ pthread_detach(worker->tid);
+ if (err)
+ return -err;
+
+ if ((uintptr_t)ret)
+ return -(int)(uintptr_t)ret;
+
+ return 0;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct cyclic_kick_wait *skel;
+
+ skel = cyclic_kick_wait__open();
+ SCX_FAIL_IF(!skel, "Failed to open skel");
+ SCX_ENUM_INIT(skel);
+
+ *ctx = skel;
+ return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct cyclic_kick_wait *skel = ctx;
+ struct worker_ctx workers[NR_WORKERS] = {};
+ struct bpf_link *link = NULL;
+ enum scx_test_status status = SCX_TEST_PASS;
+ int test_cpus[NR_TEST_CPUS];
+ int nr_cpus = 0;
+ cpu_set_t mask;
+ int ret, i;
+
+ if (sched_getaffinity(0, sizeof(mask), &mask)) {
+ SCX_ERR("Failed to get affinity (%d)", errno);
+ return SCX_TEST_FAIL;
+ }
+
+ for (i = 0; i < CPU_SETSIZE; i++) {
+ if (CPU_ISSET(i, &mask))
+ test_cpus[nr_cpus++] = i;
+ if (nr_cpus == NR_TEST_CPUS)
+ break;
+ }
+
+ if (nr_cpus < NR_TEST_CPUS)
+ return SCX_TEST_SKIP;
+
+ skel->rodata->test_cpu_a = test_cpus[0];
+ skel->rodata->test_cpu_b = test_cpus[1];
+ skel->rodata->test_cpu_c = test_cpus[2];
+
+ if (cyclic_kick_wait__load(skel)) {
+ SCX_ERR("Failed to load skel");
+ return SCX_TEST_FAIL;
+ }
+
+ link = bpf_map__attach_struct_ops(skel->maps.cyclic_kick_wait_ops);
+ if (!link) {
+ SCX_ERR("Failed to attach scheduler");
+ return SCX_TEST_FAIL;
+ }
+
+ for (i = 0; i < NR_WORKERS; i++)
+ workers[i].cpu = test_cpus[i / WORKERS_PER_CPU];
+
+ for (i = 0; i < NR_WORKERS; i++) {
+ ret = pthread_create(&workers[i].tid, NULL, worker_fn, &workers[i]);
+ if (ret) {
+ SCX_ERR("Failed to create worker thread %d (%d)", i, ret);
+ status = SCX_TEST_FAIL;
+ goto out;
+ }
+ workers[i].started = true;
+ }
+
+ sleep(5);
+
+ if (skel->data->uei.kind != EXIT_KIND(SCX_EXIT_NONE)) {
+ SCX_ERR("Scheduler exited unexpectedly (kind=%llu code=%lld)",
+ (unsigned long long)skel->data->uei.kind,
+ (long long)skel->data->uei.exit_code);
+ status = SCX_TEST_FAIL;
+ }
+
+out:
+ for (i = 0; i < NR_WORKERS; i++)
+ workers[i].stop = true;
+
+ for (i = 0; i < NR_WORKERS; i++) {
+ ret = join_worker(&workers[i]);
+ if (ret && status == SCX_TEST_PASS) {
+ SCX_ERR("Failed to join worker thread %d (%d)", i, ret);
+ status = SCX_TEST_FAIL;
+ }
+ }
+
+ if (link)
+ bpf_link__destroy(link);
+
+ return status;
+}
+
+static void cleanup(void *ctx)
+{
+ struct cyclic_kick_wait *skel = ctx;
+
+ cyclic_kick_wait__destroy(skel);
+}
+
+struct scx_test cyclic_kick_wait = {
+ .name = "cyclic_kick_wait",
+ .description = "Verify SCX_KICK_WAIT forward progress under a 3-CPU wait cycle",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&cyclic_kick_wait)
diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c
new file mode 100644
index 000000000000..624e2ccb0688
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates ops.dequeue() is called correctly:
+ * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
+ * scheduler entirely: no ops.dequeue() should be called
+ * - Tasks dispatched to user DSQs from ops.enqueue() enter BPF custody:
+ * ops.dequeue() must be called when they leave custody
+ * - Every ops.enqueue() dispatch to non-terminal DSQs is followed by
+ * exactly one ops.dequeue() (validate 1:1 pairing and state machine)
+ *
+ * Copyright (c) 2026 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+#define SHARED_DSQ 0
+
+/*
+ * BPF internal queue.
+ *
+ * Tasks are stored here and consumed from ops.dispatch(), validating that
+ * tasks on BPF internal structures still get ops.dequeue() when they
+ * leave.
+ */
+struct {
+ __uint(type, BPF_MAP_TYPE_QUEUE);
+ __uint(max_entries, 32768);
+ __type(value, s32);
+} global_queue SEC(".maps");
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+/*
+ * Counters to track the lifecycle of tasks:
+ * - enqueue_cnt: Number of times ops.enqueue() was called
+ * - dequeue_cnt: Number of times ops.dequeue() was called (any type)
+ * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag)
+ * - change_dequeue_cnt: Number of property change dequeues
+ * - bpf_queue_full: Number of times the BPF internal queue was full
+ */
+u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt, bpf_queue_full;
+
+/*
+ * Test scenarios:
+ * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
+ * scheduler, no dequeue callbacks)
+ * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
+ * scheduler, no dequeue callbacks)
+ * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
+ * dequeue callbacks expected)
+ * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
+ * scheduler, no dequeue callbacks)
+ * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
+ * scheduler, no dequeue callbacks)
+ * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
+ * dequeue callbacks expected)
+ * 6) BPF internal queue from ops.enqueue(): store task PIDs in ops.enqueue(),
+ * consume in ops.dispatch() and dispatch to local DSQ (validates dequeue
+ * for tasks stored in internal BPF data structures)
+ */
+u32 test_scenario;
+
+/*
+ * Per-task state to track lifecycle and validate workflow semantics.
+ * State transitions:
+ * NONE -> ENQUEUED (on enqueue)
+ * NONE -> DISPATCHED (on direct dispatch to terminal DSQ)
+ * ENQUEUED -> DISPATCHED (on dispatch dequeue)
+ * DISPATCHED -> NONE (on property change dequeue or re-enqueue)
+ * ENQUEUED -> NONE (on property change dequeue before dispatch)
+ */
+enum task_state {
+ TASK_NONE = 0,
+ TASK_ENQUEUED,
+ TASK_DISPATCHED,
+};
+
+struct task_ctx {
+ enum task_state state; /* Current state in the workflow */
+ u64 enqueue_seq; /* Sequence number for debugging */
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
+{
+ return bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+}
+
+s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
+ s32 prev_cpu, u64 wake_flags)
+{
+ struct task_ctx *tctx;
+
+ tctx = try_lookup_task_ctx(p);
+ if (!tctx)
+ return prev_cpu;
+
+ switch (test_scenario) {
+ case 0:
+ /*
+ * Direct dispatch to the local DSQ.
+ *
+ * Task bypasses BPF scheduler entirely: no enqueue
+ * tracking, no ops.dequeue() callbacks.
+ */
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+ tctx->state = TASK_DISPATCHED;
+ break;
+ case 1:
+ /*
+ * Direct dispatch to the global DSQ.
+ *
+ * Task bypasses BPF scheduler entirely: no enqueue
+ * tracking, no ops.dequeue() callbacks.
+ */
+ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+ tctx->state = TASK_DISPATCHED;
+ break;
+ case 2:
+ /*
+ * Dispatch to a shared user DSQ.
+ *
+ * Task enters BPF scheduler management: track
+ * enqueue/dequeue lifecycle and validate state
+ * transitions.
+ */
+ if (tctx->state == TASK_ENQUEUED)
+ scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+ p->pid, p->comm, tctx->enqueue_seq);
+
+ scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);
+
+ __sync_fetch_and_add(&enqueue_cnt, 1);
+
+ tctx->state = TASK_ENQUEUED;
+ tctx->enqueue_seq++;
+ break;
+ }
+
+ return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ struct task_ctx *tctx;
+ s32 pid = p->pid;
+
+ tctx = try_lookup_task_ctx(p);
+ if (!tctx)
+ return;
+
+ switch (test_scenario) {
+ case 3:
+ /*
+ * Direct dispatch to the local DSQ.
+ *
+ * Task bypasses BPF scheduler entirely: no enqueue
+ * tracking, no ops.dequeue() callbacks.
+ */
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+ tctx->state = TASK_DISPATCHED;
+ break;
+ case 4:
+ /*
+ * Direct dispatch to the global DSQ.
+ *
+ * Task bypasses BPF scheduler entirely: no enqueue
+ * tracking, no ops.dequeue() callbacks.
+ */
+ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+ tctx->state = TASK_DISPATCHED;
+ break;
+ case 5:
+ /*
+ * Dispatch to shared user DSQ.
+ *
+ * Task enters BPF scheduler management: track
+ * enqueue/dequeue lifecycle and validate state
+ * transitions.
+ */
+ if (tctx->state == TASK_ENQUEUED)
+ scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+ p->pid, p->comm, tctx->enqueue_seq);
+
+ scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+
+ __sync_fetch_and_add(&enqueue_cnt, 1);
+
+ tctx->state = TASK_ENQUEUED;
+ tctx->enqueue_seq++;
+ break;
+ case 6:
+ /*
+ * Store task in BPF internal queue.
+ *
+ * Task enters BPF scheduler management: track
+ * enqueue/dequeue lifecycle and validate state
+ * transitions.
+ */
+ if (tctx->state == TASK_ENQUEUED)
+ scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
+ p->pid, p->comm, tctx->enqueue_seq);
+
+ if (bpf_map_push_elem(&global_queue, &pid, 0)) {
+ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+ __sync_fetch_and_add(&bpf_queue_full, 1);
+
+ tctx->state = TASK_DISPATCHED;
+ } else {
+ __sync_fetch_and_add(&enqueue_cnt, 1);
+
+ tctx->state = TASK_ENQUEUED;
+ tctx->enqueue_seq++;
+ }
+ break;
+ default:
+ /* For all other scenarios, dispatch to the global DSQ */
+ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+ tctx->state = TASK_DISPATCHED;
+ break;
+ }
+
+ scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
+}
+
+void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
+{
+ struct task_ctx *tctx;
+
+ __sync_fetch_and_add(&dequeue_cnt, 1);
+
+ tctx = try_lookup_task_ctx(p);
+ if (!tctx)
+ return;
+
+ /*
+ * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global),
+ * ops.dequeue() should never be called because tasks bypass the
+ * BPF scheduler entirely. If we get here, it's a kernel bug.
+ */
+ if (test_scenario == 0 || test_scenario == 3) {
+ scx_bpf_error("%d (%s): dequeue called for local DSQ scenario",
+ p->pid, p->comm);
+ return;
+ }
+
+ if (test_scenario == 1 || test_scenario == 4) {
+ scx_bpf_error("%d (%s): dequeue called for global DSQ scenario",
+ p->pid, p->comm);
+ return;
+ }
+
+ if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
+ /*
+ * Property change interrupting the workflow. Valid from
+ * both ENQUEUED and DISPATCHED states. Transitions task
+ * back to NONE state.
+ */
+ __sync_fetch_and_add(&change_dequeue_cnt, 1);
+
+ /* Validate state transition */
+ if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
+ scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
+ p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+ /*
+ * Transition back to NONE: task outside scheduler control.
+ *
+ * Scenario 6: dispatch() checks tctx->state after popping a
+		 * PID; if the task is in state NONE, it was dequeued by
+ * property change and must not be dispatched (this
+ * prevents "target CPU not allowed").
+ */
+ tctx->state = TASK_NONE;
+ } else {
+ /*
+ * Regular dispatch dequeue: kernel is moving the task from
+ * BPF custody to a terminal DSQ. Normally we come from
+ * ENQUEUED state. We can also see TASK_NONE if the task
+ * was dequeued by property change (SCX_DEQ_SCHED_CHANGE)
+ * while it was already on a DSQ (dispatched but not yet
+ * consumed); in that case we just leave state as NONE.
+ */
+ __sync_fetch_and_add(&dispatch_dequeue_cnt, 1);
+
+ /*
+ * Must be ENQUEUED (normal path) or NONE (already dequeued
+ * by property change while on a DSQ).
+ */
+ if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_NONE)
+ scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
+ p->pid, p->comm, tctx->state, tctx->enqueue_seq);
+
+ if (tctx->state == TASK_ENQUEUED)
+ tctx->state = TASK_DISPATCHED;
+
+ /* NONE: leave as-is, task was already property-change dequeued */
+ }
+}
+
+void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
+{
+ if (test_scenario == 6) {
+ struct task_ctx *tctx;
+ struct task_struct *p;
+ s32 pid;
+
+ if (bpf_map_pop_elem(&global_queue, &pid))
+ return;
+
+ p = bpf_task_from_pid(pid);
+ if (!p)
+ return;
+
+ /*
+ * If the task was dequeued by property change
+ * (ops.dequeue() set tctx->state = TASK_NONE), skip
+ * dispatch.
+ */
+ tctx = try_lookup_task_ctx(p);
+ if (!tctx || tctx->state == TASK_NONE) {
+ bpf_task_release(p);
+ return;
+ }
+
+ /*
+ * Dispatch to this CPU's local DSQ if allowed, otherwise
+ * fallback to the global DSQ.
+ */
+ if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
+ else
+ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
+
+ bpf_task_release(p);
+ } else {
+ scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
+ }
+}
+
+s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
+ struct scx_init_task_args *args)
+{
+ struct task_ctx *tctx;
+
+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!tctx)
+ return -ENOMEM;
+
+ return 0;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
+{
+ s32 ret;
+
+ ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops dequeue_ops = {
+ .select_cpu = (void *)dequeue_select_cpu,
+ .enqueue = (void *)dequeue_enqueue,
+ .dequeue = (void *)dequeue_dequeue,
+ .dispatch = (void *)dequeue_dispatch,
+ .init_task = (void *)dequeue_init_task,
+ .init = (void *)dequeue_init,
+ .exit = (void *)dequeue_exit,
+ .flags = SCX_OPS_ENQ_LAST,
+ .name = "dequeue_test",
+};
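
The checks in ops.dequeue() above encode the state machine from the header comment inline. For reference, the same set of legal transitions can be written as a single predicate (illustrative only, not part of the test):

static bool valid_transition(enum task_state from, enum task_state to)
{
	switch (to) {
	case TASK_ENQUEUED:	/* enqueue, or re-enqueue after dispatch */
		return from == TASK_NONE || from == TASK_DISPATCHED;
	case TASK_DISPATCHED:	/* direct dispatch or dispatch dequeue */
		return from == TASK_NONE || from == TASK_ENQUEUED;
	case TASK_NONE:		/* property change dequeue */
		return from == TASK_ENQUEUED || from == TASK_DISPATCHED;
	}
	return false;
}
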
diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c
new file mode 100644
index 000000000000..4e93262703ca
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/dequeue.c
@@ -0,0 +1,274 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <time.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <sched.h>
+#include <pthread.h>
+#include "scx_test.h"
+#include "dequeue.bpf.skel.h"
+
+#define NUM_WORKERS 8
+#define AFFINITY_HAMMER_MS 500
+
+/*
+ * Worker function that creates enqueue/dequeue events via CPU work and
+ * sleep.
+ */
+static void worker_fn(int id)
+{
+ int i;
+ volatile int sum = 0;
+
+ for (i = 0; i < 1000; i++) {
+ volatile int j;
+
+ /* Do some work to trigger scheduling events */
+ for (j = 0; j < 10000; j++)
+ sum += j;
+
+ /* Sleep to trigger dequeue */
+ usleep(1000 + (id * 100));
+ }
+
+ exit(0);
+}
+
+/*
+ * This thread changes workers' affinity from outside so that some changes
+ * hit tasks while they are still in the scheduler's queue and trigger
+ * property-change dequeues.
+ */
+static void *affinity_hammer_fn(void *arg)
+{
+ pid_t *pids = arg;
+ cpu_set_t cpuset;
+ int i = 0, n = NUM_WORKERS;
+ struct timespec start, now;
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+ while (1) {
+ int w = i % n;
+ int cpu = (i / n) % 4;
+
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpu, &cpuset);
+ sched_setaffinity(pids[w], sizeof(cpuset), &cpuset);
+ i++;
+
+ /* Check elapsed time every 256 iterations to limit gettime cost */
+ if ((i & 255) == 0) {
+ long long elapsed_ms;
+
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ elapsed_ms = (now.tv_sec - start.tv_sec) * 1000LL +
+ (now.tv_nsec - start.tv_nsec) / 1000000;
+ if (elapsed_ms >= AFFINITY_HAMMER_MS)
+ break;
+ }
+ }
+ return NULL;
+}
+
+static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario,
+ const char *scenario_name)
+{
+ struct bpf_link *link;
+ pid_t pids[NUM_WORKERS];
+ pthread_t hammer;
+
+ int i, status;
+ u64 enq_start, deq_start,
+ dispatch_deq_start, change_deq_start, bpf_queue_full_start;
+ u64 enq_delta, deq_delta,
+ dispatch_deq_delta, change_deq_delta, bpf_queue_full_delta;
+
+ /* Set the test scenario */
+ skel->bss->test_scenario = scenario;
+
+ /* Record starting counts */
+ enq_start = skel->bss->enqueue_cnt;
+ deq_start = skel->bss->dequeue_cnt;
+ dispatch_deq_start = skel->bss->dispatch_dequeue_cnt;
+ change_deq_start = skel->bss->change_dequeue_cnt;
+ bpf_queue_full_start = skel->bss->bpf_queue_full;
+
+ link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops);
+ SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name);
+
+ /* Fork worker processes to generate enqueue/dequeue events */
+ for (i = 0; i < NUM_WORKERS; i++) {
+ pids[i] = fork();
+ SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i);
+
+ if (pids[i] == 0) {
+ worker_fn(i);
+ /* Should not reach here */
+ exit(1);
+ }
+ }
+
+ /*
+ * Run an "affinity hammer" so that some property changes hit tasks
+ * while they are still in BPF custody (e.g., in user DSQ or BPF
+ * queue), triggering SCX_DEQ_SCHED_CHANGE dequeues.
+ */
+ SCX_FAIL_IF(pthread_create(&hammer, NULL, affinity_hammer_fn, pids) != 0,
+ "Failed to create affinity hammer thread");
+ pthread_join(hammer, NULL);
+
+ /* Wait for all workers to complete */
+ for (i = 0; i < NUM_WORKERS; i++) {
+ SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
+ "Failed to wait for worker %d", i);
+ SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status);
+ }
+
+ bpf_link__destroy(link);
+
+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG));
+
+ /* Calculate deltas */
+ enq_delta = skel->bss->enqueue_cnt - enq_start;
+ deq_delta = skel->bss->dequeue_cnt - deq_start;
+ dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start;
+ change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start;
+ bpf_queue_full_delta = skel->bss->bpf_queue_full - bpf_queue_full_start;
+
+ printf("%s:\n", scenario_name);
+ printf(" enqueues: %lu\n", (unsigned long)enq_delta);
+ printf(" dequeues: %lu (dispatch: %lu, property_change: %lu)\n",
+ (unsigned long)deq_delta,
+ (unsigned long)dispatch_deq_delta,
+ (unsigned long)change_deq_delta);
+ printf(" BPF queue full: %lu\n", (unsigned long)bpf_queue_full_delta);
+
+ /*
+ * Validate enqueue/dequeue lifecycle tracking.
+ *
+ * For scenarios 0, 1, 3, 4 (local and global DSQs from
+ * ops.select_cpu() and ops.enqueue()), both enqueues and dequeues
+ * should be 0 because tasks bypass the BPF scheduler entirely:
+	 * tasks never enter the BPF scheduler's custody.
+ *
+ * For scenarios 2, 5, 6 (user DSQ or BPF internal queue) we expect
+ * both enqueues and dequeues.
+ *
+ * The BPF code does strict state machine validation with
+ * scx_bpf_error() to ensure the workflow semantics are correct.
+ *
+ * If we reach this point without errors, the semantics are
+ * validated correctly.
+ */
+ if (scenario == 0 || scenario == 1 ||
+ scenario == 3 || scenario == 4) {
+ /* Tasks bypass BPF scheduler completely */
+ SCX_EQ(enq_delta, 0);
+ SCX_EQ(deq_delta, 0);
+ SCX_EQ(dispatch_deq_delta, 0);
+ SCX_EQ(change_deq_delta, 0);
+ } else {
+ /*
+ * User DSQ from ops.enqueue() or ops.select_cpu(): tasks
+		 * enter the BPF scheduler's custody.
+ *
+ * Also validate 1:1 enqueue/dequeue pairing.
+ */
+ SCX_GT(enq_delta, 0);
+ SCX_GT(deq_delta, 0);
+ SCX_EQ(enq_delta, deq_delta);
+ }
+
+ return SCX_TEST_PASS;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct dequeue *skel;
+
+ skel = dequeue__open();
+ SCX_FAIL_IF(!skel, "Failed to open skel");
+ SCX_ENUM_INIT(skel);
+ SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel");
+
+ *ctx = skel;
+
+ return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct dequeue *skel = ctx;
+ enum scx_test_status status;
+
+ status = run_scenario(skel, 0, "Scenario 0: Local DSQ from ops.select_cpu()");
+ if (status != SCX_TEST_PASS)
+ return status;
+
+ status = run_scenario(skel, 1, "Scenario 1: Global DSQ from ops.select_cpu()");
+ if (status != SCX_TEST_PASS)
+ return status;
+
+ status = run_scenario(skel, 2, "Scenario 2: User DSQ from ops.select_cpu()");
+ if (status != SCX_TEST_PASS)
+ return status;
+
+ status = run_scenario(skel, 3, "Scenario 3: Local DSQ from ops.enqueue()");
+ if (status != SCX_TEST_PASS)
+ return status;
+
+ status = run_scenario(skel, 4, "Scenario 4: Global DSQ from ops.enqueue()");
+ if (status != SCX_TEST_PASS)
+ return status;
+
+ status = run_scenario(skel, 5, "Scenario 5: User DSQ from ops.enqueue()");
+ if (status != SCX_TEST_PASS)
+ return status;
+
+ status = run_scenario(skel, 6, "Scenario 6: BPF queue from ops.enqueue()");
+ if (status != SCX_TEST_PASS)
+ return status;
+
+ printf("\n=== Summary ===\n");
+ printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt);
+ printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt);
+ printf(" Dispatch dequeues: %lu (no flag, normal workflow)\n",
+ (unsigned long)skel->bss->dispatch_dequeue_cnt);
+ printf(" Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n",
+ (unsigned long)skel->bss->change_dequeue_cnt);
+ printf(" BPF queue full: %lu\n",
+ (unsigned long)skel->bss->bpf_queue_full);
+ printf("\nAll scenarios passed - no state machine violations detected\n");
+ printf("-> Validated: Local DSQ dispatch bypasses BPF scheduler\n");
+ printf("-> Validated: Global DSQ dispatch bypasses BPF scheduler\n");
+ printf("-> Validated: User DSQ dispatch triggers ops.dequeue() callbacks\n");
+ printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n");
+ printf("-> Validated: Property change dequeues have SCX_DEQ_SCHED_CHANGE flag\n");
+ printf("-> Validated: No duplicate enqueues or invalid state transitions\n");
+
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct dequeue *skel = ctx;
+
+ dequeue__destroy(skel);
+}
+
+struct scx_test dequeue_test = {
+ .name = "dequeue",
+ .description = "Verify ops.dequeue() semantics",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+
+REGISTER_SCX_TEST(&dequeue_test)
diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu.bpf.c
new file mode 100644
index 000000000000..ee2c9b89716e
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/enq_select_cpu.bpf.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+s32 BPF_STRUCT_OPS(enq_select_cpu_select_cpu, struct task_struct *p,
+ s32 prev_cpu, u64 wake_flags)
+{
+ /* Bounce all tasks to ops.enqueue() */
+ return prev_cpu;
+}
+
+void BPF_STRUCT_OPS(enq_select_cpu_enqueue, struct task_struct *p,
+ u64 enq_flags)
+{
+ s32 cpu, prev_cpu = scx_bpf_task_cpu(p);
+ bool found = false;
+
+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, 0, &found);
+ if (found) {
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, enq_flags);
+ return;
+ }
+
+ scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+}
+
+void BPF_STRUCT_OPS(enq_select_cpu_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+struct task_cpu_arg {
+ pid_t pid;
+};
+
+SEC("syscall")
+int select_cpu_from_user(struct task_cpu_arg *input)
+{
+ struct task_struct *p;
+ bool found = false;
+ s32 cpu;
+
+ p = bpf_task_from_pid(input->pid);
+ if (!p)
+ return -EINVAL;
+
+ bpf_rcu_read_lock();
+ cpu = scx_bpf_select_cpu_dfl(p, bpf_get_smp_processor_id(), 0, &found);
+ if (!found)
+ cpu = -EBUSY;
+ bpf_rcu_read_unlock();
+
+ bpf_task_release(p);
+
+ return cpu;
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops enq_select_cpu_ops = {
+ .select_cpu = (void *)enq_select_cpu_select_cpu,
+ .enqueue = (void *)enq_select_cpu_enqueue,
+ .exit = (void *)enq_select_cpu_exit,
+ .name = "enq_select_cpu",
+ .timeout_ms = 1000U,
+};
diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu.c b/tools/testing/selftests/sched_ext/enq_select_cpu.c
new file mode 100644
index 000000000000..340c6f8b86da
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/enq_select_cpu.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 David Vernet <dvernet@meta.com>
+ * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "enq_select_cpu.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct enq_select_cpu *skel;
+
+ skel = enq_select_cpu__open();
+ SCX_FAIL_IF(!skel, "Failed to open");
+ SCX_ENUM_INIT(skel);
+ SCX_FAIL_IF(enq_select_cpu__load(skel), "Failed to load skel");
+
+ *ctx = skel;
+
+ return SCX_TEST_PASS;
+}
+
+static int test_select_cpu_from_user(const struct enq_select_cpu *skel)
+{
+ int fd, ret;
+ __u64 args[1];
+
+ LIBBPF_OPTS(bpf_test_run_opts, attr,
+ .ctx_in = args,
+ .ctx_size_in = sizeof(args),
+ );
+
+ args[0] = getpid();
+ fd = bpf_program__fd(skel->progs.select_cpu_from_user);
+ if (fd < 0)
+ return fd;
+
+ ret = bpf_prog_test_run_opts(fd, &attr);
+ if (ret < 0)
+ return ret;
+
+ fprintf(stderr, "%s: CPU %d\n", __func__, attr.retval);
+
+ return 0;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct enq_select_cpu *skel = ctx;
+ struct bpf_link *link;
+
+ link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_ops);
+ if (!link) {
+ SCX_ERR("Failed to attach scheduler");
+ return SCX_TEST_FAIL;
+ }
+
+ /* Pick an idle CPU from user-space */
+ SCX_FAIL_IF(test_select_cpu_from_user(skel), "Failed to pick idle CPU");
+
+ sleep(1);
+
+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+ bpf_link__destroy(link);
+
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct enq_select_cpu *skel = ctx;
+
+ enq_select_cpu__destroy(skel);
+}
+
+struct scx_test enq_select_cpu = {
+ .name = "enq_select_cpu",
+ .description = "Verify scx_bpf_select_cpu_dfl() from multiple contexts",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&enq_select_cpu)
diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c
deleted file mode 100644
index a7cf868d5e31..000000000000
--- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
- * Copyright (c) 2023 David Vernet <dvernet@meta.com>
- * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
- */
-
-#include <scx/common.bpf.h>
-
-char _license[] SEC("license") = "GPL";
-
-/* Manually specify the signature until the kfunc is added to the scx repo. */
-s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
- bool *found) __ksym;
-
-s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p,
- s32 prev_cpu, u64 wake_flags)
-{
- return prev_cpu;
-}
-
-void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p,
- u64 enq_flags)
-{
- /*
- * Need to initialize the variable or the verifier will fail to load.
- * Improving these semantics is actively being worked on.
- */
- bool found = false;
-
- /* Can only call from ops.select_cpu() */
- scx_bpf_select_cpu_dfl(p, 0, 0, &found);
-
- scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
-}
-
-SEC(".struct_ops.link")
-struct sched_ext_ops enq_select_cpu_fails_ops = {
- .select_cpu = (void *) enq_select_cpu_fails_select_cpu,
- .enqueue = (void *) enq_select_cpu_fails_enqueue,
- .name = "enq_select_cpu_fails",
- .timeout_ms = 1000U,
-};
diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c
deleted file mode 100644
index a80e3a3b3698..000000000000
--- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
- * Copyright (c) 2023 David Vernet <dvernet@meta.com>
- * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
- */
-#include <bpf/bpf.h>
-#include <scx/common.h>
-#include <sys/wait.h>
-#include <unistd.h>
-#include "enq_select_cpu_fails.bpf.skel.h"
-#include "scx_test.h"
-
-static enum scx_test_status setup(void **ctx)
-{
- struct enq_select_cpu_fails *skel;
-
- skel = enq_select_cpu_fails__open();
- SCX_FAIL_IF(!skel, "Failed to open");
- SCX_ENUM_INIT(skel);
- SCX_FAIL_IF(enq_select_cpu_fails__load(skel), "Failed to load skel");
-
- *ctx = skel;
-
- return SCX_TEST_PASS;
-}
-
-static enum scx_test_status run(void *ctx)
-{
- struct enq_select_cpu_fails *skel = ctx;
- struct bpf_link *link;
-
- link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops);
- if (!link) {
- SCX_ERR("Failed to attach scheduler");
- return SCX_TEST_FAIL;
- }
-
- sleep(1);
-
- bpf_link__destroy(link);
-
- return SCX_TEST_PASS;
-}
-
-static void cleanup(void *ctx)
-{
- struct enq_select_cpu_fails *skel = ctx;
-
- enq_select_cpu_fails__destroy(skel);
-}
-
-struct scx_test enq_select_cpu_fails = {
- .name = "enq_select_cpu_fails",
- .description = "Verify we fail to call scx_bpf_select_cpu_dfl() "
- "from ops.enqueue()",
- .setup = setup,
- .run = run,
- .cleanup = cleanup,
-};
-REGISTER_SCX_TEST(&enq_select_cpu_fails)
diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c
index 4bc36182d3ff..2e848820a44b 100644
--- a/tools/testing/selftests/sched_ext/exit.bpf.c
+++ b/tools/testing/selftests/sched_ext/exit.bpf.c
@@ -41,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p)
if (exit_point == EXIT_DISPATCH)
EXIT_CLEANLY();
- scx_bpf_dsq_move_to_local(DSQ_ID);
+ scx_bpf_dsq_move_to_local(DSQ_ID, 0);
}
void BPF_STRUCT_OPS(exit_enable, struct task_struct *p)
diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c
index 9451782689de..b987611789d1 100644
--- a/tools/testing/selftests/sched_ext/exit.c
+++ b/tools/testing/selftests/sched_ext/exit.c
@@ -22,10 +22,18 @@ static enum scx_test_status run(void *ctx)
struct bpf_link *link;
char buf[16];
+ /*
+ * On single-CPU systems, ops.select_cpu() is never
+ * invoked, so skip this test to avoid getting stuck
+ * indefinitely.
+ */
+ if (tc == EXIT_SELECT_CPU && libbpf_num_possible_cpus() == 1)
+ continue;
+
skel = exit__open();
SCX_ENUM_INIT(skel);
skel->rodata->exit_point = tc;
- exit__load(skel);
+ SCX_FAIL_IF(exit__load(skel), "Failed to load skel");
link = bpf_map__attach_struct_ops(skel->maps.exit_ops);
if (!link) {
SCX_ERR("Failed to attach scheduler");
diff --git a/tools/testing/selftests/sched_ext/exit_test.h b/tools/testing/selftests/sched_ext/exit_test.h
index 94f0268b9cb8..2723e0fda801 100644
--- a/tools/testing/selftests/sched_ext/exit_test.h
+++ b/tools/testing/selftests/sched_ext/exit_test.h
@@ -17,4 +17,4 @@ enum exit_test_case {
NUM_EXITS,
};
-#endif // # __EXIT_TEST_H__
+#endif // __EXIT_TEST_H__
diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c
index 1c9ceb661c43..0cfbb111a2d0 100644
--- a/tools/testing/selftests/sched_ext/hotplug.c
+++ b/tools/testing/selftests/sched_ext/hotplug.c
@@ -6,7 +6,6 @@
#include <bpf/bpf.h>
#include <sched.h>
#include <scx/common.h>
-#include <sched.h>
#include <sys/wait.h>
#include <unistd.h>
diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c
index eddf9e0e26e7..44577e30e764 100644
--- a/tools/testing/selftests/sched_ext/init_enable_count.c
+++ b/tools/testing/selftests/sched_ext/init_enable_count.c
@@ -4,6 +4,7 @@
* Copyright (c) 2023 David Vernet <dvernet@meta.com>
* Copyright (c) 2023 Tejun Heo <tj@kernel.org>
*/
+#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <sched.h>
@@ -23,6 +24,9 @@ static enum scx_test_status run_test(bool global)
int ret, i, status;
struct sched_param param = {};
pid_t pids[num_pre_forks];
+ int pipe_fds[2];
+
+ SCX_FAIL_IF(pipe(pipe_fds) < 0, "Failed to create pipe");
skel = init_enable_count__open();
SCX_FAIL_IF(!skel, "Failed to open");
@@ -38,26 +42,35 @@ static enum scx_test_status run_test(bool global)
* ensure (at least in practical terms) that there are more tasks that
* transition from SCHED_OTHER -> SCHED_EXT than there are tasks that
* take the fork() path either below or in other processes.
+ *
+ * All children will block on read() on the pipe until the parent closes
+ * the write end after attaching the scheduler, which signals all of
+ * them to exit simultaneously. Auto-reap so we don't have to wait on
+ * them.
*/
+ signal(SIGCHLD, SIG_IGN);
for (i = 0; i < num_pre_forks; i++) {
- pids[i] = fork();
- SCX_FAIL_IF(pids[i] < 0, "Failed to fork child");
- if (pids[i] == 0) {
- sleep(1);
+ pid_t pid = fork();
+
+ SCX_FAIL_IF(pid < 0, "Failed to fork child");
+ if (pid == 0) {
+ char buf;
+
+ close(pipe_fds[1]);
+ if (read(pipe_fds[0], &buf, 1) < 0)
+ exit(1);
+ close(pipe_fds[0]);
exit(0);
}
}
+ close(pipe_fds[0]);
link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops);
SCX_FAIL_IF(!link, "Failed to attach struct_ops");
- for (i = 0; i < num_pre_forks; i++) {
- SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i],
- "Failed to wait for pre-forked child\n");
-
- SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i,
- status);
- }
+ /* Signal all pre-forked children to exit. */
+ close(pipe_fds[1]);
+ signal(SIGCHLD, SIG_DFL);
bpf_link__destroy(link);
SCX_GE(skel->bss->init_task_cnt, num_pre_forks);
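
The pipe acts as a broadcast barrier: each pre-forked child blocks in read() on the read end, and the instant the parent closes the write end every pending read() returns 0, releasing all children together. A self-contained sketch of the same pattern (illustrative only):

#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	char buf;
	int i;

	if (pipe(fds))
		return 1;

	for (i = 0; i < 4; i++) {
		if (fork() == 0) {
			close(fds[1]);		/* child keeps only the read end */
			read(fds[0], &buf, 1);	/* blocks until the write end closes */
			_exit(0);
		}
	}

	close(fds[0]);
	/* ... attach the scheduler, do work, etc. ... */
	close(fds[1]);		/* EOF on the pipe releases every child at once */

	for (i = 0; i < 4; i++)
		wait(NULL);

	return 0;
}
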
diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c
index 430f5e13bf55..04a369078aac 100644
--- a/tools/testing/selftests/sched_ext/maximal.bpf.c
+++ b/tools/testing/selftests/sched_ext/maximal.bpf.c
@@ -30,7 +30,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags)
void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev)
{
- scx_bpf_dsq_move_to_local(DSQ_ID);
+ scx_bpf_dsq_move_to_local(DSQ_ID, 0);
}
void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags)
@@ -67,13 +67,12 @@ void BPF_STRUCT_OPS(maximal_set_cpumask, struct task_struct *p,
void BPF_STRUCT_OPS(maximal_update_idle, s32 cpu, bool idle)
{}
-void BPF_STRUCT_OPS(maximal_cpu_acquire, s32 cpu,
- struct scx_cpu_acquire_args *args)
-{}
-
-void BPF_STRUCT_OPS(maximal_cpu_release, s32 cpu,
- struct scx_cpu_release_args *args)
-{}
+SEC("tp_btf/sched_switch")
+int BPF_PROG(maximal_sched_switch, bool preempt, struct task_struct *prev,
+ struct task_struct *next, unsigned int prev_state)
+{
+ return 0;
+}
void BPF_STRUCT_OPS(maximal_cpu_online, s32 cpu)
{}
@@ -123,6 +122,10 @@ void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p,
void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
{}
+void BPF_STRUCT_OPS(maximal_cgroup_set_bandwidth, struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us)
+{}
+
s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init)
{
return scx_bpf_create_dsq(DSQ_ID, -1);
@@ -146,8 +149,6 @@ struct sched_ext_ops maximal_ops = {
.set_weight = (void *) maximal_set_weight,
.set_cpumask = (void *) maximal_set_cpumask,
.update_idle = (void *) maximal_update_idle,
- .cpu_acquire = (void *) maximal_cpu_acquire,
- .cpu_release = (void *) maximal_cpu_release,
.cpu_online = (void *) maximal_cpu_online,
.cpu_offline = (void *) maximal_cpu_offline,
.init_task = (void *) maximal_init_task,
@@ -160,6 +161,7 @@ struct sched_ext_ops maximal_ops = {
.cgroup_move = (void *) maximal_cgroup_move,
.cgroup_cancel_move = (void *) maximal_cgroup_cancel_move,
.cgroup_set_weight = (void *) maximal_cgroup_set_weight,
+ .cgroup_set_bandwidth = (void *) maximal_cgroup_set_bandwidth,
.init = (void *) maximal_init,
.exit = (void *) maximal_exit,
.name = "maximal",
diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c
index c6be50a9941d..1dc369224670 100644
--- a/tools/testing/selftests/sched_ext/maximal.c
+++ b/tools/testing/selftests/sched_ext/maximal.c
@@ -19,6 +19,9 @@ static enum scx_test_status setup(void **ctx)
SCX_ENUM_INIT(skel);
SCX_FAIL_IF(maximal__load(skel), "Failed to load skel");
+ bpf_map__set_autoattach(skel->maps.maximal_ops, false);
+ SCX_FAIL_IF(maximal__attach(skel), "Failed to attach skel");
+
*ctx = skel;
return SCX_TEST_PASS;
diff --git a/tools/testing/selftests/sched_ext/numa.bpf.c b/tools/testing/selftests/sched_ext/numa.bpf.c
new file mode 100644
index 000000000000..78cc49a7f9a6
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/numa.bpf.c
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that validates the behavior of the NUMA-aware
+ * functionalities.
+ *
+ * The scheduler creates a separate DSQ for each NUMA node, ensuring tasks
+ * are exclusively processed by CPUs within their respective nodes. Idle
+ * CPUs are selected only within the same node, so task migration can only
+ * occur between CPUs belonging to the same node.
+ *
+ * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+const volatile unsigned int __COMPAT_SCX_PICK_IDLE_IN_NODE;
+
+static bool is_cpu_idle(s32 cpu, int node)
+{
+ const struct cpumask *idle_cpumask;
+ bool idle;
+
+ idle_cpumask = __COMPAT_scx_bpf_get_idle_cpumask_node(node);
+ idle = bpf_cpumask_test_cpu(cpu, idle_cpumask);
+ scx_bpf_put_cpumask(idle_cpumask);
+
+ return idle;
+}
+
+s32 BPF_STRUCT_OPS(numa_select_cpu,
+ struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+ int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p));
+ s32 cpu;
+
+ /*
+ * We could just use __COMPAT_scx_bpf_pick_any_cpu_node() here,
+ * since it already tries to pick an idle CPU within the node
+ * first, but let's use both functions for better testing coverage.
+ */
+ cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node,
+ __COMPAT_SCX_PICK_IDLE_IN_NODE);
+ if (cpu < 0)
+ cpu = __COMPAT_scx_bpf_pick_any_cpu_node(p->cpus_ptr, node,
+ __COMPAT_SCX_PICK_IDLE_IN_NODE);
+
+ if (is_cpu_idle(cpu, node))
+ scx_bpf_error("CPU %d should be marked as busy", cpu);
+
+ if (__COMPAT_scx_bpf_cpu_node(cpu) != node)
+ scx_bpf_error("CPU %d should be in node %d", cpu, node);
+
+ return cpu;
+}
+
+void BPF_STRUCT_OPS(numa_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p));
+
+ scx_bpf_dsq_insert(p, node, SCX_SLICE_DFL, enq_flags);
+}
+
+void BPF_STRUCT_OPS(numa_dispatch, s32 cpu, struct task_struct *prev)
+{
+ int node = __COMPAT_scx_bpf_cpu_node(cpu);
+
+ scx_bpf_dsq_move_to_local(node, 0);
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(numa_init)
+{
+ int node, err;
+
+ bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) {
+ err = scx_bpf_create_dsq(node, node);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+void BPF_STRUCT_OPS(numa_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops numa_ops = {
+ .select_cpu = (void *)numa_select_cpu,
+ .enqueue = (void *)numa_enqueue,
+ .dispatch = (void *)numa_dispatch,
+ .init = (void *)numa_init,
+ .exit = (void *)numa_exit,
+ .name = "numa",
+};
diff --git a/tools/testing/selftests/sched_ext/numa.c b/tools/testing/selftests/sched_ext/numa.c
new file mode 100644
index 000000000000..b060c3b65c82
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/numa.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com>
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "numa.bpf.skel.h"
+#include "scx_test.h"
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct numa *skel;
+
+ skel = numa__open();
+ SCX_FAIL_IF(!skel, "Failed to open");
+ SCX_ENUM_INIT(skel);
+ skel->rodata->__COMPAT_SCX_PICK_IDLE_IN_NODE = SCX_PICK_IDLE_IN_NODE;
+ skel->struct_ops.numa_ops->flags = SCX_OPS_BUILTIN_IDLE_PER_NODE;
+ SCX_FAIL_IF(numa__load(skel), "Failed to load skel");
+
+ *ctx = skel;
+
+ return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct numa *skel = ctx;
+ struct bpf_link *link;
+
+ link = bpf_map__attach_struct_ops(skel->maps.numa_ops);
+ SCX_FAIL_IF(!link, "Failed to attach scheduler");
+
+ /* Just sleeping is fine, plenty of scheduling events happening */
+ sleep(1);
+
+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+ bpf_link__destroy(link);
+
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct numa *skel = ctx;
+
+ numa__destroy(skel);
+}
+
+struct scx_test numa = {
+ .name = "numa",
+ .description = "Verify NUMA-aware functionalities",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&numa)
diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
new file mode 100644
index 000000000000..7f23fb17b1e0
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c
@@ -0,0 +1,251 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A BPF program for testing DSQ operations and peek in particular.
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
+ */
+
+#include <scx/common.bpf.h>
+#include <scx/compat.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei); /* Error handling */
+
+#define MAX_SAMPLES 100
+#define MAX_CPUS 512
+#define DSQ_POOL_SIZE 8
+int max_samples = MAX_SAMPLES;
+int max_cpus = MAX_CPUS;
+int dsq_pool_size = DSQ_POOL_SIZE;
+
+/* Global variables to store test results */
+int dsq_peek_result1 = -1;
+long dsq_inserted_pid = -1;
+int insert_test_cpu = -1; /* Set to the cpu that performs the test */
+long dsq_peek_result2 = -1;
+long dsq_peek_result2_pid = -1;
+long dsq_peek_result2_expected = -1;
+int test_dsq_id = 1234; /* Use a simple ID like the create_dsq example */
+int real_dsq_id = 1235; /* DSQ for normal operation */
+int enqueue_count = -1;
+int dispatch_count = -1;
+bool debug_ksym_exists;
+
+/* DSQ pool for stress testing */
+int dsq_pool_base_id = 2000;
+int phase1_complete = -1;
+long total_peek_attempts = -1;
+long successful_peeks = -1;
+
+/* BPF map for sharing peek results with userspace */
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, MAX_SAMPLES);
+ __type(key, u32);
+ __type(value, long);
+} peek_results SEC(".maps");
+
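+/* Pick a pseudo-random DSQ ID from the stress-test pool. */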
+static int get_random_dsq_id(void)
+{
+ u64 time = bpf_ktime_get_ns();
+
+ return dsq_pool_base_id + (time % DSQ_POOL_SIZE);
+}
+
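+/*
+ * Record a peeked PID in the shared results map using simple open
+ * addressing: probe up to 10 slots starting from pid % MAX_SAMPLES.
+ */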
+static void record_peek_result(long pid)
+{
+ u32 slot_key;
+ long *slot_pid_ptr;
+ u32 ix;
+
+ if (pid <= 0)
+ return;
+
+ /* Find an empty slot or one with the same PID */
+ bpf_for(ix, 0, 10) {
+ slot_key = ((u64)pid + ix) % MAX_SAMPLES;
+ slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key);
+ if (!slot_pid_ptr)
+ continue;
+
+ if (*slot_pid_ptr == -1 || *slot_pid_ptr == pid) {
+ *slot_pid_ptr = pid;
+ break;
+ }
+ }
+}
+
+/* Scan all DSQs in the pool and try to move a task to local */
+static int scan_dsq_pool(void)
+{
+ struct task_struct *task;
+ int moved = 0;
+ int i;
+
+ bpf_for(i, 0, DSQ_POOL_SIZE) {
+ int dsq_id = dsq_pool_base_id + i;
+
+ total_peek_attempts++;
+
+ task = __COMPAT_scx_bpf_dsq_peek(dsq_id);
+ if (task) {
+ successful_peeks++;
+ record_peek_result(task->pid);
+
+ /* Try to move this task to local */
+ if (!moved && scx_bpf_dsq_move_to_local(dsq_id, 0) == 0) {
+ moved = 1;
+ break;
+ }
+ }
+ }
+ return moved;
+}
+
+/* Struct_ops scheduler for testing DSQ peek operations */
+void BPF_STRUCT_OPS(peek_dsq_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ struct task_struct *peek_result;
+ int last_insert_test_cpu, cpu;
+
+ enqueue_count++;
+ cpu = bpf_get_smp_processor_id();
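+	/* Atomically claim the one-shot phase-1 test: only the first CPU sees -1 */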
+ last_insert_test_cpu = __sync_val_compare_and_swap(&insert_test_cpu, -1, cpu);
+
+ /* Phase 1: Simple insert-then-peek test (only on first task) */
+ if (last_insert_test_cpu == -1) {
+ bpf_printk("peek_dsq_enqueue beginning phase 1 peek test on cpu %d", cpu);
+
+ /* Test 1: Peek empty DSQ - should return NULL */
+ peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);
+ dsq_peek_result1 = (long)peek_result; /* Should be 0 (NULL) */
+
+ /* Test 2: Insert task into test DSQ for testing in dispatch callback */
+ dsq_inserted_pid = p->pid;
+ scx_bpf_dsq_insert(p, test_dsq_id, 0, enq_flags);
+ dsq_peek_result2_expected = (long)p; /* Expected the task we just inserted */
+ } else if (!phase1_complete) {
+ /* Still in phase 1, use real DSQ */
+ scx_bpf_dsq_insert(p, real_dsq_id, 0, enq_flags);
+ } else {
+ /* Phase 2: Random DSQ insertion for stress testing */
+ int random_dsq_id = get_random_dsq_id();
+
+ scx_bpf_dsq_insert(p, random_dsq_id, 0, enq_flags);
+ }
+}
+
+void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev)
+{
+ dispatch_count++;
+
+	/*
+	 * Phase 1: Complete the simple peek test if we inserted a task but
+	 * haven't tested peek yet.
+	 */
+ if (insert_test_cpu == cpu && dsq_peek_result2 == -1) {
+ struct task_struct *peek_result;
+
+ bpf_printk("peek_dsq_dispatch completing phase 1 peek test on cpu %d", cpu);
+
+ /* Test 3: Peek DSQ after insert - should return the task we inserted */
+ peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id);
+ /* Store the PID of the peeked task for comparison */
+ dsq_peek_result2 = (long)peek_result;
+ dsq_peek_result2_pid = peek_result ? peek_result->pid : -1;
+
+ /* Now consume the task since we've peeked at it */
+ scx_bpf_dsq_move_to_local(test_dsq_id, 0);
+
+ /* Mark phase 1 as complete */
+ phase1_complete = 1;
+ bpf_printk("Phase 1 complete, starting phase 2 stress testing");
+ } else if (!phase1_complete) {
+ /* Still in phase 1, use real DSQ */
+ scx_bpf_dsq_move_to_local(real_dsq_id, 0);
+ } else {
+ /* Phase 2: Scan all DSQs in the pool and try to move a task */
+ if (!scan_dsq_pool()) {
+ /* No tasks found in DSQ pool, fall back to real DSQ */
+ scx_bpf_dsq_move_to_local(real_dsq_id, 0);
+ }
+ }
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init)
+{
+ s32 err;
+ int i;
+
+	/* Record whether the native scx_bpf_dsq_peek() kfunc is available */
+ debug_ksym_exists = bpf_ksym_exists(scx_bpf_dsq_peek) ? 1 : 0;
+
+ /* Initialize state first */
+ insert_test_cpu = -1;
+ enqueue_count = 0;
+ dispatch_count = 0;
+ phase1_complete = 0;
+ total_peek_attempts = 0;
+ successful_peeks = 0;
+
+ /* Create the test and real DSQs */
+ err = scx_bpf_create_dsq(test_dsq_id, -1);
+ if (err) {
+ scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err);
+ return err;
+ }
+ err = scx_bpf_create_dsq(real_dsq_id, -1);
+ if (err) {
+ scx_bpf_error("Failed to create DSQ %d: %d", real_dsq_id, err);
+ return err;
+ }
+
+ /* Create the DSQ pool for stress testing */
+ bpf_for(i, 0, DSQ_POOL_SIZE) {
+ int dsq_id = dsq_pool_base_id + i;
+
+ err = scx_bpf_create_dsq(dsq_id, -1);
+ if (err) {
+ scx_bpf_error("Failed to create DSQ pool entry %d: %d", dsq_id, err);
+ return err;
+ }
+ }
+
+ /* Initialize the peek results map */
+ bpf_for(i, 0, MAX_SAMPLES) {
+ u32 key = i;
+ long pid = -1;
+
+ bpf_map_update_elem(&peek_results, &key, &pid, BPF_ANY);
+ }
+
+ return 0;
+}
+
+void BPF_STRUCT_OPS(peek_dsq_exit, struct scx_exit_info *ei)
+{
+ int i;
+
+ /* Destroy the primary DSQs */
+ scx_bpf_destroy_dsq(test_dsq_id);
+ scx_bpf_destroy_dsq(real_dsq_id);
+
+ /* Destroy the DSQ pool */
+ bpf_for(i, 0, DSQ_POOL_SIZE) {
+ int dsq_id = dsq_pool_base_id + i;
+
+ scx_bpf_destroy_dsq(dsq_id);
+ }
+
+ UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops peek_dsq_ops = {
+ .enqueue = (void *)peek_dsq_enqueue,
+ .dispatch = (void *)peek_dsq_dispatch,
+ .init = (void *)peek_dsq_init,
+ .exit = (void *)peek_dsq_exit,
+ .name = "peek_dsq",
+};
diff --git a/tools/testing/selftests/sched_ext/peek_dsq.c b/tools/testing/selftests/sched_ext/peek_dsq.c
new file mode 100644
index 000000000000..a717384a3224
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/peek_dsq.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test for DSQ operations, including create, destroy, and peek.
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu>
+ */
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <string.h>
+#include <sched.h>
+#include "peek_dsq.bpf.skel.h"
+#include "scx_test.h"
+
+#define NUM_WORKERS 4
+
+static bool workload_running = true;
+static pthread_t workload_threads[NUM_WORKERS];
+
+/*
+ * Background workload thread that sleeps and wakes rapidly to exercise
+ * the scheduler's enqueue operations and ensure DSQ operations get tested.
+ */
+static void *workload_thread_fn(void *arg)
+{
+ while (workload_running) {
+ /* Sleep for a very short time to trigger scheduler activity */
+ usleep(1000); /* 1ms sleep */
+ /* Yield to ensure we go through the scheduler */
+ sched_yield();
+ }
+ return NULL;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct peek_dsq *skel;
+
+ skel = peek_dsq__open();
+ SCX_FAIL_IF(!skel, "Failed to open");
+ SCX_ENUM_INIT(skel);
+ SCX_FAIL_IF(peek_dsq__load(skel), "Failed to load skel");
+
+ *ctx = skel;
+
+ return SCX_TEST_PASS;
+}
+
+static int print_observed_pids(struct bpf_map *map, int max_samples, const char *dsq_name)
+{
+ long count = 0;
+
+ printf("Observed %s DSQ peek pids:\n", dsq_name);
+ for (int i = 0; i < max_samples; i++) {
+ long pid;
+ int err;
+
+ err = bpf_map_lookup_elem(bpf_map__fd(map), &i, &pid);
+ if (err == 0) {
+ if (pid == 0) {
+ printf(" Sample %d: NULL peek\n", i);
+ } else if (pid > 0) {
+ printf(" Sample %d: pid %ld\n", i, pid);
+ count++;
+ }
+ } else {
+ printf(" Sample %d: error reading pid (err=%d)\n", i, err);
+ }
+ }
+ printf("Observed ~%ld pids in the %s DSQ(s)\n", count, dsq_name);
+ return count;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct peek_dsq *skel = ctx;
+ bool failed = false;
+ int seconds = 3;
+ int err;
+
+ /* Enable the scheduler to test DSQ operations */
+ printf("Enabling scheduler to test DSQ insert operations...\n");
+
+ struct bpf_link *link =
+ bpf_map__attach_struct_ops(skel->maps.peek_dsq_ops);
+
+ if (!link) {
+ SCX_ERR("Failed to attach struct_ops");
+ return SCX_TEST_FAIL;
+ }
+
+ printf("Starting %d background workload threads...\n", NUM_WORKERS);
+ workload_running = true;
+ for (int i = 0; i < NUM_WORKERS; i++) {
+ err = pthread_create(&workload_threads[i], NULL, workload_thread_fn, NULL);
+ if (err) {
+ SCX_ERR("Failed to create workload thread %d: %s", i, strerror(err));
+ /* Stop already created threads */
+ workload_running = false;
+ for (int j = 0; j < i; j++)
+ pthread_join(workload_threads[j], NULL);
+ bpf_link__destroy(link);
+ return SCX_TEST_FAIL;
+ }
+ }
+
+ printf("Waiting for enqueue events.\n");
+ sleep(seconds);
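+	/* Poll until at least one enqueue is recorded, giving up after ~30s */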
+ while (skel->data->enqueue_count <= 0) {
+ printf(".");
+ fflush(stdout);
+ sleep(1);
+ seconds++;
+ if (seconds >= 30) {
+ printf("\n\u2717 Timeout waiting for enqueue events\n");
+ /* Stop workload threads and cleanup */
+ workload_running = false;
+ for (int i = 0; i < NUM_WORKERS; i++)
+ pthread_join(workload_threads[i], NULL);
+ bpf_link__destroy(link);
+ return SCX_TEST_FAIL;
+ }
+ }
+
+ workload_running = false;
+ for (int i = 0; i < NUM_WORKERS; i++) {
+ err = pthread_join(workload_threads[i], NULL);
+ if (err) {
+ SCX_ERR("Failed to join workload thread %d: %s", i, strerror(err));
+ bpf_link__destroy(link);
+ return SCX_TEST_FAIL;
+ }
+ }
+ printf("Background workload threads stopped.\n");
+
+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+
+ /* Detach the scheduler */
+ bpf_link__destroy(link);
+
+ printf("Enqueue/dispatch count over %d seconds: %d / %d\n", seconds,
+ skel->data->enqueue_count, skel->data->dispatch_count);
+ printf("Debug: ksym_exists=%d\n",
+ skel->bss->debug_ksym_exists);
+
+ /* Check DSQ insert result */
+ printf("DSQ insert test done on cpu: %d\n", skel->data->insert_test_cpu);
+	if (skel->data->insert_test_cpu != -1) {
+		printf("\u2713 DSQ insert succeeded!\n");
+	} else {
+		printf("\u2717 DSQ insert failed or not attempted\n");
+		failed = true;
+	}
+
+ /* Check DSQ peek results */
+ printf(" DSQ peek result 1 (before insert): %d\n",
+ skel->data->dsq_peek_result1);
+	if (skel->data->dsq_peek_result1 == 0) {
+		printf("\u2713 DSQ peek verification success: peek returned NULL!\n");
+	} else {
+		printf("\u2717 DSQ peek verification failed\n");
+		failed = true;
+	}
+
+ printf(" DSQ peek result 2 (after insert): %ld\n",
+ skel->data->dsq_peek_result2);
+ printf(" DSQ peek result 2, expected: %ld\n",
+ skel->data->dsq_peek_result2_expected);
+	if (skel->data->dsq_peek_result2 ==
+	    skel->data->dsq_peek_result2_expected) {
+		printf("\u2713 DSQ peek verification success: peek returned the inserted task!\n");
+	} else {
+		printf("\u2717 DSQ peek verification failed\n");
+		failed = true;
+	}
+
+ printf(" Inserted test task -> pid: %ld\n", skel->data->dsq_inserted_pid);
+ printf(" DSQ peek result 2 -> pid: %ld\n", skel->data->dsq_peek_result2_pid);
+
+ int pid_count;
+
+ pid_count = print_observed_pids(skel->maps.peek_results,
+ skel->data->max_samples, "DSQ pool");
+ printf("Total non-null peek observations: %ld out of %ld\n",
+ skel->data->successful_peeks, skel->data->total_peek_attempts);
+
+ if (skel->bss->debug_ksym_exists && pid_count == 0) {
+ printf("\u2717 DSQ pool test failed: no successful peeks in native mode\n");
+ failed = true;
+ }
+ if (skel->bss->debug_ksym_exists && pid_count > 0)
+ printf("\u2713 DSQ pool test success: observed successful peeks in native mode\n");
+
+ if (failed)
+ return SCX_TEST_FAIL;
+ else
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct peek_dsq *skel = ctx;
+
+ if (workload_running) {
+ workload_running = false;
+ for (int i = 0; i < NUM_WORKERS; i++)
+ pthread_join(workload_threads[i], NULL);
+ }
+
+ peek_dsq__destroy(skel);
+}
+
+struct scx_test peek_dsq = {
+ .name = "peek_dsq",
+ .description =
+		"Test DSQ create/destroy operations and peek functionality",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&peek_dsq)
diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c
index 308211d80436..49297b83d748 100644
--- a/tools/testing/selftests/sched_ext/reload_loop.c
+++ b/tools/testing/selftests/sched_ext/reload_loop.c
@@ -23,6 +23,9 @@ static enum scx_test_status setup(void **ctx)
SCX_ENUM_INIT(skel);
SCX_FAIL_IF(maximal__load(skel), "Failed to load skel");
+ bpf_map__set_autoattach(skel->maps.maximal_ops, false);
+ SCX_FAIL_IF(maximal__attach(skel), "Failed to attach skel");
+
return SCX_TEST_PASS;
}
diff --git a/tools/testing/selftests/sched_ext/rt_stall.bpf.c b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
new file mode 100644
index 000000000000..80086779dd1e
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/rt_stall.bpf.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * A scheduler that verifies whether RT tasks can stall SCHED_EXT tasks.
+ *
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+UEI_DEFINE(uei);
+
+void BPF_STRUCT_OPS(rt_stall_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SEC(".struct_ops.link")
+struct sched_ext_ops rt_stall_ops = {
+ .exit = (void *)rt_stall_exit,
+ .name = "rt_stall",
+};
diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c
new file mode 100644
index 000000000000..a5041fc2e44f
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/rt_stall.c
@@ -0,0 +1,293 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2025 NVIDIA Corporation.
+ */
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sched.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <linux/sched.h>
+#include <signal.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "rt_stall.bpf.skel.h"
+#include "scx_test.h"
+#include "../kselftest.h"
+
+#define CORE_ID 0 /* CPU to pin tasks to */
+#define RUN_TIME 5 /* How long to run the test in seconds */
+
+/* Signal the parent that setup is complete by writing to a pipe */
+static void signal_ready(int fd)
+{
+ char c = 1;
+
+ if (write(fd, &c, 1) != 1) {
+ perror("write to ready pipe");
+ exit(EXIT_FAILURE);
+ }
+ close(fd);
+}
+
+/* Wait for a child to signal readiness via a pipe */
+static void wait_ready(int fd)
+{
+ char c;
+
+ if (read(fd, &c, 1) != 1) {
+ perror("read from ready pipe");
+ exit(EXIT_FAILURE);
+ }
+ close(fd);
+}
+
+/* Simple busy-wait function for test tasks */
+static void process_func(void)
+{
+ while (1) {
+ /* Busy wait */
+ for (volatile unsigned long i = 0; i < 10000000UL; i++)
+ ;
+ }
+}
+
+/* Set CPU affinity to a specific core */
+static void set_affinity(int cpu)
+{
+ cpu_set_t mask;
+
+ CPU_ZERO(&mask);
+ CPU_SET(cpu, &mask);
+ if (sched_setaffinity(0, sizeof(mask), &mask) != 0) {
+ perror("sched_setaffinity");
+ exit(EXIT_FAILURE);
+ }
+}
+
+/* Set task scheduling policy and priority */
+static void set_sched(int policy, int priority)
+{
+ struct sched_param param;
+
+ param.sched_priority = priority;
+ if (sched_setscheduler(0, policy, &param) != 0) {
+ perror("sched_setscheduler");
+ exit(EXIT_FAILURE);
+ }
+}
+
+/* Get process runtime from /proc/<pid>/stat */
+static float get_process_runtime(int pid)
+{
+ char path[256];
+ FILE *file;
+ long utime, stime;
+ int fields;
+
+ snprintf(path, sizeof(path), "/proc/%d/stat", pid);
+ file = fopen(path, "r");
+ if (file == NULL) {
+ perror("Failed to open stat file");
+ return -1;
+ }
+
+ /* Skip the first 13 fields and read the 14th and 15th */
+ fields = fscanf(file,
+ "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu %lu",
+ &utime, &stime);
+ fclose(file);
+
+ if (fields != 2) {
+ fprintf(stderr, "Failed to read stat file\n");
+ return -1;
+ }
+
+ /* Calculate the total time spent in the process */
+ long total_time = utime + stime;
+ long ticks_per_second = sysconf(_SC_CLK_TCK);
+ float runtime_seconds = total_time * 1.0 / ticks_per_second;
+
+ return runtime_seconds;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct rt_stall *skel;
+
+ if (!__COMPAT_struct_has_field("rq", "ext_server")) {
+ fprintf(stderr, "SKIP: ext DL server not supported\n");
+ return SCX_TEST_SKIP;
+ }
+
+ skel = rt_stall__open();
+ SCX_FAIL_IF(!skel, "Failed to open");
+ SCX_ENUM_INIT(skel);
+ SCX_FAIL_IF(rt_stall__load(skel), "Failed to load skel");
+
+ *ctx = skel;
+
+ return SCX_TEST_PASS;
+}
+
+static bool sched_stress_test(bool is_ext)
+{
+ /*
+	 * We expect the EXT task to get around 5% of CPU time when
+	 * competing with the RT task (small ~1% fluctuations are normal).
+ *
+ * However, the EXT task should get at least 4% of the CPU to prove
+ * that the EXT deadline server is working correctly. A percentage
+ * less than 4% indicates a bug where RT tasks can potentially
+ * stall SCHED_EXT tasks, causing the test to fail.
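+	 * For example, over the 5-second measurement window, 4% corresponds
+	 * to roughly 0.2 seconds of accumulated runtime.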
+ */
+ const float expected_min_ratio = 0.04; /* 4% */
+ const char *class_str = is_ext ? "EXT" : "FAIR";
+
+ float ext_runtime, rt_runtime, actual_ratio;
+ int ext_pid, rt_pid;
+ int ext_ready[2], rt_ready[2];
+
+ ksft_print_header();
+ ksft_set_plan(1);
+
+ if (pipe(ext_ready) || pipe(rt_ready)) {
+ perror("pipe");
+ ksft_exit_fail();
+ }
+
+	/* Create and set up an EXT task */
+ ext_pid = fork();
+ if (ext_pid == 0) {
+ close(ext_ready[0]);
+ close(rt_ready[0]);
+ close(rt_ready[1]);
+ set_affinity(CORE_ID);
+ signal_ready(ext_ready[1]);
+ process_func();
+ exit(0);
+ } else if (ext_pid < 0) {
+ perror("fork task");
+ ksft_exit_fail();
+ }
+
+ /* Create an RT task */
+ rt_pid = fork();
+ if (rt_pid == 0) {
+ close(ext_ready[0]);
+ close(ext_ready[1]);
+ close(rt_ready[0]);
+ set_affinity(CORE_ID);
+ set_sched(SCHED_FIFO, 50);
+ signal_ready(rt_ready[1]);
+ process_func();
+ exit(0);
+ } else if (rt_pid < 0) {
+ perror("fork for RT task");
+ ksft_exit_fail();
+ }
+
+ /*
+ * Wait for both children to complete their setup (affinity and
+ * scheduling policy) before starting the measurement window.
+ * This prevents flaky failures caused by the RT child's setup
+ * time eating into the measurement period.
+ */
+ close(ext_ready[1]);
+ close(rt_ready[1]);
+ wait_ready(ext_ready[0]);
+ wait_ready(rt_ready[0]);
+
+ /* Let the processes run for the specified time */
+ sleep(RUN_TIME);
+
+ /* Get runtime for the EXT task */
+ ext_runtime = get_process_runtime(ext_pid);
+ if (ext_runtime == -1)
+ ksft_exit_fail_msg("Error getting runtime for %s task (PID %d)\n",
+ class_str, ext_pid);
+ ksft_print_msg("Runtime of %s task (PID %d) is %f seconds\n",
+ class_str, ext_pid, ext_runtime);
+
+ /* Get runtime for the RT task */
+ rt_runtime = get_process_runtime(rt_pid);
+ if (rt_runtime == -1)
+ ksft_exit_fail_msg("Error getting runtime for RT task (PID %d)\n", rt_pid);
+ ksft_print_msg("Runtime of RT task (PID %d) is %f seconds\n", rt_pid, rt_runtime);
+
+ /* Kill the processes */
+ kill(ext_pid, SIGKILL);
+ kill(rt_pid, SIGKILL);
+ waitpid(ext_pid, NULL, 0);
+ waitpid(rt_pid, NULL, 0);
+
+ /* Verify that the scx task got enough runtime */
+ actual_ratio = ext_runtime / (ext_runtime + rt_runtime);
+ ksft_print_msg("%s task got %.2f%% of total runtime\n",
+ class_str, actual_ratio * 100);
+
+ if (actual_ratio >= expected_min_ratio) {
+ ksft_test_result_pass("PASS: %s task got more than %.2f%% of runtime\n",
+ class_str, expected_min_ratio * 100);
+ return true;
+ }
+ ksft_test_result_fail("FAIL: %s task got less than %.2f%% of runtime\n",
+ class_str, expected_min_ratio * 100);
+ return false;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct rt_stall *skel = ctx;
+ struct bpf_link *link = NULL;
+ bool res;
+ int i;
+
+ /*
+ * Test if the dl_server is working both with and without the
+ * sched_ext scheduler attached.
+ *
+ * This ensures all the scenarios are covered:
+ * - fair_server stop -> ext_server start
+	 * - ext_server stop -> fair_server start
+ */
+ for (i = 0; i < 4; i++) {
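+		/* Odd iterations run with the sched_ext scheduler attached */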
+ bool is_ext = i % 2;
+
+ if (is_ext) {
+ memset(&skel->data->uei, 0, sizeof(skel->data->uei));
+ link = bpf_map__attach_struct_ops(skel->maps.rt_stall_ops);
+ SCX_FAIL_IF(!link, "Failed to attach scheduler");
+ }
+ res = sched_stress_test(is_ext);
+ if (is_ext) {
+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE));
+ bpf_link__destroy(link);
+ }
+
+ if (!res)
+ ksft_exit_fail();
+ }
+
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct rt_stall *skel = ctx;
+
+ rt_stall__destroy(skel);
+}
+
+struct scx_test rt_stall = {
+ .name = "rt_stall",
+ .description = "Verify that RT tasks cannot stall SCHED_EXT tasks",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&rt_stall)
diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c
index aa2d7d32dda9..c264807caa91 100644
--- a/tools/testing/selftests/sched_ext/runner.c
+++ b/tools/testing/selftests/sched_ext/runner.c
@@ -18,7 +18,7 @@ const char help_fmt[] =
"It's required for the testcases to be serial, as only a single host-wide sched_ext\n"
"scheduler may be loaded at any given time."
"\n"
-"Usage: %s [-t TEST] [-h]\n"
+"Usage: %s [-t TEST] [-s] [-l] [-q]\n"
"\n"
" -t TEST Only run tests whose name includes this string\n"
" -s Include print output for skipped tests\n"
@@ -46,6 +46,14 @@ static void print_test_preamble(const struct scx_test *test, bool quiet)
if (!quiet)
printf("DESCRIPTION: %s\n", test->description);
printf("OUTPUT:\n");
+
+ /*
+ * The tests may fork with the preamble buffered
+ * in the children's stdout. Flush before the test
+ * to avoid printing the message multiple times.
+ */
+ fflush(stdout);
+ fflush(stderr);
}
static const char *status_to_result(enum scx_test_status status)
@@ -125,6 +133,8 @@ static bool test_valid(const struct scx_test *test)
int main(int argc, char **argv)
{
const char *filter = NULL;
+ const char *failed_tests[MAX_SCX_TESTS];
+ const char *skipped_tests[MAX_SCX_TESTS];
unsigned testnum = 0, i;
unsigned passed = 0, skipped = 0, failed = 0;
int opt;
@@ -154,10 +164,33 @@ int main(int argc, char **argv)
}
}
+ if (optind < argc) {
+ fprintf(stderr, "Unexpected argument '%s'. Use -t to filter tests.\n",
+ argv[optind]);
+ return 1;
+ }
+
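+	/* Fail fast if the filter matches no tests, listing what is available */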
+ if (filter) {
+ for (i = 0; i < __scx_num_tests; i++) {
+ if (!should_skip_test(&__scx_tests[i], filter))
+ break;
+ }
+ if (i == __scx_num_tests) {
+ fprintf(stderr, "No tests matched filter '%s'\n", filter);
+ fprintf(stderr, "Available tests (use -l to list):\n");
+ for (i = 0; i < __scx_num_tests; i++)
+ fprintf(stderr, " %s\n", __scx_tests[i].name);
+ return 1;
+ }
+ }
+
for (i = 0; i < __scx_num_tests; i++) {
enum scx_test_status status;
struct scx_test *test = &__scx_tests[i];
+ if (exit_req)
+ break;
+
if (list) {
printf("%s\n", test->name);
if (i == (__scx_num_tests - 1))
@@ -187,10 +220,10 @@ int main(int argc, char **argv)
passed++;
break;
case SCX_TEST_SKIP:
- skipped++;
+ skipped_tests[skipped++] = test->name;
break;
case SCX_TEST_FAIL:
- failed++;
+ failed_tests[failed++] = test->name;
break;
}
}
@@ -199,8 +232,18 @@ int main(int argc, char **argv)
printf("PASSED: %u\n", passed);
printf("SKIPPED: %u\n", skipped);
printf("FAILED: %u\n", failed);
+ if (skipped > 0) {
+ printf("\nSkipped tests:\n");
+ for (i = 0; i < skipped; i++)
+ printf(" - %s\n", skipped_tests[i]);
+ }
+ if (failed > 0) {
+ printf("\nFailed tests:\n");
+ for (i = 0; i < failed; i++)
+ printf(" - %s\n", failed_tests[i]);
+ }
- return 0;
+ return failed > 0 ? 1 : 0;
}
void scx_test_register(struct scx_test *test)
diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
index bfcb96cd4954..eec70d388cbf 100644
--- a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
+++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c
@@ -53,7 +53,7 @@ ddsp:
void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p)
{
- if (scx_bpf_dsq_move_to_local(VTIME_DSQ))
+ if (scx_bpf_dsq_move_to_local(VTIME_DSQ, 0))
consumed = true;
}
@@ -66,12 +66,14 @@ void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p)
void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p,
bool runnable)
{
- p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
+ u64 delta = scale_by_task_weight_inverse(p, SCX_SLICE_DFL - p->scx.slice);
+
+ scx_bpf_task_set_dsq_vtime(p, p->scx.dsq_vtime + delta);
}
void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p)
{
- p->scx.dsq_vtime = vtime_now;
+ scx_bpf_task_set_dsq_vtime(p, vtime_now);
}
s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init)
diff --git a/tools/testing/selftests/sched_ext/total_bw.c b/tools/testing/selftests/sched_ext/total_bw.c
new file mode 100644
index 000000000000..5b0a619bab86
--- /dev/null
+++ b/tools/testing/selftests/sched_ext/total_bw.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test to verify that the total_bw value remains consistent across all CPUs
+ * in different BPF program states.
+ *
+ * Copyright (C) 2025 NVIDIA Corporation.
+ */
+#include <bpf/bpf.h>
+#include <errno.h>
+#include <pthread.h>
+#include <scx/common.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "minimal.bpf.skel.h"
+#include "scx_test.h"
+
+#define MAX_CPUS 512
+#define STRESS_DURATION_SEC 5
+
+struct total_bw_ctx {
+ struct minimal *skel;
+ long baseline_bw[MAX_CPUS];
+ int nr_cpus;
+};
+
+static void *cpu_stress_thread(void *arg)
+{
+ volatile int i;
+ time_t end_time = time(NULL) + STRESS_DURATION_SEC;
+
+ while (time(NULL) < end_time)
+ for (i = 0; i < 1000000; i++)
+ ;
+
+ return NULL;
+}
+
+/*
+ * The first enqueue on a CPU causes the DL server to start; for that
+ * reason, run stressor threads in the hope that they run on all CPUs.
+ */
+static int run_cpu_stress(int nr_cpus)
+{
+ pthread_t *threads;
+ int i, ret = 0;
+
+ threads = calloc(nr_cpus, sizeof(pthread_t));
+ if (!threads)
+ return -ENOMEM;
+
+ /* Create threads to run on each CPU */
+ for (i = 0; i < nr_cpus; i++) {
+ if (pthread_create(&threads[i], NULL, cpu_stress_thread, NULL)) {
+ ret = -errno;
+ fprintf(stderr, "Failed to create thread %d: %s\n", i, strerror(-ret));
+ break;
+ }
+ }
+
+ /* Wait for all threads to complete */
+ for (i = 0; i < nr_cpus; i++) {
+ if (threads[i])
+ pthread_join(threads[i], NULL);
+ }
+
+ free(threads);
+ return ret;
+}
+
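+/* Collect the per-CPU total_bw values reported in the sched debugfs file */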
+static int read_total_bw_values(long *bw_values, int max_cpus)
+{
+ FILE *fp;
+ char line[256];
+ int cpu_count = 0;
+
+ fp = fopen("/sys/kernel/debug/sched/debug", "r");
+ if (!fp) {
+ SCX_ERR("Failed to open debug file");
+ return -1;
+ }
+
+ while (fgets(line, sizeof(line), fp)) {
+ char *bw_str = strstr(line, "total_bw");
+
+ if (bw_str) {
+ bw_str = strchr(bw_str, ':');
+ if (bw_str) {
+ /* Only store up to max_cpus values */
+ if (cpu_count < max_cpus)
+ bw_values[cpu_count] = atol(bw_str + 1);
+ cpu_count++;
+ }
+ }
+ }
+
+ fclose(fp);
+ return cpu_count;
+}
+
+static bool verify_total_bw_consistency(long *bw_values, int count)
+{
+ int i;
+ long first_value;
+
+ if (count <= 0)
+ return false;
+
+ first_value = bw_values[0];
+
+ for (i = 1; i < count; i++) {
+ if (bw_values[i] != first_value) {
+ SCX_ERR("Inconsistent total_bw: CPU0=%ld, CPU%d=%ld",
+ first_value, i, bw_values[i]);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static int fetch_verify_total_bw(long *bw_values, int nr_cpus)
+{
+ int attempts = 0;
+ int max_attempts = 10;
+ int count;
+
+ /*
+	 * The first enqueue on a CPU causes the DL server to start; for that
+	 * reason, run stressor threads in the hope that they run on all CPUs.
+ */
+ if (run_cpu_stress(nr_cpus) < 0) {
+ SCX_ERR("Failed to run CPU stress");
+ return -1;
+ }
+
+ /* Try multiple times to get stable values */
+ while (attempts < max_attempts) {
+ count = read_total_bw_values(bw_values, nr_cpus);
+ fprintf(stderr, "Read %d total_bw values (testing %d CPUs)\n", count, nr_cpus);
+ /* If system has more CPUs than we're testing, that's OK */
+ if (count < nr_cpus) {
+ SCX_ERR("Expected at least %d CPUs, got %d", nr_cpus, count);
+ attempts++;
+ sleep(1);
+ continue;
+ }
+
+ /* Only verify the CPUs we're testing */
+ if (verify_total_bw_consistency(bw_values, nr_cpus)) {
+ fprintf(stderr, "Values are consistent: %ld\n", bw_values[0]);
+ return 0;
+ }
+
+ attempts++;
+ sleep(1);
+ }
+
+ return -1;
+}
+
+static enum scx_test_status setup(void **ctx)
+{
+ struct total_bw_ctx *test_ctx;
+
+ if (access("/sys/kernel/debug/sched/debug", R_OK) != 0) {
+ fprintf(stderr, "Skipping test: debugfs sched/debug not accessible\n");
+ return SCX_TEST_SKIP;
+ }
+
+ test_ctx = calloc(1, sizeof(*test_ctx));
+ if (!test_ctx)
+ return SCX_TEST_FAIL;
+
+ test_ctx->nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+ if (test_ctx->nr_cpus <= 0) {
+ free(test_ctx);
+ return SCX_TEST_FAIL;
+ }
+
+ /* If system has more CPUs than MAX_CPUS, just test the first MAX_CPUS */
+ if (test_ctx->nr_cpus > MAX_CPUS)
+ test_ctx->nr_cpus = MAX_CPUS;
+
+ /* Test scenario 1: BPF program not loaded */
+ /* Read and verify baseline total_bw before loading BPF program */
+ fprintf(stderr, "BPF prog initially not loaded, reading total_bw values\n");
+ if (fetch_verify_total_bw(test_ctx->baseline_bw, test_ctx->nr_cpus) < 0) {
+ SCX_ERR("Failed to get stable baseline values");
+ free(test_ctx);
+ return SCX_TEST_FAIL;
+ }
+
+ /* Load the BPF skeleton */
+ test_ctx->skel = minimal__open();
+ if (!test_ctx->skel) {
+ free(test_ctx);
+ return SCX_TEST_FAIL;
+ }
+
+ SCX_ENUM_INIT(test_ctx->skel);
+ if (minimal__load(test_ctx->skel)) {
+ minimal__destroy(test_ctx->skel);
+ free(test_ctx);
+ return SCX_TEST_FAIL;
+ }
+
+ *ctx = test_ctx;
+ return SCX_TEST_PASS;
+}
+
+static enum scx_test_status run(void *ctx)
+{
+ struct total_bw_ctx *test_ctx = ctx;
+ struct bpf_link *link;
+ long loaded_bw[MAX_CPUS];
+ long unloaded_bw[MAX_CPUS];
+ int i;
+
+ /* Test scenario 2: BPF program loaded */
+ link = bpf_map__attach_struct_ops(test_ctx->skel->maps.minimal_ops);
+ if (!link) {
+ SCX_ERR("Failed to attach scheduler");
+ return SCX_TEST_FAIL;
+ }
+
+ fprintf(stderr, "BPF program loaded, reading total_bw values\n");
+ if (fetch_verify_total_bw(loaded_bw, test_ctx->nr_cpus) < 0) {
+ SCX_ERR("Failed to get stable values with BPF loaded");
+ bpf_link__destroy(link);
+ return SCX_TEST_FAIL;
+ }
+ bpf_link__destroy(link);
+
+ /* Test scenario 3: BPF program unloaded */
+ fprintf(stderr, "BPF program unloaded, reading total_bw values\n");
+ if (fetch_verify_total_bw(unloaded_bw, test_ctx->nr_cpus) < 0) {
+ SCX_ERR("Failed to get stable values after BPF unload");
+ return SCX_TEST_FAIL;
+ }
+
+ /* Verify all three scenarios have the same total_bw values */
+ for (i = 0; i < test_ctx->nr_cpus; i++) {
+ if (test_ctx->baseline_bw[i] != loaded_bw[i]) {
+ SCX_ERR("CPU%d: baseline_bw=%ld != loaded_bw=%ld",
+ i, test_ctx->baseline_bw[i], loaded_bw[i]);
+ return SCX_TEST_FAIL;
+ }
+
+ if (test_ctx->baseline_bw[i] != unloaded_bw[i]) {
+ SCX_ERR("CPU%d: baseline_bw=%ld != unloaded_bw=%ld",
+ i, test_ctx->baseline_bw[i], unloaded_bw[i]);
+ return SCX_TEST_FAIL;
+ }
+ }
+
+ fprintf(stderr, "All total_bw values are consistent across all scenarios\n");
+ return SCX_TEST_PASS;
+}
+
+static void cleanup(void *ctx)
+{
+ struct total_bw_ctx *test_ctx = ctx;
+
+ if (test_ctx) {
+ if (test_ctx->skel)
+ minimal__destroy(test_ctx->skel);
+ free(test_ctx);
+ }
+}
+
+struct scx_test total_bw = {
+ .name = "total_bw",
+ .description = "Verify total_bw consistency across BPF program states",
+ .setup = setup,
+ .run = run,
+ .cleanup = cleanup,
+};
+REGISTER_SCX_TEST(&total_bw)
diff --git a/tools/testing/selftests/sched_ext/util.c b/tools/testing/selftests/sched_ext/util.c
index e47769c91918..2111329ed289 100644
--- a/tools/testing/selftests/sched_ext/util.c
+++ b/tools/testing/selftests/sched_ext/util.c
@@ -60,11 +60,11 @@ int file_write_long(const char *path, long val)
char buf[64];
int ret;
- ret = sprintf(buf, "%lu", val);
+ ret = sprintf(buf, "%ld", val);
if (ret < 0)
return ret;
- if (write_text(path, buf, sizeof(buf)) <= 0)
+ if (write_text(path, buf, ret) <= 0)
return -1;
return 0;
diff --git a/tools/testing/selftests/sched_ext/util.h b/tools/testing/selftests/sched_ext/util.h
index bc13dfec1267..681cec04b439 100644
--- a/tools/testing/selftests/sched_ext/util.h
+++ b/tools/testing/selftests/sched_ext/util.h
@@ -10,4 +10,4 @@
long file_read_long(const char *path);
int file_write_long(const char *path, long val);
-#endif // __SCX_TEST_H__
+#endif // __SCX_TEST_UTIL_H__