Diffstat (limited to 'tools/testing/selftests/sched_ext')
31 files changed, 2664 insertions, 141 deletions
diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile index 011762224600..789037be44c7 100644 --- a/tools/testing/selftests/sched_ext/Makefile +++ b/tools/testing/selftests/sched_ext/Makefile @@ -93,6 +93,8 @@ BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \ $(CLANG_SYS_INCLUDES) \ -Wall -Wno-compare-distinct-pointer-types \ -Wno-incompatible-function-pointer-types \ + -Wno-microsoft-anon-tag \ + -fms-extensions \ -O2 -mcpu=v3 # sort removes libbpf duplicates when not cross-building @@ -161,17 +163,21 @@ all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubs auto-test-targets := \ create_dsq \ + dequeue \ enq_last_no_enq_fails \ - enq_select_cpu_fails \ ddsp_bogus_dsq_fail \ ddsp_vtimelocal_fail \ dsp_local_on \ + enq_select_cpu \ exit \ hotplug \ init_enable_count \ maximal \ maybe_null \ minimal \ + numa \ + allowed_cpus \ + peek_dsq \ prog_run \ reload_loop \ select_cpu_dfl \ @@ -180,7 +186,10 @@ auto-test-targets := \ select_cpu_dispatch_bad_dsq \ select_cpu_dispatch_dbl_dsp \ select_cpu_vtime \ + rt_stall \ test_example \ + total_bw \ + cyclic_kick_wait \ testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) diff --git a/tools/testing/selftests/sched_ext/allowed_cpus.bpf.c b/tools/testing/selftests/sched_ext/allowed_cpus.bpf.c new file mode 100644 index 000000000000..35923e74a2ec --- /dev/null +++ b/tools/testing/selftests/sched_ext/allowed_cpus.bpf.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A scheduler that validates the behavior of scx_bpf_select_cpu_and() by + * selecting idle CPUs strictly within a subset of allowed CPUs. + * + * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +private(PREF_CPUS) struct bpf_cpumask __kptr * allowed_cpumask; + +static void +validate_idle_cpu(const struct task_struct *p, const struct cpumask *allowed, s32 cpu) +{ + if (scx_bpf_test_and_clear_cpu_idle(cpu)) + scx_bpf_error("CPU %d should be marked as busy", cpu); + + if (bpf_cpumask_subset(allowed, p->cpus_ptr) && + !bpf_cpumask_test_cpu(cpu, allowed)) + scx_bpf_error("CPU %d not in the allowed domain for %d (%s)", + cpu, p->pid, p->comm); +} + +s32 BPF_STRUCT_OPS(allowed_cpus_select_cpu, + struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + const struct cpumask *allowed; + s32 cpu; + + allowed = cast_mask(allowed_cpumask); + if (!allowed) { + scx_bpf_error("allowed domain not initialized"); + return -EINVAL; + } + + /* + * Select an idle CPU strictly within the allowed domain. + */ + cpu = scx_bpf_select_cpu_and(p, prev_cpu, wake_flags, allowed, 0); + if (cpu >= 0) { + validate_idle_cpu(p, allowed, cpu); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + + return cpu; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(allowed_cpus_enqueue, struct task_struct *p, u64 enq_flags) +{ + const struct cpumask *allowed; + s32 prev_cpu = scx_bpf_task_cpu(p), cpu; + + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + allowed = cast_mask(allowed_cpumask); + if (!allowed) { + scx_bpf_error("allowed domain not initialized"); + return; + } + + /* + * Use scx_bpf_select_cpu_and() to proactively kick an idle CPU + * within @allowed_cpumask, usable by @p. 
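+	 * If an idle CPU is found, kicking it with SCX_KICK_IDLE below lets
+	 * it wake up and pull the task just inserted into SCX_DSQ_GLOBAL.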
+ */ + cpu = scx_bpf_select_cpu_and(p, prev_cpu, 0, allowed, 0); + if (cpu >= 0) { + validate_idle_cpu(p, allowed, cpu); + scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); + } +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(allowed_cpus_init) +{ + struct bpf_cpumask *mask; + + mask = bpf_cpumask_create(); + if (!mask) + return -ENOMEM; + + mask = bpf_kptr_xchg(&allowed_cpumask, mask); + if (mask) + bpf_cpumask_release(mask); + + bpf_rcu_read_lock(); + + /* + * Assign the first online CPU to the allowed domain. + */ + mask = allowed_cpumask; + if (mask) { + const struct cpumask *online = scx_bpf_get_online_cpumask(); + + bpf_cpumask_set_cpu(bpf_cpumask_first(online), mask); + scx_bpf_put_cpumask(online); + } + + bpf_rcu_read_unlock(); + + return 0; +} + +void BPF_STRUCT_OPS(allowed_cpus_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +struct task_cpu_arg { + pid_t pid; +}; + +SEC("syscall") +int select_cpu_from_user(struct task_cpu_arg *input) +{ + struct task_struct *p; + int cpu; + + p = bpf_task_from_pid(input->pid); + if (!p) + return -EINVAL; + + bpf_rcu_read_lock(); + cpu = scx_bpf_select_cpu_and(p, bpf_get_smp_processor_id(), 0, p->cpus_ptr, 0); + bpf_rcu_read_unlock(); + + bpf_task_release(p); + + return cpu; +} + +SEC(".struct_ops.link") +struct sched_ext_ops allowed_cpus_ops = { + .select_cpu = (void *)allowed_cpus_select_cpu, + .enqueue = (void *)allowed_cpus_enqueue, + .init = (void *)allowed_cpus_init, + .exit = (void *)allowed_cpus_exit, + .name = "allowed_cpus", +}; diff --git a/tools/testing/selftests/sched_ext/allowed_cpus.c b/tools/testing/selftests/sched_ext/allowed_cpus.c new file mode 100644 index 000000000000..093f285ab4ba --- /dev/null +++ b/tools/testing/selftests/sched_ext/allowed_cpus.c @@ -0,0 +1,84 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "allowed_cpus.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct allowed_cpus *skel; + + skel = allowed_cpus__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(allowed_cpus__load(skel), "Failed to load skel"); + + *ctx = skel; + + return SCX_TEST_PASS; +} + +static int test_select_cpu_from_user(const struct allowed_cpus *skel) +{ + int fd, ret; + __u64 args[1]; + + LIBBPF_OPTS(bpf_test_run_opts, attr, + .ctx_in = args, + .ctx_size_in = sizeof(args), + ); + + args[0] = getpid(); + fd = bpf_program__fd(skel->progs.select_cpu_from_user); + if (fd < 0) + return fd; + + ret = bpf_prog_test_run_opts(fd, &attr); + if (ret < 0) + return ret; + + fprintf(stderr, "%s: CPU %d\n", __func__, attr.retval); + + return 0; +} + +static enum scx_test_status run(void *ctx) +{ + struct allowed_cpus *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.allowed_cpus_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + /* Pick an idle CPU from user-space */ + SCX_FAIL_IF(test_select_cpu_from_user(skel), "Failed to pick idle CPU"); + + /* Just sleeping is fine, plenty of scheduling events happening */ + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct allowed_cpus *skel = ctx; + + allowed_cpus__destroy(skel); +} + +struct scx_test allowed_cpus = { + .name = "allowed_cpus", + .description = "Verify scx_bpf_select_cpu_and()", + .setup = setup, + .run = run, + 
.cleanup = cleanup, +}; +REGISTER_SCX_TEST(&allowed_cpus) diff --git a/tools/testing/selftests/sched_ext/config b/tools/testing/selftests/sched_ext/config index 0de9b4ee249d..aa901b05c8ad 100644 --- a/tools/testing/selftests/sched_ext/config +++ b/tools/testing/selftests/sched_ext/config @@ -1,4 +1,3 @@ -CONFIG_SCHED_DEBUG=y CONFIG_SCHED_CLASS_EXT=y CONFIG_CGROUPS=y CONFIG_CGROUP_SCHED=y diff --git a/tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c b/tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c new file mode 100644 index 000000000000..cb34d3335917 --- /dev/null +++ b/tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Stress concurrent SCX_KICK_WAIT calls to reproduce wait-cycle deadlock. + * + * Three CPUs are designated from userspace. Every enqueue from one of the + * three CPUs kicks the next CPU in the ring with SCX_KICK_WAIT, creating a + * persistent A -> B -> C -> A wait cycle pressure. + */ +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +const volatile s32 test_cpu_a; +const volatile s32 test_cpu_b; +const volatile s32 test_cpu_c; + +u64 nr_enqueues; +u64 nr_wait_kicks; + +UEI_DEFINE(uei); + +static s32 target_cpu(s32 cpu) +{ + if (cpu == test_cpu_a) + return test_cpu_b; + if (cpu == test_cpu_b) + return test_cpu_c; + if (cpu == test_cpu_c) + return test_cpu_a; + return -1; +} + +void BPF_STRUCT_OPS(cyclic_kick_wait_enqueue, struct task_struct *p, + u64 enq_flags) +{ + s32 this_cpu = bpf_get_smp_processor_id(); + s32 tgt; + + __sync_fetch_and_add(&nr_enqueues, 1); + + if (p->flags & PF_KTHREAD) { + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, + enq_flags | SCX_ENQ_PREEMPT); + return; + } + + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + + tgt = target_cpu(this_cpu); + if (tgt < 0 || tgt == this_cpu) + return; + + __sync_fetch_and_add(&nr_wait_kicks, 1); + scx_bpf_kick_cpu(tgt, SCX_KICK_WAIT); +} + +void BPF_STRUCT_OPS(cyclic_kick_wait_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops cyclic_kick_wait_ops = { + .enqueue = cyclic_kick_wait_enqueue, + .exit = cyclic_kick_wait_exit, + .name = "cyclic_kick_wait", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/cyclic_kick_wait.c b/tools/testing/selftests/sched_ext/cyclic_kick_wait.c new file mode 100644 index 000000000000..c2e5aa9de715 --- /dev/null +++ b/tools/testing/selftests/sched_ext/cyclic_kick_wait.c @@ -0,0 +1,194 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Test SCX_KICK_WAIT forward progress under cyclic wait pressure. + * + * SCX_KICK_WAIT busy-waits until the target CPU enters the scheduling path. + * If multiple CPUs form a wait cycle (A waits for B, B waits for C, C waits + * for A), all CPUs deadlock unless the implementation breaks the cycle. + * + * This test creates that scenario: three CPUs are arranged in a ring. The BPF + * scheduler's ops.enqueue() kicks the next CPU in the ring with SCX_KICK_WAIT + * on every enqueue. Userspace pins 4 worker threads per CPU that loop calling + * sched_yield(), generating a steady stream of enqueues and thus sustained + * A->B->C->A kick_wait cycle pressure. The test passes if the system remains + * responsive for 5 seconds without the scheduler being killed by the watchdog. 
+ */ +#define _GNU_SOURCE + +#include <bpf/bpf.h> +#include <errno.h> +#include <pthread.h> +#include <sched.h> +#include <scx/common.h> +#include <stdint.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +#include "scx_test.h" +#include "cyclic_kick_wait.bpf.skel.h" + +#define WORKERS_PER_CPU 4 +#define NR_TEST_CPUS 3 +#define NR_WORKERS (NR_TEST_CPUS * WORKERS_PER_CPU) + +struct worker_ctx { + pthread_t tid; + int cpu; + volatile bool stop; + volatile __u64 iters; + bool started; +}; + +static void *worker_fn(void *arg) +{ + struct worker_ctx *worker = arg; + cpu_set_t mask; + + CPU_ZERO(&mask); + CPU_SET(worker->cpu, &mask); + + if (sched_setaffinity(0, sizeof(mask), &mask)) + return (void *)(uintptr_t)errno; + + while (!worker->stop) { + sched_yield(); + worker->iters++; + } + + return NULL; +} + +static int join_worker(struct worker_ctx *worker) +{ + void *ret; + struct timespec ts; + int err; + + if (!worker->started) + return 0; + + if (clock_gettime(CLOCK_REALTIME, &ts)) + return -errno; + + ts.tv_sec += 2; + err = pthread_timedjoin_np(worker->tid, &ret, &ts); + if (err == ETIMEDOUT) + pthread_detach(worker->tid); + if (err) + return -err; + + if ((uintptr_t)ret) + return -(int)(uintptr_t)ret; + + return 0; +} + +static enum scx_test_status setup(void **ctx) +{ + struct cyclic_kick_wait *skel; + + skel = cyclic_kick_wait__open(); + SCX_FAIL_IF(!skel, "Failed to open skel"); + SCX_ENUM_INIT(skel); + + *ctx = skel; + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct cyclic_kick_wait *skel = ctx; + struct worker_ctx workers[NR_WORKERS] = {}; + struct bpf_link *link = NULL; + enum scx_test_status status = SCX_TEST_PASS; + int test_cpus[NR_TEST_CPUS]; + int nr_cpus = 0; + cpu_set_t mask; + int ret, i; + + if (sched_getaffinity(0, sizeof(mask), &mask)) { + SCX_ERR("Failed to get affinity (%d)", errno); + return SCX_TEST_FAIL; + } + + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, &mask)) + test_cpus[nr_cpus++] = i; + if (nr_cpus == NR_TEST_CPUS) + break; + } + + if (nr_cpus < NR_TEST_CPUS) + return SCX_TEST_SKIP; + + skel->rodata->test_cpu_a = test_cpus[0]; + skel->rodata->test_cpu_b = test_cpus[1]; + skel->rodata->test_cpu_c = test_cpus[2]; + + if (cyclic_kick_wait__load(skel)) { + SCX_ERR("Failed to load skel"); + return SCX_TEST_FAIL; + } + + link = bpf_map__attach_struct_ops(skel->maps.cyclic_kick_wait_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + for (i = 0; i < NR_WORKERS; i++) + workers[i].cpu = test_cpus[i / WORKERS_PER_CPU]; + + for (i = 0; i < NR_WORKERS; i++) { + ret = pthread_create(&workers[i].tid, NULL, worker_fn, &workers[i]); + if (ret) { + SCX_ERR("Failed to create worker thread %d (%d)", i, ret); + status = SCX_TEST_FAIL; + goto out; + } + workers[i].started = true; + } + + sleep(5); + + if (skel->data->uei.kind != EXIT_KIND(SCX_EXIT_NONE)) { + SCX_ERR("Scheduler exited unexpectedly (kind=%llu code=%lld)", + (unsigned long long)skel->data->uei.kind, + (long long)skel->data->uei.exit_code); + status = SCX_TEST_FAIL; + } + +out: + for (i = 0; i < NR_WORKERS; i++) + workers[i].stop = true; + + for (i = 0; i < NR_WORKERS; i++) { + ret = join_worker(&workers[i]); + if (ret && status == SCX_TEST_PASS) { + SCX_ERR("Failed to join worker thread %d (%d)", i, ret); + status = SCX_TEST_FAIL; + } + } + + if (link) + bpf_link__destroy(link); + + return status; +} + +static void cleanup(void *ctx) +{ + struct cyclic_kick_wait *skel = ctx; + + cyclic_kick_wait__destroy(skel); +} + 
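Note that the BPF side exports nr_enqueues and nr_wait_kicks but run() never asserts on them, so a run in which the workers never trigger any kicks would still pass. A minimal strengthening, sketched here under the assumption that the zero-initialized globals are exposed through skel->bss as usual for libbpf skeletons, would be an extra check in run() after the 5-second window:

	/* Sketch only: confirm the ring actually generated SCX_KICK_WAIT pressure. */
	if (skel->bss->nr_wait_kicks == 0) {
		SCX_ERR("no SCX_KICK_WAIT kicks were issued");
		status = SCX_TEST_FAIL;
	}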
+struct scx_test cyclic_kick_wait = { + .name = "cyclic_kick_wait", + .description = "Verify SCX_KICK_WAIT forward progress under a 3-CPU wait cycle", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&cyclic_kick_wait) diff --git a/tools/testing/selftests/sched_ext/dequeue.bpf.c b/tools/testing/selftests/sched_ext/dequeue.bpf.c new file mode 100644 index 000000000000..624e2ccb0688 --- /dev/null +++ b/tools/testing/selftests/sched_ext/dequeue.bpf.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A scheduler that validates ops.dequeue() is called correctly: + * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF + * scheduler entirely: no ops.dequeue() should be called + * - Tasks dispatched to user DSQs from ops.enqueue() enter BPF custody: + * ops.dequeue() must be called when they leave custody + * - Every ops.enqueue() dispatch to non-terminal DSQs is followed by + * exactly one ops.dequeue() (validate 1:1 pairing and state machine) + * + * Copyright (c) 2026 NVIDIA Corporation. + */ + +#include <scx/common.bpf.h> + +#define SHARED_DSQ 0 + +/* + * BPF internal queue. + * + * Tasks are stored here and consumed from ops.dispatch(), validating that + * tasks on BPF internal structures still get ops.dequeue() when they + * leave. + */ +struct { + __uint(type, BPF_MAP_TYPE_QUEUE); + __uint(max_entries, 32768); + __type(value, s32); +} global_queue SEC(".maps"); + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +/* + * Counters to track the lifecycle of tasks: + * - enqueue_cnt: Number of times ops.enqueue() was called + * - dequeue_cnt: Number of times ops.dequeue() was called (any type) + * - dispatch_dequeue_cnt: Number of regular dispatch dequeues (no flag) + * - change_dequeue_cnt: Number of property change dequeues + * - bpf_queue_full: Number of times the BPF internal queue was full + */ +u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt, bpf_queue_full; + +/* + * Test scenarios: + * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF + * scheduler, no dequeue callbacks) + * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF + * scheduler, no dequeue callbacks) + * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler, + * dequeue callbacks expected) + * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF + * scheduler, no dequeue callbacks) + * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF + * scheduler, no dequeue callbacks) + * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler, + * dequeue callbacks expected) + * 6) BPF internal queue from ops.enqueue(): store task PIDs in ops.enqueue(), + * consume in ops.dispatch() and dispatch to local DSQ (validates dequeue + * for tasks stored in internal BPF data structures) + */ +u32 test_scenario; + +/* + * Per-task state to track lifecycle and validate workflow semantics. 
+ * State transitions: + * NONE -> ENQUEUED (on enqueue) + * NONE -> DISPATCHED (on direct dispatch to terminal DSQ) + * ENQUEUED -> DISPATCHED (on dispatch dequeue) + * DISPATCHED -> NONE (on property change dequeue or re-enqueue) + * ENQUEUED -> NONE (on property change dequeue before dispatch) + */ +enum task_state { + TASK_NONE = 0, + TASK_ENQUEUED, + TASK_DISPATCHED, +}; + +struct task_ctx { + enum task_state state; /* Current state in the workflow */ + u64 enqueue_seq; /* Sequence number for debugging */ +}; + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct task_ctx); +} task_ctx_stor SEC(".maps"); + +static struct task_ctx *try_lookup_task_ctx(struct task_struct *p) +{ + return bpf_task_storage_get(&task_ctx_stor, p, 0, 0); +} + +s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + struct task_ctx *tctx; + + tctx = try_lookup_task_ctx(p); + if (!tctx) + return prev_cpu; + + switch (test_scenario) { + case 0: + /* + * Direct dispatch to the local DSQ. + * + * Task bypasses BPF scheduler entirely: no enqueue + * tracking, no ops.dequeue() callbacks. + */ + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + tctx->state = TASK_DISPATCHED; + break; + case 1: + /* + * Direct dispatch to the global DSQ. + * + * Task bypasses BPF scheduler entirely: no enqueue + * tracking, no ops.dequeue() callbacks. + */ + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + tctx->state = TASK_DISPATCHED; + break; + case 2: + /* + * Dispatch to a shared user DSQ. + * + * Task enters BPF scheduler management: track + * enqueue/dequeue lifecycle and validate state + * transitions. + */ + if (tctx->state == TASK_ENQUEUED) + scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu", + p->pid, p->comm, tctx->enqueue_seq); + + scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0); + + __sync_fetch_and_add(&enqueue_cnt, 1); + + tctx->state = TASK_ENQUEUED; + tctx->enqueue_seq++; + break; + } + + return prev_cpu; +} + +void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct task_ctx *tctx; + s32 pid = p->pid; + + tctx = try_lookup_task_ctx(p); + if (!tctx) + return; + + switch (test_scenario) { + case 3: + /* + * Direct dispatch to the local DSQ. + * + * Task bypasses BPF scheduler entirely: no enqueue + * tracking, no ops.dequeue() callbacks. + */ + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + tctx->state = TASK_DISPATCHED; + break; + case 4: + /* + * Direct dispatch to the global DSQ. + * + * Task bypasses BPF scheduler entirely: no enqueue + * tracking, no ops.dequeue() callbacks. + */ + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + tctx->state = TASK_DISPATCHED; + break; + case 5: + /* + * Dispatch to shared user DSQ. + * + * Task enters BPF scheduler management: track + * enqueue/dequeue lifecycle and validate state + * transitions. + */ + if (tctx->state == TASK_ENQUEUED) + scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu", + p->pid, p->comm, tctx->enqueue_seq); + + scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); + + __sync_fetch_and_add(&enqueue_cnt, 1); + + tctx->state = TASK_ENQUEUED; + tctx->enqueue_seq++; + break; + case 6: + /* + * Store task in BPF internal queue. + * + * Task enters BPF scheduler management: track + * enqueue/dequeue lifecycle and validate state + * transitions. 
+ */ + if (tctx->state == TASK_ENQUEUED) + scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu", + p->pid, p->comm, tctx->enqueue_seq); + + if (bpf_map_push_elem(&global_queue, &pid, 0)) { + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + __sync_fetch_and_add(&bpf_queue_full, 1); + + tctx->state = TASK_DISPATCHED; + } else { + __sync_fetch_and_add(&enqueue_cnt, 1); + + tctx->state = TASK_ENQUEUED; + tctx->enqueue_seq++; + } + break; + default: + /* For all other scenarios, dispatch to the global DSQ */ + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + tctx->state = TASK_DISPATCHED; + break; + } + + scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE); +} + +void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags) +{ + struct task_ctx *tctx; + + __sync_fetch_and_add(&dequeue_cnt, 1); + + tctx = try_lookup_task_ctx(p); + if (!tctx) + return; + + /* + * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global), + * ops.dequeue() should never be called because tasks bypass the + * BPF scheduler entirely. If we get here, it's a kernel bug. + */ + if (test_scenario == 0 || test_scenario == 3) { + scx_bpf_error("%d (%s): dequeue called for local DSQ scenario", + p->pid, p->comm); + return; + } + + if (test_scenario == 1 || test_scenario == 4) { + scx_bpf_error("%d (%s): dequeue called for global DSQ scenario", + p->pid, p->comm); + return; + } + + if (deq_flags & SCX_DEQ_SCHED_CHANGE) { + /* + * Property change interrupting the workflow. Valid from + * both ENQUEUED and DISPATCHED states. Transitions task + * back to NONE state. + */ + __sync_fetch_and_add(&change_dequeue_cnt, 1); + + /* Validate state transition */ + if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED) + scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu", + p->pid, p->comm, tctx->state, tctx->enqueue_seq); + + /* + * Transition back to NONE: task outside scheduler control. + * + * Scenario 6: dispatch() checks tctx->state after popping a + * PID, if the task is in state NONE, it was dequeued by + * property change and must not be dispatched (this + * prevents "target CPU not allowed"). + */ + tctx->state = TASK_NONE; + } else { + /* + * Regular dispatch dequeue: kernel is moving the task from + * BPF custody to a terminal DSQ. Normally we come from + * ENQUEUED state. We can also see TASK_NONE if the task + * was dequeued by property change (SCX_DEQ_SCHED_CHANGE) + * while it was already on a DSQ (dispatched but not yet + * consumed); in that case we just leave state as NONE. + */ + __sync_fetch_and_add(&dispatch_dequeue_cnt, 1); + + /* + * Must be ENQUEUED (normal path) or NONE (already dequeued + * by property change while on a DSQ). + */ + if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_NONE) + scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu", + p->pid, p->comm, tctx->state, tctx->enqueue_seq); + + if (tctx->state == TASK_ENQUEUED) + tctx->state = TASK_DISPATCHED; + + /* NONE: leave as-is, task was already property-change dequeued */ + } +} + +void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev) +{ + if (test_scenario == 6) { + struct task_ctx *tctx; + struct task_struct *p; + s32 pid; + + if (bpf_map_pop_elem(&global_queue, &pid)) + return; + + p = bpf_task_from_pid(pid); + if (!p) + return; + + /* + * If the task was dequeued by property change + * (ops.dequeue() set tctx->state = TASK_NONE), skip + * dispatch. 
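+		 * The reference acquired via bpf_task_from_pid() must still
+		 * be released on that path, hence the bpf_task_release() below.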
+ */ + tctx = try_lookup_task_ctx(p); + if (!tctx || tctx->state == TASK_NONE) { + bpf_task_release(p); + return; + } + + /* + * Dispatch to this CPU's local DSQ if allowed, otherwise + * fallback to the global DSQ. + */ + if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0); + else + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); + + bpf_task_release(p); + } else { + scx_bpf_dsq_move_to_local(SHARED_DSQ, 0); + } +} + +s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p, + struct scx_init_task_args *args) +{ + struct task_ctx *tctx; + + tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!tctx) + return -ENOMEM; + + return 0; +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init) +{ + s32 ret; + + ret = scx_bpf_create_dsq(SHARED_DSQ, -1); + if (ret) + return ret; + + return 0; +} + +void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops dequeue_ops = { + .select_cpu = (void *)dequeue_select_cpu, + .enqueue = (void *)dequeue_enqueue, + .dequeue = (void *)dequeue_dequeue, + .dispatch = (void *)dequeue_dispatch, + .init_task = (void *)dequeue_init_task, + .init = (void *)dequeue_init, + .exit = (void *)dequeue_exit, + .flags = SCX_OPS_ENQ_LAST, + .name = "dequeue_test", +}; diff --git a/tools/testing/selftests/sched_ext/dequeue.c b/tools/testing/selftests/sched_ext/dequeue.c new file mode 100644 index 000000000000..4e93262703ca --- /dev/null +++ b/tools/testing/selftests/sched_ext/dequeue.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 NVIDIA Corporation. + */ +#define _GNU_SOURCE +#include <stdio.h> +#include <unistd.h> +#include <signal.h> +#include <time.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <sched.h> +#include <pthread.h> +#include "scx_test.h" +#include "dequeue.bpf.skel.h" + +#define NUM_WORKERS 8 +#define AFFINITY_HAMMER_MS 500 + +/* + * Worker function that creates enqueue/dequeue events via CPU work and + * sleep. + */ +static void worker_fn(int id) +{ + int i; + volatile int sum = 0; + + for (i = 0; i < 1000; i++) { + volatile int j; + + /* Do some work to trigger scheduling events */ + for (j = 0; j < 10000; j++) + sum += j; + + /* Sleep to trigger dequeue */ + usleep(1000 + (id * 100)); + } + + exit(0); +} + +/* + * This thread changes workers' affinity from outside so that some changes + * hit tasks while they are still in the scheduler's queue and trigger + * property-change dequeues. 
+ */ +static void *affinity_hammer_fn(void *arg) +{ + pid_t *pids = arg; + cpu_set_t cpuset; + int i = 0, n = NUM_WORKERS; + struct timespec start, now; + + clock_gettime(CLOCK_MONOTONIC, &start); + while (1) { + int w = i % n; + int cpu = (i / n) % 4; + + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + sched_setaffinity(pids[w], sizeof(cpuset), &cpuset); + i++; + + /* Check elapsed time every 256 iterations to limit gettime cost */ + if ((i & 255) == 0) { + long long elapsed_ms; + + clock_gettime(CLOCK_MONOTONIC, &now); + elapsed_ms = (now.tv_sec - start.tv_sec) * 1000LL + + (now.tv_nsec - start.tv_nsec) / 1000000; + if (elapsed_ms >= AFFINITY_HAMMER_MS) + break; + } + } + return NULL; +} + +static enum scx_test_status run_scenario(struct dequeue *skel, u32 scenario, + const char *scenario_name) +{ + struct bpf_link *link; + pid_t pids[NUM_WORKERS]; + pthread_t hammer; + + int i, status; + u64 enq_start, deq_start, + dispatch_deq_start, change_deq_start, bpf_queue_full_start; + u64 enq_delta, deq_delta, + dispatch_deq_delta, change_deq_delta, bpf_queue_full_delta; + + /* Set the test scenario */ + skel->bss->test_scenario = scenario; + + /* Record starting counts */ + enq_start = skel->bss->enqueue_cnt; + deq_start = skel->bss->dequeue_cnt; + dispatch_deq_start = skel->bss->dispatch_dequeue_cnt; + change_deq_start = skel->bss->change_dequeue_cnt; + bpf_queue_full_start = skel->bss->bpf_queue_full; + + link = bpf_map__attach_struct_ops(skel->maps.dequeue_ops); + SCX_FAIL_IF(!link, "Failed to attach struct_ops for scenario %s", scenario_name); + + /* Fork worker processes to generate enqueue/dequeue events */ + for (i = 0; i < NUM_WORKERS; i++) { + pids[i] = fork(); + SCX_FAIL_IF(pids[i] < 0, "Failed to fork worker %d", i); + + if (pids[i] == 0) { + worker_fn(i); + /* Should not reach here */ + exit(1); + } + } + + /* + * Run an "affinity hammer" so that some property changes hit tasks + * while they are still in BPF custody (e.g., in user DSQ or BPF + * queue), triggering SCX_DEQ_SCHED_CHANGE dequeues. + */ + SCX_FAIL_IF(pthread_create(&hammer, NULL, affinity_hammer_fn, pids) != 0, + "Failed to create affinity hammer thread"); + pthread_join(hammer, NULL); + + /* Wait for all workers to complete */ + for (i = 0; i < NUM_WORKERS; i++) { + SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], + "Failed to wait for worker %d", i); + SCX_FAIL_IF(status != 0, "Worker %d exited with status %d", i, status); + } + + bpf_link__destroy(link); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG)); + + /* Calculate deltas */ + enq_delta = skel->bss->enqueue_cnt - enq_start; + deq_delta = skel->bss->dequeue_cnt - deq_start; + dispatch_deq_delta = skel->bss->dispatch_dequeue_cnt - dispatch_deq_start; + change_deq_delta = skel->bss->change_dequeue_cnt - change_deq_start; + bpf_queue_full_delta = skel->bss->bpf_queue_full - bpf_queue_full_start; + + printf("%s:\n", scenario_name); + printf(" enqueues: %lu\n", (unsigned long)enq_delta); + printf(" dequeues: %lu (dispatch: %lu, property_change: %lu)\n", + (unsigned long)deq_delta, + (unsigned long)dispatch_deq_delta, + (unsigned long)change_deq_delta); + printf(" BPF queue full: %lu\n", (unsigned long)bpf_queue_full_delta); + + /* + * Validate enqueue/dequeue lifecycle tracking. + * + * For scenarios 0, 1, 3, 4 (local and global DSQs from + * ops.select_cpu() and ops.enqueue()), both enqueues and dequeues + * should be 0 because tasks bypass the BPF scheduler entirely: + * tasks never enter BPF scheduler's custody. 
+ * + * For scenarios 2, 5, 6 (user DSQ or BPF internal queue) we expect + * both enqueues and dequeues. + * + * The BPF code does strict state machine validation with + * scx_bpf_error() to ensure the workflow semantics are correct. + * + * If we reach this point without errors, the semantics are + * validated correctly. + */ + if (scenario == 0 || scenario == 1 || + scenario == 3 || scenario == 4) { + /* Tasks bypass BPF scheduler completely */ + SCX_EQ(enq_delta, 0); + SCX_EQ(deq_delta, 0); + SCX_EQ(dispatch_deq_delta, 0); + SCX_EQ(change_deq_delta, 0); + } else { + /* + * User DSQ from ops.enqueue() or ops.select_cpu(): tasks + * enter BPF scheduler's custody. + * + * Also validate 1:1 enqueue/dequeue pairing. + */ + SCX_GT(enq_delta, 0); + SCX_GT(deq_delta, 0); + SCX_EQ(enq_delta, deq_delta); + } + + return SCX_TEST_PASS; +} + +static enum scx_test_status setup(void **ctx) +{ + struct dequeue *skel; + + skel = dequeue__open(); + SCX_FAIL_IF(!skel, "Failed to open skel"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(dequeue__load(skel), "Failed to load skel"); + + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct dequeue *skel = ctx; + enum scx_test_status status; + + status = run_scenario(skel, 0, "Scenario 0: Local DSQ from ops.select_cpu()"); + if (status != SCX_TEST_PASS) + return status; + + status = run_scenario(skel, 1, "Scenario 1: Global DSQ from ops.select_cpu()"); + if (status != SCX_TEST_PASS) + return status; + + status = run_scenario(skel, 2, "Scenario 2: User DSQ from ops.select_cpu()"); + if (status != SCX_TEST_PASS) + return status; + + status = run_scenario(skel, 3, "Scenario 3: Local DSQ from ops.enqueue()"); + if (status != SCX_TEST_PASS) + return status; + + status = run_scenario(skel, 4, "Scenario 4: Global DSQ from ops.enqueue()"); + if (status != SCX_TEST_PASS) + return status; + + status = run_scenario(skel, 5, "Scenario 5: User DSQ from ops.enqueue()"); + if (status != SCX_TEST_PASS) + return status; + + status = run_scenario(skel, 6, "Scenario 6: BPF queue from ops.enqueue()"); + if (status != SCX_TEST_PASS) + return status; + + printf("\n=== Summary ===\n"); + printf("Total enqueues: %lu\n", (unsigned long)skel->bss->enqueue_cnt); + printf("Total dequeues: %lu\n", (unsigned long)skel->bss->dequeue_cnt); + printf(" Dispatch dequeues: %lu (no flag, normal workflow)\n", + (unsigned long)skel->bss->dispatch_dequeue_cnt); + printf(" Property change dequeues: %lu (SCX_DEQ_SCHED_CHANGE flag)\n", + (unsigned long)skel->bss->change_dequeue_cnt); + printf(" BPF queue full: %lu\n", + (unsigned long)skel->bss->bpf_queue_full); + printf("\nAll scenarios passed - no state machine violations detected\n"); + printf("-> Validated: Local DSQ dispatch bypasses BPF scheduler\n"); + printf("-> Validated: Global DSQ dispatch bypasses BPF scheduler\n"); + printf("-> Validated: User DSQ dispatch triggers ops.dequeue() callbacks\n"); + printf("-> Validated: Dispatch dequeues have no flags (normal workflow)\n"); + printf("-> Validated: Property change dequeues have SCX_DEQ_SCHED_CHANGE flag\n"); + printf("-> Validated: No duplicate enqueues or invalid state transitions\n"); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct dequeue *skel = ctx; + + dequeue__destroy(skel); +} + +struct scx_test dequeue_test = { + .name = "dequeue", + .description = "Verify ops.dequeue() semantics", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; + +REGISTER_SCX_TEST(&dequeue_test) diff --git 
a/tools/testing/selftests/sched_ext/enq_select_cpu.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu.bpf.c new file mode 100644 index 000000000000..ee2c9b89716e --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_select_cpu.bpf.c @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +s32 BPF_STRUCT_OPS(enq_select_cpu_select_cpu, struct task_struct *p, + s32 prev_cpu, u64 wake_flags) +{ + /* Bounce all tasks to ops.enqueue() */ + return prev_cpu; +} + +void BPF_STRUCT_OPS(enq_select_cpu_enqueue, struct task_struct *p, + u64 enq_flags) +{ + s32 cpu, prev_cpu = scx_bpf_task_cpu(p); + bool found = false; + + cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, 0, &found); + if (found) { + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, enq_flags); + return; + } + + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(enq_select_cpu_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +struct task_cpu_arg { + pid_t pid; +}; + +SEC("syscall") +int select_cpu_from_user(struct task_cpu_arg *input) +{ + struct task_struct *p; + bool found = false; + s32 cpu; + + p = bpf_task_from_pid(input->pid); + if (!p) + return -EINVAL; + + bpf_rcu_read_lock(); + cpu = scx_bpf_select_cpu_dfl(p, bpf_get_smp_processor_id(), 0, &found); + if (!found) + cpu = -EBUSY; + bpf_rcu_read_unlock(); + + bpf_task_release(p); + + return cpu; +} + +SEC(".struct_ops.link") +struct sched_ext_ops enq_select_cpu_ops = { + .select_cpu = (void *)enq_select_cpu_select_cpu, + .enqueue = (void *)enq_select_cpu_enqueue, + .exit = (void *)enq_select_cpu_exit, + .name = "enq_select_cpu", + .timeout_ms = 1000U, +}; diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu.c b/tools/testing/selftests/sched_ext/enq_select_cpu.c new file mode 100644 index 000000000000..340c6f8b86da --- /dev/null +++ b/tools/testing/selftests/sched_ext/enq_select_cpu.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
+ * Copyright (c) 2023 David Vernet <dvernet@meta.com> + * Copyright (c) 2023 Tejun Heo <tj@kernel.org> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "enq_select_cpu.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct enq_select_cpu *skel; + + skel = enq_select_cpu__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(enq_select_cpu__load(skel), "Failed to load skel"); + + *ctx = skel; + + return SCX_TEST_PASS; +} + +static int test_select_cpu_from_user(const struct enq_select_cpu *skel) +{ + int fd, ret; + __u64 args[1]; + + LIBBPF_OPTS(bpf_test_run_opts, attr, + .ctx_in = args, + .ctx_size_in = sizeof(args), + ); + + args[0] = getpid(); + fd = bpf_program__fd(skel->progs.select_cpu_from_user); + if (fd < 0) + return fd; + + ret = bpf_prog_test_run_opts(fd, &attr); + if (ret < 0) + return ret; + + fprintf(stderr, "%s: CPU %d\n", __func__, attr.retval); + + return 0; +} + +static enum scx_test_status run(void *ctx) +{ + struct enq_select_cpu *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + /* Pick an idle CPU from user-space */ + SCX_FAIL_IF(test_select_cpu_from_user(skel), "Failed to pick idle CPU"); + + sleep(1); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct enq_select_cpu *skel = ctx; + + enq_select_cpu__destroy(skel); +} + +struct scx_test enq_select_cpu = { + .name = "enq_select_cpu", + .description = "Verify scx_bpf_select_cpu_dfl() from multiple contexts", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&enq_select_cpu) diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c deleted file mode 100644 index a7cf868d5e31..000000000000 --- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c +++ /dev/null @@ -1,43 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. - * Copyright (c) 2023 David Vernet <dvernet@meta.com> - * Copyright (c) 2023 Tejun Heo <tj@kernel.org> - */ - -#include <scx/common.bpf.h> - -char _license[] SEC("license") = "GPL"; - -/* Manually specify the signature until the kfunc is added to the scx repo. */ -s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, - bool *found) __ksym; - -s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p, - s32 prev_cpu, u64 wake_flags) -{ - return prev_cpu; -} - -void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, - u64 enq_flags) -{ - /* - * Need to initialize the variable or the verifier will fail to load. - * Improving these semantics is actively being worked on. 
- */ - bool found = false; - - /* Can only call from ops.select_cpu() */ - scx_bpf_select_cpu_dfl(p, 0, 0, &found); - - scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -} - -SEC(".struct_ops.link") -struct sched_ext_ops enq_select_cpu_fails_ops = { - .select_cpu = (void *) enq_select_cpu_fails_select_cpu, - .enqueue = (void *) enq_select_cpu_fails_enqueue, - .name = "enq_select_cpu_fails", - .timeout_ms = 1000U, -}; diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c deleted file mode 100644 index a80e3a3b3698..000000000000 --- a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c +++ /dev/null @@ -1,61 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. - * Copyright (c) 2023 David Vernet <dvernet@meta.com> - * Copyright (c) 2023 Tejun Heo <tj@kernel.org> - */ -#include <bpf/bpf.h> -#include <scx/common.h> -#include <sys/wait.h> -#include <unistd.h> -#include "enq_select_cpu_fails.bpf.skel.h" -#include "scx_test.h" - -static enum scx_test_status setup(void **ctx) -{ - struct enq_select_cpu_fails *skel; - - skel = enq_select_cpu_fails__open(); - SCX_FAIL_IF(!skel, "Failed to open"); - SCX_ENUM_INIT(skel); - SCX_FAIL_IF(enq_select_cpu_fails__load(skel), "Failed to load skel"); - - *ctx = skel; - - return SCX_TEST_PASS; -} - -static enum scx_test_status run(void *ctx) -{ - struct enq_select_cpu_fails *skel = ctx; - struct bpf_link *link; - - link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops); - if (!link) { - SCX_ERR("Failed to attach scheduler"); - return SCX_TEST_FAIL; - } - - sleep(1); - - bpf_link__destroy(link); - - return SCX_TEST_PASS; -} - -static void cleanup(void *ctx) -{ - struct enq_select_cpu_fails *skel = ctx; - - enq_select_cpu_fails__destroy(skel); -} - -struct scx_test enq_select_cpu_fails = { - .name = "enq_select_cpu_fails", - .description = "Verify we fail to call scx_bpf_select_cpu_dfl() " - "from ops.enqueue()", - .setup = setup, - .run = run, - .cleanup = cleanup, -}; -REGISTER_SCX_TEST(&enq_select_cpu_fails) diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c index 4bc36182d3ff..2e848820a44b 100644 --- a/tools/testing/selftests/sched_ext/exit.bpf.c +++ b/tools/testing/selftests/sched_ext/exit.bpf.c @@ -41,7 +41,7 @@ void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) if (exit_point == EXIT_DISPATCH) EXIT_CLEANLY(); - scx_bpf_dsq_move_to_local(DSQ_ID); + scx_bpf_dsq_move_to_local(DSQ_ID, 0); } void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c index 9451782689de..b987611789d1 100644 --- a/tools/testing/selftests/sched_ext/exit.c +++ b/tools/testing/selftests/sched_ext/exit.c @@ -22,10 +22,18 @@ static enum scx_test_status run(void *ctx) struct bpf_link *link; char buf[16]; + /* + * On single-CPU systems, ops.select_cpu() is never + * invoked, so skip this test to avoid getting stuck + * indefinitely. 
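+		 * (With a single CPU the wakeup path has no CPU selection to
+		 * do, so the EXIT_SELECT_CPU exit point would never be hit.)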
+ */ + if (tc == EXIT_SELECT_CPU && libbpf_num_possible_cpus() == 1) + continue; + skel = exit__open(); SCX_ENUM_INIT(skel); skel->rodata->exit_point = tc; - exit__load(skel); + SCX_FAIL_IF(exit__load(skel), "Failed to load skel"); link = bpf_map__attach_struct_ops(skel->maps.exit_ops); if (!link) { SCX_ERR("Failed to attach scheduler"); diff --git a/tools/testing/selftests/sched_ext/exit_test.h b/tools/testing/selftests/sched_ext/exit_test.h index 94f0268b9cb8..2723e0fda801 100644 --- a/tools/testing/selftests/sched_ext/exit_test.h +++ b/tools/testing/selftests/sched_ext/exit_test.h @@ -17,4 +17,4 @@ enum exit_test_case { NUM_EXITS, }; -#endif // # __EXIT_TEST_H__ +#endif // __EXIT_TEST_H__ diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c index 1c9ceb661c43..0cfbb111a2d0 100644 --- a/tools/testing/selftests/sched_ext/hotplug.c +++ b/tools/testing/selftests/sched_ext/hotplug.c @@ -6,7 +6,6 @@ #include <bpf/bpf.h> #include <sched.h> #include <scx/common.h> -#include <sched.h> #include <sys/wait.h> #include <unistd.h> diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c index eddf9e0e26e7..44577e30e764 100644 --- a/tools/testing/selftests/sched_ext/init_enable_count.c +++ b/tools/testing/selftests/sched_ext/init_enable_count.c @@ -4,6 +4,7 @@ * Copyright (c) 2023 David Vernet <dvernet@meta.com> * Copyright (c) 2023 Tejun Heo <tj@kernel.org> */ +#include <signal.h> #include <stdio.h> #include <unistd.h> #include <sched.h> @@ -23,6 +24,9 @@ static enum scx_test_status run_test(bool global) int ret, i, status; struct sched_param param = {}; pid_t pids[num_pre_forks]; + int pipe_fds[2]; + + SCX_FAIL_IF(pipe(pipe_fds) < 0, "Failed to create pipe"); skel = init_enable_count__open(); SCX_FAIL_IF(!skel, "Failed to open"); @@ -38,26 +42,35 @@ static enum scx_test_status run_test(bool global) * ensure (at least in practical terms) that there are more tasks that * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that * take the fork() path either below or in other processes. + * + * All children will block on read() on the pipe until the parent closes + * the write end after attaching the scheduler, which signals all of + * them to exit simultaneously. Auto-reap so we don't have to wait on + * them. */ + signal(SIGCHLD, SIG_IGN); for (i = 0; i < num_pre_forks; i++) { - pids[i] = fork(); - SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); - if (pids[i] == 0) { - sleep(1); + pid_t pid = fork(); + + SCX_FAIL_IF(pid < 0, "Failed to fork child"); + if (pid == 0) { + char buf; + + close(pipe_fds[1]); + if (read(pipe_fds[0], &buf, 1) < 0) + exit(1); + close(pipe_fds[0]); exit(0); } } + close(pipe_fds[0]); link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); SCX_FAIL_IF(!link, "Failed to attach struct_ops"); - for (i = 0; i < num_pre_forks; i++) { - SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], - "Failed to wait for pre-forked child\n"); - - SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i, - status); - } + /* Signal all pre-forked children to exit. 
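+	 * Closing the write end makes each child's blocking read() return
+	 * 0 (EOF), so they all exit at once.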
*/ + close(pipe_fds[1]); + signal(SIGCHLD, SIG_DFL); bpf_link__destroy(link); SCX_GE(skel->bss->init_task_cnt, num_pre_forks); diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c index 430f5e13bf55..04a369078aac 100644 --- a/tools/testing/selftests/sched_ext/maximal.bpf.c +++ b/tools/testing/selftests/sched_ext/maximal.bpf.c @@ -30,7 +30,7 @@ void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) { - scx_bpf_dsq_move_to_local(DSQ_ID); + scx_bpf_dsq_move_to_local(DSQ_ID, 0); } void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) @@ -67,13 +67,12 @@ void BPF_STRUCT_OPS(maximal_set_cpumask, struct task_struct *p, void BPF_STRUCT_OPS(maximal_update_idle, s32 cpu, bool idle) {} -void BPF_STRUCT_OPS(maximal_cpu_acquire, s32 cpu, - struct scx_cpu_acquire_args *args) -{} - -void BPF_STRUCT_OPS(maximal_cpu_release, s32 cpu, - struct scx_cpu_release_args *args) -{} +SEC("tp_btf/sched_switch") +int BPF_PROG(maximal_sched_switch, bool preempt, struct task_struct *prev, + struct task_struct *next, unsigned int prev_state) +{ + return 0; +} void BPF_STRUCT_OPS(maximal_cpu_online, s32 cpu) {} @@ -123,6 +122,10 @@ void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p, void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) {} +void BPF_STRUCT_OPS(maximal_cgroup_set_bandwidth, struct cgroup *cgrp, + u64 period_us, u64 quota_us, u64 burst_us) +{} + s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) { return scx_bpf_create_dsq(DSQ_ID, -1); @@ -146,8 +149,6 @@ struct sched_ext_ops maximal_ops = { .set_weight = (void *) maximal_set_weight, .set_cpumask = (void *) maximal_set_cpumask, .update_idle = (void *) maximal_update_idle, - .cpu_acquire = (void *) maximal_cpu_acquire, - .cpu_release = (void *) maximal_cpu_release, .cpu_online = (void *) maximal_cpu_online, .cpu_offline = (void *) maximal_cpu_offline, .init_task = (void *) maximal_init_task, @@ -160,6 +161,7 @@ struct sched_ext_ops maximal_ops = { .cgroup_move = (void *) maximal_cgroup_move, .cgroup_cancel_move = (void *) maximal_cgroup_cancel_move, .cgroup_set_weight = (void *) maximal_cgroup_set_weight, + .cgroup_set_bandwidth = (void *) maximal_cgroup_set_bandwidth, .init = (void *) maximal_init, .exit = (void *) maximal_exit, .name = "maximal", diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c index c6be50a9941d..1dc369224670 100644 --- a/tools/testing/selftests/sched_ext/maximal.c +++ b/tools/testing/selftests/sched_ext/maximal.c @@ -19,6 +19,9 @@ static enum scx_test_status setup(void **ctx) SCX_ENUM_INIT(skel); SCX_FAIL_IF(maximal__load(skel), "Failed to load skel"); + bpf_map__set_autoattach(skel->maps.maximal_ops, false); + SCX_FAIL_IF(maximal__attach(skel), "Failed to attach skel"); + *ctx = skel; return SCX_TEST_PASS; diff --git a/tools/testing/selftests/sched_ext/numa.bpf.c b/tools/testing/selftests/sched_ext/numa.bpf.c new file mode 100644 index 000000000000..78cc49a7f9a6 --- /dev/null +++ b/tools/testing/selftests/sched_ext/numa.bpf.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A scheduler that validates the behavior of the NUMA-aware + * functionalities. + * + * The scheduler creates a separate DSQ for each NUMA node, ensuring tasks + * are exclusively processed by CPUs within their respective nodes. 
Idle + * CPUs are selected only within the same node, so task migration can only + * occur between CPUs belonging to the same node. + * + * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com> + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +const volatile unsigned int __COMPAT_SCX_PICK_IDLE_IN_NODE; + +static bool is_cpu_idle(s32 cpu, int node) +{ + const struct cpumask *idle_cpumask; + bool idle; + + idle_cpumask = __COMPAT_scx_bpf_get_idle_cpumask_node(node); + idle = bpf_cpumask_test_cpu(cpu, idle_cpumask); + scx_bpf_put_cpumask(idle_cpumask); + + return idle; +} + +s32 BPF_STRUCT_OPS(numa_select_cpu, + struct task_struct *p, s32 prev_cpu, u64 wake_flags) +{ + int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p)); + s32 cpu; + + /* + * We could just use __COMPAT_scx_bpf_pick_any_cpu_node() here, + * since it already tries to pick an idle CPU within the node + * first, but let's use both functions for better testing coverage. + */ + cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node, + __COMPAT_SCX_PICK_IDLE_IN_NODE); + if (cpu < 0) + cpu = __COMPAT_scx_bpf_pick_any_cpu_node(p->cpus_ptr, node, + __COMPAT_SCX_PICK_IDLE_IN_NODE); + + if (is_cpu_idle(cpu, node)) + scx_bpf_error("CPU %d should be marked as busy", cpu); + + if (__COMPAT_scx_bpf_cpu_node(cpu) != node) + scx_bpf_error("CPU %d should be in node %d", cpu, node); + + return cpu; +} + +void BPF_STRUCT_OPS(numa_enqueue, struct task_struct *p, u64 enq_flags) +{ + int node = __COMPAT_scx_bpf_cpu_node(scx_bpf_task_cpu(p)); + + scx_bpf_dsq_insert(p, node, SCX_SLICE_DFL, enq_flags); +} + +void BPF_STRUCT_OPS(numa_dispatch, s32 cpu, struct task_struct *prev) +{ + int node = __COMPAT_scx_bpf_cpu_node(cpu); + + scx_bpf_dsq_move_to_local(node, 0); +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(numa_init) +{ + int node, err; + + bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) { + err = scx_bpf_create_dsq(node, node); + if (err) + return err; + } + + return 0; +} + +void BPF_STRUCT_OPS(numa_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops numa_ops = { + .select_cpu = (void *)numa_select_cpu, + .enqueue = (void *)numa_enqueue, + .dispatch = (void *)numa_dispatch, + .init = (void *)numa_init, + .exit = (void *)numa_exit, + .name = "numa", +}; diff --git a/tools/testing/selftests/sched_ext/numa.c b/tools/testing/selftests/sched_ext/numa.c new file mode 100644 index 000000000000..b060c3b65c82 --- /dev/null +++ b/tools/testing/selftests/sched_ext/numa.c @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 Andrea Righi <arighi@nvidia.com> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include "numa.bpf.skel.h" +#include "scx_test.h" + +static enum scx_test_status setup(void **ctx) +{ + struct numa *skel; + + skel = numa__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + skel->rodata->__COMPAT_SCX_PICK_IDLE_IN_NODE = SCX_PICK_IDLE_IN_NODE; + skel->struct_ops.numa_ops->flags = SCX_OPS_BUILTIN_IDLE_PER_NODE; + SCX_FAIL_IF(numa__load(skel), "Failed to load skel"); + + *ctx = skel; + + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct numa *skel = ctx; + struct bpf_link *link; + + link = bpf_map__attach_struct_ops(skel->maps.numa_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + + /* Just sleeping is fine, plenty of scheduling events happening */ + sleep(1); + + SCX_EQ(skel->data->uei.kind,
EXIT_KIND(SCX_EXIT_NONE)); + bpf_link__destroy(link); + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct numa *skel = ctx; + + numa__destroy(skel); +} + +struct scx_test numa = { + .name = "numa", + .description = "Verify NUMA-aware functionalities", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&numa) diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c new file mode 100644 index 000000000000..7f23fb17b1e0 --- /dev/null +++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A BPF program for testing DSQ operations and peek in particular. + * + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu> + */ + +#include <scx/common.bpf.h> +#include <scx/compat.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); /* Error handling */ + +#define MAX_SAMPLES 100 +#define MAX_CPUS 512 +#define DSQ_POOL_SIZE 8 +int max_samples = MAX_SAMPLES; +int max_cpus = MAX_CPUS; +int dsq_pool_size = DSQ_POOL_SIZE; + +/* Global variables to store test results */ +int dsq_peek_result1 = -1; +long dsq_inserted_pid = -1; +int insert_test_cpu = -1; /* Set to the cpu that performs the test */ +long dsq_peek_result2 = -1; +long dsq_peek_result2_pid = -1; +long dsq_peek_result2_expected = -1; +int test_dsq_id = 1234; /* Use a simple ID like create_dsq example */ +int real_dsq_id = 1235; /* DSQ for normal operation */ +int enqueue_count = -1; +int dispatch_count = -1; +bool debug_ksym_exists; + +/* DSQ pool for stress testing */ +int dsq_pool_base_id = 2000; +int phase1_complete = -1; +long total_peek_attempts = -1; +long successful_peeks = -1; + +/* BPF map for sharing peek results with userspace */ +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, MAX_SAMPLES); + __type(key, u32); + __type(value, long); +} peek_results SEC(".maps"); + +static int get_random_dsq_id(void) +{ + u64 time = bpf_ktime_get_ns(); + + return dsq_pool_base_id + (time % DSQ_POOL_SIZE); +} + +static void record_peek_result(long pid) +{ + u32 slot_key; + long *slot_pid_ptr; + u32 ix; + + if (pid <= 0) + return; + + /* Find an empty slot or one with the same PID */ + bpf_for(ix, 0, 10) { + slot_key = ((u64)pid + ix) % MAX_SAMPLES; + slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key); + if (!slot_pid_ptr) + continue; + + if (*slot_pid_ptr == -1 || *slot_pid_ptr == pid) { + *slot_pid_ptr = pid; + break; + } + } +} + +/* Scan all DSQs in the pool and try to move a task to local */ +static int scan_dsq_pool(void) +{ + struct task_struct *task; + int moved = 0; + int i; + + bpf_for(i, 0, DSQ_POOL_SIZE) { + int dsq_id = dsq_pool_base_id + i; + + total_peek_attempts++; + + task = __COMPAT_scx_bpf_dsq_peek(dsq_id); + if (task) { + successful_peeks++; + record_peek_result(task->pid); + + /* Try to move this task to local */ + if (!moved && scx_bpf_dsq_move_to_local(dsq_id, 0) == 0) { + moved = 1; + break; + } + } + } + return moved; +} + +/* Struct_ops scheduler for testing DSQ peek operations */ +void BPF_STRUCT_OPS(peek_dsq_enqueue, struct task_struct *p, u64 enq_flags) +{ + struct task_struct *peek_result; + int last_insert_test_cpu, cpu; + + enqueue_count++; + cpu = bpf_get_smp_processor_id(); + last_insert_test_cpu = __sync_val_compare_and_swap(&insert_test_cpu, -1, cpu); + + /* Phase 1: Simple insert-then-peek test (only on first task) */ + if (last_insert_test_cpu == 
-1) { + bpf_printk("peek_dsq_enqueue beginning phase 1 peek test on cpu %d", cpu); + + /* Test 1: Peek empty DSQ - should return NULL */ + peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id); + dsq_peek_result1 = (long)peek_result; /* Should be 0 (NULL) */ + + /* Test 2: Insert task into test DSQ for testing in dispatch callback */ + dsq_inserted_pid = p->pid; + scx_bpf_dsq_insert(p, test_dsq_id, 0, enq_flags); + dsq_peek_result2_expected = (long)p; /* Expected the task we just inserted */ + } else if (!phase1_complete) { + /* Still in phase 1, use real DSQ */ + scx_bpf_dsq_insert(p, real_dsq_id, 0, enq_flags); + } else { + /* Phase 2: Random DSQ insertion for stress testing */ + int random_dsq_id = get_random_dsq_id(); + + scx_bpf_dsq_insert(p, random_dsq_id, 0, enq_flags); + } +} + +void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev) +{ + dispatch_count++; + + /* Phase 1: Complete the simple peek test if we inserted a task but + * haven't tested peek yet + */ + if (insert_test_cpu == cpu && dsq_peek_result2 == -1) { + struct task_struct *peek_result; + + bpf_printk("peek_dsq_dispatch completing phase 1 peek test on cpu %d", cpu); + + /* Test 3: Peek DSQ after insert - should return the task we inserted */ + peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id); + /* Store the PID of the peeked task for comparison */ + dsq_peek_result2 = (long)peek_result; + dsq_peek_result2_pid = peek_result ? peek_result->pid : -1; + + /* Now consume the task since we've peeked at it */ + scx_bpf_dsq_move_to_local(test_dsq_id, 0); + + /* Mark phase 1 as complete */ + phase1_complete = 1; + bpf_printk("Phase 1 complete, starting phase 2 stress testing"); + } else if (!phase1_complete) { + /* Still in phase 1, use real DSQ */ + scx_bpf_dsq_move_to_local(real_dsq_id, 0); + } else { + /* Phase 2: Scan all DSQs in the pool and try to move a task */ + if (!scan_dsq_pool()) { + /* No tasks found in DSQ pool, fall back to real DSQ */ + scx_bpf_dsq_move_to_local(real_dsq_id, 0); + } + } +} + +s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init) +{ + s32 err; + int i; + + /* Always set debug values so we can see which version we're using */ + debug_ksym_exists = bpf_ksym_exists(scx_bpf_dsq_peek) ? 
1 : 0; + + /* Initialize state first */ + insert_test_cpu = -1; + enqueue_count = 0; + dispatch_count = 0; + phase1_complete = 0; + total_peek_attempts = 0; + successful_peeks = 0; + + /* Create the test and real DSQs */ + err = scx_bpf_create_dsq(test_dsq_id, -1); + if (err) { + scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err); + return err; + } + err = scx_bpf_create_dsq(real_dsq_id, -1); + if (err) { + scx_bpf_error("Failed to create DSQ %d: %d", real_dsq_id, err); + return err; + } + + /* Create the DSQ pool for stress testing */ + bpf_for(i, 0, DSQ_POOL_SIZE) { + int dsq_id = dsq_pool_base_id + i; + + err = scx_bpf_create_dsq(dsq_id, -1); + if (err) { + scx_bpf_error("Failed to create DSQ pool entry %d: %d", dsq_id, err); + return err; + } + } + + /* Initialize the peek results map */ + bpf_for(i, 0, MAX_SAMPLES) { + u32 key = i; + long pid = -1; + + bpf_map_update_elem(&peek_results, &key, &pid, BPF_ANY); + } + + return 0; +} + +void BPF_STRUCT_OPS(peek_dsq_exit, struct scx_exit_info *ei) +{ + int i; + + /* Destroy the primary DSQs */ + scx_bpf_destroy_dsq(test_dsq_id); + scx_bpf_destroy_dsq(real_dsq_id); + + /* Destroy the DSQ pool */ + bpf_for(i, 0, DSQ_POOL_SIZE) { + int dsq_id = dsq_pool_base_id + i; + + scx_bpf_destroy_dsq(dsq_id); + } + + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops peek_dsq_ops = { + .enqueue = (void *)peek_dsq_enqueue, + .dispatch = (void *)peek_dsq_dispatch, + .init = (void *)peek_dsq_init, + .exit = (void *)peek_dsq_exit, + .name = "peek_dsq", +};
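The BPF program above gates its reporting on bpf_ksym_exists(scx_bpf_dsq_peek) and calls the kfunc through the __COMPAT_scx_bpf_dsq_peek() wrapper from compat.bpf.h. The wrapper's definition is not shown in this patch; a minimal sketch of the usual compat pattern it presumably follows:

/*
 * Illustrative sketch only -- not part of the patch. The usual
 * compat-wrapper pattern: call the kfunc when the running kernel
 * exports it, otherwise behave as if the DSQ were empty.
 */
static inline struct task_struct *compat_dsq_peek(u64 dsq_id)
{
	if (bpf_ksym_exists(scx_bpf_dsq_peek))
		return scx_bpf_dsq_peek(dsq_id);
	return NULL;
}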
diff --git a/tools/testing/selftests/sched_ext/peek_dsq.c b/tools/testing/selftests/sched_ext/peek_dsq.c new file mode 100644 index 000000000000..a717384a3224 --- /dev/null +++ b/tools/testing/selftests/sched_ext/peek_dsq.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test for DSQ operations including create, destroy, and peek operations. + * + * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. + * Copyright (c) 2025 Ryan Newton <ryan.newton@alum.mit.edu> + */ +#include <bpf/bpf.h> +#include <scx/common.h> +#include <sys/wait.h> +#include <unistd.h> +#include <pthread.h> +#include <string.h> +#include <sched.h> +#include "peek_dsq.bpf.skel.h" +#include "scx_test.h" + +#define NUM_WORKERS 4 + +static bool workload_running = true; +static pthread_t workload_threads[NUM_WORKERS]; + +/* + * Background workload thread that sleeps and wakes rapidly to exercise + * the scheduler's enqueue operations and ensure DSQ operations get tested. + */ +static void *workload_thread_fn(void *arg) +{ + while (workload_running) { + /* Sleep for a very short time to trigger scheduler activity */ + usleep(1000); /* 1ms sleep */ + /* Yield to ensure we go through the scheduler */ + sched_yield(); + } + return NULL; +} + +static enum scx_test_status setup(void **ctx) +{ + struct peek_dsq *skel; + + skel = peek_dsq__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(peek_dsq__load(skel), "Failed to load skel"); + + *ctx = skel; + + return SCX_TEST_PASS; +} + +static int print_observed_pids(struct bpf_map *map, int max_samples, const char *dsq_name) +{ + int count = 0; + + printf("Observed %s DSQ peek pids:\n", dsq_name); + for (int i = 0; i < max_samples; i++) { + long pid; + int err; + + err = bpf_map_lookup_elem(bpf_map__fd(map), &i, &pid); + if (err == 0) { + if (pid == 0) { + printf(" Sample %d: NULL peek\n", i); + } else if (pid > 0) { + printf(" Sample %d: pid %ld\n", i, pid); + count++; + } + } else { + printf(" Sample %d: error reading pid (err=%d)\n", i, err); + } + } + printf("Observed ~%d pids in the %s DSQ(s)\n", count, dsq_name); + return count; +} + +static enum scx_test_status run(void *ctx) +{ + struct peek_dsq *skel = ctx; + bool failed = false; + int seconds = 3; + int err; + + /* Enable the scheduler to test DSQ operations */ + printf("Enabling scheduler to test DSQ insert operations...\n"); + + struct bpf_link *link = + bpf_map__attach_struct_ops(skel->maps.peek_dsq_ops); + + if (!link) { + SCX_ERR("Failed to attach struct_ops"); + return SCX_TEST_FAIL; + } + + printf("Starting %d background workload threads...\n", NUM_WORKERS); + workload_running = true; + for (int i = 0; i < NUM_WORKERS; i++) { + err = pthread_create(&workload_threads[i], NULL, workload_thread_fn, NULL); + if (err) { + SCX_ERR("Failed to create workload thread %d: %s", i, strerror(err)); + /* Stop already created threads */ + workload_running = false; + for (int j = 0; j < i; j++) + pthread_join(workload_threads[j], NULL); + bpf_link__destroy(link); + return SCX_TEST_FAIL; + } + } + + printf("Waiting for enqueue events.\n"); + sleep(seconds); + while (skel->data->enqueue_count <= 0) { + printf("."); + fflush(stdout); + sleep(1); + seconds++; + if (seconds >= 30) { + printf("\n\u2717 Timeout waiting for enqueue events\n"); + /* Stop workload threads and cleanup */ + workload_running = false; + for (int i = 0; i < NUM_WORKERS; i++) + pthread_join(workload_threads[i], NULL); + bpf_link__destroy(link); + return SCX_TEST_FAIL; + } + } + + workload_running = false; + for (int i = 0; i < NUM_WORKERS; i++) { + err = pthread_join(workload_threads[i], NULL); + if (err) { + SCX_ERR("Failed to join workload thread %d: %s", i, strerror(err)); + bpf_link__destroy(link); + return SCX_TEST_FAIL; + } + } + printf("Background workload threads stopped.\n"); + + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE)); + + /* Detach the scheduler */ + bpf_link__destroy(link); + + printf("Enqueue/dispatch count over %d seconds: %d / %d\n", seconds, + skel->data->enqueue_count, skel->data->dispatch_count); + printf("Debug: ksym_exists=%d\n", + skel->bss->debug_ksym_exists); + + /* Check DSQ insert result */ + printf("DSQ insert test done on cpu: %d\n", skel->data->insert_test_cpu); + if (skel->data->insert_test_cpu != -1) + printf("\u2713 DSQ insert succeeded!\n"); + else { + printf("\u2717 DSQ insert failed or not attempted\n"); + failed = true; + } + + /* Check DSQ peek results */ + printf(" DSQ peek result 1 (before insert): %d\n", +
skel->data->dsq_peek_result1); + if (skel->data->dsq_peek_result1 == 0) + printf("\u2713 DSQ peek verification success: peek returned NULL!\n"); + else { + printf("\u2717 DSQ peek verification failed\n"); + failed = true; + } + + printf(" DSQ peek result 2 (after insert): %ld\n", + skel->data->dsq_peek_result2); + printf(" DSQ peek result 2, expected: %ld\n", + skel->data->dsq_peek_result2_expected); + if (skel->data->dsq_peek_result2 == + skel->data->dsq_peek_result2_expected) + printf("\u2713 DSQ peek verification success: peek returned the inserted task!\n"); + else { + printf("\u2717 DSQ peek verification failed\n"); + failed = true; + } + + printf(" Inserted test task -> pid: %ld\n", skel->data->dsq_inserted_pid); + printf(" DSQ peek result 2 -> pid: %ld\n", skel->data->dsq_peek_result2_pid); + + int pid_count; + + pid_count = print_observed_pids(skel->maps.peek_results, + skel->data->max_samples, "DSQ pool"); + printf("Total non-null peek observations: %ld out of %ld\n", + skel->data->successful_peeks, skel->data->total_peek_attempts); + + if (skel->bss->debug_ksym_exists && pid_count == 0) { + printf("\u2717 DSQ pool test failed: no successful peeks in native mode\n"); + failed = true; + } + if (skel->bss->debug_ksym_exists && pid_count > 0) + printf("\u2713 DSQ pool test success: observed successful peeks in native mode\n"); + + if (failed) + return SCX_TEST_FAIL; + else + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct peek_dsq *skel = ctx; + + if (workload_running) { + workload_running = false; + for (int i = 0; i < NUM_WORKERS; i++) + pthread_join(workload_threads[i], NULL); + } + + peek_dsq__destroy(skel); +} + +struct scx_test peek_dsq = { + .name = "peek_dsq", + .description = + "Test DSQ create/destroy and peek operations", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&peek_dsq) diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c index 308211d80436..49297b83d748 100644 --- a/tools/testing/selftests/sched_ext/reload_loop.c +++ b/tools/testing/selftests/sched_ext/reload_loop.c @@ -23,6 +23,9 @@ static enum scx_test_status setup(void **ctx) SCX_ENUM_INIT(skel); SCX_FAIL_IF(maximal__load(skel), "Failed to load skel"); + bpf_map__set_autoattach(skel->maps.maximal_ops, false); + SCX_FAIL_IF(maximal__attach(skel), "Failed to attach skel"); + return SCX_TEST_PASS; } diff --git a/tools/testing/selftests/sched_ext/rt_stall.bpf.c b/tools/testing/selftests/sched_ext/rt_stall.bpf.c new file mode 100644 index 000000000000..80086779dd1e --- /dev/null +++ b/tools/testing/selftests/sched_ext/rt_stall.bpf.c @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A scheduler that verifies whether RT tasks can stall SCHED_EXT tasks. + * + * Copyright (c) 2025 NVIDIA Corporation. + */ + +#include <scx/common.bpf.h> + +char _license[] SEC("license") = "GPL"; + +UEI_DEFINE(uei); + +void BPF_STRUCT_OPS(rt_stall_exit, struct scx_exit_info *ei) +{ + UEI_RECORD(uei, ei); +} + +SEC(".struct_ops.link") +struct sched_ext_ops rt_stall_ops = { + .exit = (void *)rt_stall_exit, + .name = "rt_stall", +};
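Note that rt_stall_ops implements only .exit; every other callback falls back to sched_ext's built-in defaults, so the test exercises the stock dispatch path rather than any custom scheduling logic. Roughly, the default behavior when ops.enqueue is absent is equivalent to the following sketch (a simplified assumption; the real fallback is kernel-internal, not BPF):

/*
 * Simplified sketch, not part of the patch: with no .enqueue callback,
 * sched_ext queues the task on the global DSQ with the default slice.
 */
void BPF_STRUCT_OPS(rt_stall_default_enqueue, struct task_struct *p, u64 enq_flags)
{
	scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}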
diff --git a/tools/testing/selftests/sched_ext/rt_stall.c b/tools/testing/selftests/sched_ext/rt_stall.c new file mode 100644 index 000000000000..a5041fc2e44f --- /dev/null +++ b/tools/testing/selftests/sched_ext/rt_stall.c @@ -0,0 +1,293 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2025 NVIDIA Corporation. + */ +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sched.h> +#include <sys/prctl.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <time.h> +#include <linux/sched.h> +#include <signal.h> +#include <bpf/bpf.h> +#include <scx/common.h> +#include "rt_stall.bpf.skel.h" +#include "scx_test.h" +#include "../kselftest.h" + +#define CORE_ID 0 /* CPU to pin tasks to */ +#define RUN_TIME 5 /* How long to run the test in seconds */ + +/* Signal the parent that setup is complete by writing to a pipe */ +static void signal_ready(int fd) +{ + char c = 1; + + if (write(fd, &c, 1) != 1) { + perror("write to ready pipe"); + exit(EXIT_FAILURE); + } + close(fd); +} + +/* Wait for a child to signal readiness via a pipe */ +static void wait_ready(int fd) +{ + char c; + + if (read(fd, &c, 1) != 1) { + perror("read from ready pipe"); + exit(EXIT_FAILURE); + } + close(fd); +} + +/* Simple busy-wait function for test tasks */ +static void process_func(void) +{ + while (1) { + /* Busy wait */ + for (volatile unsigned long i = 0; i < 10000000UL; i++) + ; + } +} + +/* Set CPU affinity to a specific core */ +static void set_affinity(int cpu) +{ + cpu_set_t mask; + + CPU_ZERO(&mask); + CPU_SET(cpu, &mask); + if (sched_setaffinity(0, sizeof(mask), &mask) != 0) { + perror("sched_setaffinity"); + exit(EXIT_FAILURE); + } +} + +/* Set task scheduling policy and priority */ +static void set_sched(int policy, int priority) +{ + struct sched_param param; + + param.sched_priority = priority; + if (sched_setscheduler(0, policy, &param) != 0) { + perror("sched_setscheduler"); + exit(EXIT_FAILURE); + } +} + +/* Get process runtime from /proc/<pid>/stat */ +static float get_process_runtime(int pid) +{ + char path[256]; + FILE *file; + unsigned long utime, stime; + int fields; + + snprintf(path, sizeof(path), "/proc/%d/stat", pid); + file = fopen(path, "r"); + if (file == NULL) { + perror("Failed to open stat file"); + return -1; + } + + /* Skip the first 13 fields and read the 14th and 15th */ + fields = fscanf(file, + "%*d %*s %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u %lu %lu", + &utime, &stime); + fclose(file); + + if (fields != 2) { + fprintf(stderr, "Failed to read stat file\n"); + return -1; + } + + /* Calculate the total CPU time used by the process */ + unsigned long total_time = utime + stime; + long ticks_per_second = sysconf(_SC_CLK_TCK); + float runtime_seconds = total_time * 1.0 / ticks_per_second; + + return runtime_seconds; +} + +static enum scx_test_status setup(void **ctx) +{ + struct rt_stall *skel; + + if (!__COMPAT_struct_has_field("rq", "ext_server")) { + fprintf(stderr, "SKIP: ext DL server not supported\n"); + return SCX_TEST_SKIP; + } + + skel = rt_stall__open(); + SCX_FAIL_IF(!skel, "Failed to open"); + SCX_ENUM_INIT(skel); + SCX_FAIL_IF(rt_stall__load(skel), "Failed to load skel"); + + *ctx = skel; + + return SCX_TEST_PASS; +}
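The 4% floor used by sched_stress_test() below follows from the deadline-server bandwidth. Assuming the default dl_server parameters of 50 ms runtime per 1000 ms period (this test does not change them), the starved class is entitled to roughly runtime/period = 5% of one CPU, and 4% leaves about one percentage point of slack for accounting noise:

/*
 * Worked check under the assumed default dl_server bandwidth
 * (50 ms runtime / 1000 ms period, not set explicitly by this test):
 * 50.0 / 1000.0 = 0.05, i.e. ~5% expected, 4% pass threshold.
 */
static inline float dl_server_share(float runtime_ms, float period_ms)
{
	return runtime_ms / period_ms;
}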
"EXT" : "FAIR"; + + float ext_runtime, rt_runtime, actual_ratio; + int ext_pid, rt_pid; + int ext_ready[2], rt_ready[2]; + + ksft_print_header(); + ksft_set_plan(1); + + if (pipe(ext_ready) || pipe(rt_ready)) { + perror("pipe"); + ksft_exit_fail(); + } + + /* Create and set up a EXT task */ + ext_pid = fork(); + if (ext_pid == 0) { + close(ext_ready[0]); + close(rt_ready[0]); + close(rt_ready[1]); + set_affinity(CORE_ID); + signal_ready(ext_ready[1]); + process_func(); + exit(0); + } else if (ext_pid < 0) { + perror("fork task"); + ksft_exit_fail(); + } + + /* Create an RT task */ + rt_pid = fork(); + if (rt_pid == 0) { + close(ext_ready[0]); + close(ext_ready[1]); + close(rt_ready[0]); + set_affinity(CORE_ID); + set_sched(SCHED_FIFO, 50); + signal_ready(rt_ready[1]); + process_func(); + exit(0); + } else if (rt_pid < 0) { + perror("fork for RT task"); + ksft_exit_fail(); + } + + /* + * Wait for both children to complete their setup (affinity and + * scheduling policy) before starting the measurement window. + * This prevents flaky failures caused by the RT child's setup + * time eating into the measurement period. + */ + close(ext_ready[1]); + close(rt_ready[1]); + wait_ready(ext_ready[0]); + wait_ready(rt_ready[0]); + + /* Let the processes run for the specified time */ + sleep(RUN_TIME); + + /* Get runtime for the EXT task */ + ext_runtime = get_process_runtime(ext_pid); + if (ext_runtime == -1) + ksft_exit_fail_msg("Error getting runtime for %s task (PID %d)\n", + class_str, ext_pid); + ksft_print_msg("Runtime of %s task (PID %d) is %f seconds\n", + class_str, ext_pid, ext_runtime); + + /* Get runtime for the RT task */ + rt_runtime = get_process_runtime(rt_pid); + if (rt_runtime == -1) + ksft_exit_fail_msg("Error getting runtime for RT task (PID %d)\n", rt_pid); + ksft_print_msg("Runtime of RT task (PID %d) is %f seconds\n", rt_pid, rt_runtime); + + /* Kill the processes */ + kill(ext_pid, SIGKILL); + kill(rt_pid, SIGKILL); + waitpid(ext_pid, NULL, 0); + waitpid(rt_pid, NULL, 0); + + /* Verify that the scx task got enough runtime */ + actual_ratio = ext_runtime / (ext_runtime + rt_runtime); + ksft_print_msg("%s task got %.2f%% of total runtime\n", + class_str, actual_ratio * 100); + + if (actual_ratio >= expected_min_ratio) { + ksft_test_result_pass("PASS: %s task got more than %.2f%% of runtime\n", + class_str, expected_min_ratio * 100); + return true; + } + ksft_test_result_fail("FAIL: %s task got less than %.2f%% of runtime\n", + class_str, expected_min_ratio * 100); + return false; +} + +static enum scx_test_status run(void *ctx) +{ + struct rt_stall *skel = ctx; + struct bpf_link *link = NULL; + bool res; + int i; + + /* + * Test if the dl_server is working both with and without the + * sched_ext scheduler attached. 
+ * + * This ensures all the scenarios are covered: + * - fair_server stop -> ext_server start + * - ext_server stop -> fair_server start + */ + for (i = 0; i < 4; i++) { + bool is_ext = i % 2; + + if (is_ext) { + memset(&skel->data->uei, 0, sizeof(skel->data->uei)); + link = bpf_map__attach_struct_ops(skel->maps.rt_stall_ops); + SCX_FAIL_IF(!link, "Failed to attach scheduler"); + } + res = sched_stress_test(is_ext); + if (is_ext) { + SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE)); + bpf_link__destroy(link); + } + + if (!res) + ksft_exit_fail(); + } + + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct rt_stall *skel = ctx; + + rt_stall__destroy(skel); +} + +struct scx_test rt_stall = { + .name = "rt_stall", + .description = "Verify that RT tasks cannot stall SCHED_EXT tasks", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&rt_stall) diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c index aa2d7d32dda9..c264807caa91 100644 --- a/tools/testing/selftests/sched_ext/runner.c +++ b/tools/testing/selftests/sched_ext/runner.c @@ -18,7 +18,7 @@ const char help_fmt[] = "It's required for the testcases to be serial, as only a single host-wide sched_ext\n" "scheduler may be loaded at any given time." "\n" -"Usage: %s [-t TEST] [-h]\n" +"Usage: %s [-t TEST] [-s] [-l] [-q]\n" "\n" " -t TEST Only run tests whose name includes this string\n" " -s Include print output for skipped tests\n" @@ -46,6 +46,14 @@ static void print_test_preamble(const struct scx_test *test, bool quiet) if (!quiet) printf("DESCRIPTION: %s\n", test->description); printf("OUTPUT:\n"); + + /* + * The tests may fork while the preamble is still buffered in + * stdout, in which case each child would print it again. Flush + * before running the test to avoid duplicate output. + */ + fflush(stdout); + fflush(stderr); } static const char *status_to_result(enum scx_test_status status) @@ -125,6 +133,8 @@ static bool test_valid(const struct scx_test *test) int main(int argc, char **argv) { const char *filter = NULL; + const char *failed_tests[MAX_SCX_TESTS]; + const char *skipped_tests[MAX_SCX_TESTS]; unsigned testnum = 0, i; unsigned passed = 0, skipped = 0, failed = 0; int opt; @@ -154,10 +164,33 @@ int main(int argc, char **argv) } } + if (optind < argc) { + fprintf(stderr, "Unexpected argument '%s'. 
Use -t to filter tests.\n", + argv[optind]); + return 1; + } + + if (filter) { + for (i = 0; i < __scx_num_tests; i++) { + if (!should_skip_test(&__scx_tests[i], filter)) + break; + } + if (i == __scx_num_tests) { + fprintf(stderr, "No tests matched filter '%s'\n", filter); + fprintf(stderr, "Available tests (use -l to list):\n"); + for (i = 0; i < __scx_num_tests; i++) + fprintf(stderr, " %s\n", __scx_tests[i].name); + return 1; + } + } + for (i = 0; i < __scx_num_tests; i++) { enum scx_test_status status; struct scx_test *test = &__scx_tests[i]; + if (exit_req) + break; + if (list) { printf("%s\n", test->name); if (i == (__scx_num_tests - 1)) @@ -187,10 +220,10 @@ int main(int argc, char **argv) passed++; break; case SCX_TEST_SKIP: - skipped++; + skipped_tests[skipped++] = test->name; break; case SCX_TEST_FAIL: - failed++; + failed_tests[failed++] = test->name; break; } } @@ -199,8 +232,18 @@ int main(int argc, char **argv) printf("PASSED: %u\n", passed); printf("SKIPPED: %u\n", skipped); printf("FAILED: %u\n", failed); + if (skipped > 0) { + printf("\nSkipped tests:\n"); + for (i = 0; i < skipped; i++) + printf(" - %s\n", skipped_tests[i]); + } + if (failed > 0) { + printf("\nFailed tests:\n"); + for (i = 0; i < failed; i++) + printf(" - %s\n", failed_tests[i]); + } - return 0; + return failed > 0 ? 1 : 0; } void scx_test_register(struct scx_test *test) diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c index bfcb96cd4954..eec70d388cbf 100644 --- a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c +++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c @@ -53,7 +53,7 @@ ddsp: void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) { - if (scx_bpf_dsq_move_to_local(VTIME_DSQ)) + if (scx_bpf_dsq_move_to_local(VTIME_DSQ, 0)) consumed = true; } @@ -66,12 +66,14 @@ void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p) void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p, bool runnable) { - p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; + u64 delta = scale_by_task_weight_inverse(p, SCX_SLICE_DFL - p->scx.slice); + + scx_bpf_task_set_dsq_vtime(p, p->scx.dsq_vtime + delta); } void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p) { - p->scx.dsq_vtime = vtime_now; + scx_bpf_task_set_dsq_vtime(p, vtime_now); } s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init) diff --git a/tools/testing/selftests/sched_ext/total_bw.c b/tools/testing/selftests/sched_ext/total_bw.c new file mode 100644 index 000000000000..5b0a619bab86 --- /dev/null +++ b/tools/testing/selftests/sched_ext/total_bw.c @@ -0,0 +1,281 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test to verify that the total_bw value remains consistent across all CPUs + * in different BPF program states. + * + * Copyright (C) 2025 NVIDIA Corporation. 
+ */ +#include <bpf/bpf.h> +#include <errno.h> +#include <pthread.h> +#include <scx/common.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/wait.h> +#include <time.h> +#include <unistd.h> +#include "minimal.bpf.skel.h" +#include "scx_test.h" + +#define MAX_CPUS 512 +#define STRESS_DURATION_SEC 5 + +struct total_bw_ctx { + struct minimal *skel; + long baseline_bw[MAX_CPUS]; + int nr_cpus; +}; + +static void *cpu_stress_thread(void *arg) +{ + volatile int i; + time_t end_time = time(NULL) + STRESS_DURATION_SEC; + + while (time(NULL) < end_time) + for (i = 0; i < 1000000; i++) + ; + + return NULL; +} + +/* + * The first enqueue on a CPU starts that CPU's DL server, so run + * stressor threads in the hope that they get scheduled on every CPU. + */ +static int run_cpu_stress(int nr_cpus) +{ + pthread_t *threads; + int i, ret = 0; + + threads = calloc(nr_cpus, sizeof(pthread_t)); + if (!threads) + return -ENOMEM; + + /* Create threads to run on each CPU */ + for (i = 0; i < nr_cpus; i++) { + int err = pthread_create(&threads[i], NULL, cpu_stress_thread, NULL); + + if (err) { + ret = -err; + fprintf(stderr, "Failed to create thread %d: %s\n", i, strerror(err)); + break; + } + } + + /* Wait for all threads to complete */ + for (i = 0; i < nr_cpus; i++) { + if (threads[i]) + pthread_join(threads[i], NULL); + } + + free(threads); + return ret; +} + +static int read_total_bw_values(long *bw_values, int max_cpus) +{ + FILE *fp; + char line[256]; + int cpu_count = 0; + + fp = fopen("/sys/kernel/debug/sched/debug", "r"); + if (!fp) { + SCX_ERR("Failed to open debug file"); + return -1; + } + + while (fgets(line, sizeof(line), fp)) { + char *bw_str = strstr(line, "total_bw"); + + if (bw_str) { + bw_str = strchr(bw_str, ':'); + if (bw_str) { + /* Only store up to max_cpus values */ + if (cpu_count < max_cpus) + bw_values[cpu_count] = atol(bw_str + 1); + cpu_count++; + } + } + } + + fclose(fp); + return cpu_count; +} + +static bool verify_total_bw_consistency(long *bw_values, int count) +{ + int i; + long first_value; + + if (count <= 0) + return false; + + first_value = bw_values[0]; + + for (i = 1; i < count; i++) { + if (bw_values[i] != first_value) { + SCX_ERR("Inconsistent total_bw: CPU0=%ld, CPU%d=%ld", + first_value, i, bw_values[i]); + return false; + } + } + + return true; +}
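For reference, read_total_bw_values() above matches any line containing "total_bw" and parses the number after the colon. On recent kernels the per-CPU dl_rq section of /sys/kernel/debug/sched/debug prints lines roughly like the sample below; the exact field layout varies by kernel version, so treat this as illustrative only:

dl_rq[0]:
  .dl_nr_running                 : 0
  .dl_bw->bw                     : 996147
  .dl_bw->total_bw               : 104857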
+ +static int fetch_verify_total_bw(long *bw_values, int nr_cpus) +{ + int attempts = 0; + int max_attempts = 10; + int count; + + /* + * Run the stressor threads before each measurement so that every + * CPU sees an enqueue and its DL server is started. + */ + if (run_cpu_stress(nr_cpus) < 0) { + SCX_ERR("Failed to run CPU stress"); + return -1; + } + + /* Try multiple times to get stable values */ + while (attempts < max_attempts) { + count = read_total_bw_values(bw_values, nr_cpus); + fprintf(stderr, "Read %d total_bw values (testing %d CPUs)\n", count, nr_cpus); + /* If system has more CPUs than we're testing, that's OK */ + if (count < nr_cpus) { + SCX_ERR("Expected at least %d CPUs, got %d", nr_cpus, count); + attempts++; + sleep(1); + continue; + } + + /* Only verify the CPUs we're testing */ + if (verify_total_bw_consistency(bw_values, nr_cpus)) { + fprintf(stderr, "Values are consistent: %ld\n", bw_values[0]); + return 0; + } + + attempts++; + sleep(1); + } + + return -1; +} + +static enum scx_test_status setup(void **ctx) +{ + struct total_bw_ctx *test_ctx; + + if (access("/sys/kernel/debug/sched/debug", R_OK) != 0) { + fprintf(stderr, "Skipping test: debugfs sched/debug not accessible\n"); + return SCX_TEST_SKIP; + } + + test_ctx = calloc(1, sizeof(*test_ctx)); + if (!test_ctx) + return SCX_TEST_FAIL; + + test_ctx->nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + if (test_ctx->nr_cpus <= 0) { + free(test_ctx); + return SCX_TEST_FAIL; + } + + /* If system has more CPUs than MAX_CPUS, just test the first MAX_CPUS */ + if (test_ctx->nr_cpus > MAX_CPUS) + test_ctx->nr_cpus = MAX_CPUS; + + /* + * Test scenario 1: BPF program not loaded. Read and verify the + * baseline total_bw before loading the BPF program. + */ + fprintf(stderr, "BPF prog initially not loaded, reading total_bw values\n"); + if (fetch_verify_total_bw(test_ctx->baseline_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable baseline values"); + free(test_ctx); + return SCX_TEST_FAIL; + } + + /* Load the BPF skeleton */ + test_ctx->skel = minimal__open(); + if (!test_ctx->skel) { + free(test_ctx); + return SCX_TEST_FAIL; + } + + SCX_ENUM_INIT(test_ctx->skel); + if (minimal__load(test_ctx->skel)) { + minimal__destroy(test_ctx->skel); + free(test_ctx); + return SCX_TEST_FAIL; + } + + *ctx = test_ctx; + return SCX_TEST_PASS; +} + +static enum scx_test_status run(void *ctx) +{ + struct total_bw_ctx *test_ctx = ctx; + struct bpf_link *link; + long loaded_bw[MAX_CPUS]; + long unloaded_bw[MAX_CPUS]; + int i; + + /* Test scenario 2: BPF program loaded */ + link = bpf_map__attach_struct_ops(test_ctx->skel->maps.minimal_ops); + if (!link) { + SCX_ERR("Failed to attach scheduler"); + return SCX_TEST_FAIL; + } + + fprintf(stderr, "BPF program loaded, reading total_bw values\n"); + if (fetch_verify_total_bw(loaded_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable values with BPF loaded"); + bpf_link__destroy(link); + return SCX_TEST_FAIL; + } + bpf_link__destroy(link); + + /* Test scenario 3: BPF program unloaded */ + fprintf(stderr, "BPF program unloaded, reading total_bw values\n"); + if (fetch_verify_total_bw(unloaded_bw, test_ctx->nr_cpus) < 0) { + SCX_ERR("Failed to get stable values after BPF unload"); + return SCX_TEST_FAIL; + } + + /* Verify all three scenarios have the same total_bw values */ + for (i = 0; i < test_ctx->nr_cpus; i++) { + if (test_ctx->baseline_bw[i] != loaded_bw[i]) { + SCX_ERR("CPU%d: baseline_bw=%ld != loaded_bw=%ld", + i, test_ctx->baseline_bw[i], loaded_bw[i]); + return SCX_TEST_FAIL; + } + + if (test_ctx->baseline_bw[i] != unloaded_bw[i]) { + SCX_ERR("CPU%d: baseline_bw=%ld != unloaded_bw=%ld", + i, test_ctx->baseline_bw[i], unloaded_bw[i]); + return SCX_TEST_FAIL; + } + } + + fprintf(stderr, "All total_bw values are consistent across all 
scenarios\n"); + return SCX_TEST_PASS; +} + +static void cleanup(void *ctx) +{ + struct total_bw_ctx *test_ctx = ctx; + + if (test_ctx) { + if (test_ctx->skel) + minimal__destroy(test_ctx->skel); + free(test_ctx); + } +} + +struct scx_test total_bw = { + .name = "total_bw", + .description = "Verify total_bw consistency across BPF program states", + .setup = setup, + .run = run, + .cleanup = cleanup, +}; +REGISTER_SCX_TEST(&total_bw) diff --git a/tools/testing/selftests/sched_ext/util.c b/tools/testing/selftests/sched_ext/util.c index e47769c91918..2111329ed289 100644 --- a/tools/testing/selftests/sched_ext/util.c +++ b/tools/testing/selftests/sched_ext/util.c @@ -60,11 +60,11 @@ int file_write_long(const char *path, long val) char buf[64]; int ret; - ret = sprintf(buf, "%lu", val); + ret = sprintf(buf, "%ld", val); if (ret < 0) return ret; - if (write_text(path, buf, sizeof(buf)) <= 0) + if (write_text(path, buf, ret) <= 0) return -1; return 0; diff --git a/tools/testing/selftests/sched_ext/util.h b/tools/testing/selftests/sched_ext/util.h index bc13dfec1267..681cec04b439 100644 --- a/tools/testing/selftests/sched_ext/util.h +++ b/tools/testing/selftests/sched_ext/util.h @@ -10,4 +10,4 @@ long file_read_long(const char *path); int file_write_long(const char *path, long val); -#endif // __SCX_TEST_H__ +#endif // __SCX_TEST_UTIL_H__
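The util.c hunk above fixes two bugs in file_write_long(): a signed long was formatted with "%lu", and write_text() was handed sizeof(buf) (the full 64-byte buffer, mostly trailing NULs) instead of the number of bytes actually formatted, which kernel procfs/sysfs write handlers commonly reject. A standalone sketch of the corrected pattern (hypothetical helper, shown for illustration; the selftest itself keeps using write_text()):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical helper mirroring the fixed file_write_long(): format
 * with %ld and write exactly the bytes produced, not sizeof(buf). */
static int write_long(const char *path, long val)
{
	char buf[64];
	int fd, len;
	ssize_t n;

	len = snprintf(buf, sizeof(buf), "%ld", val);
	if (len < 0 || len >= (int)sizeof(buf))
		return -1;

	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;

	n = write(fd, buf, len);
	close(fd);

	return n == len ? 0 : -1;
}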
