summary | refs | log | tree | commit | diff
path: root/tools/sched_ext/scx_qmap.bpf.c
diff options
context:
space:
mode:
Diffstat (limited to 'tools/sched_ext/scx_qmap.bpf.c')
-rw-r--r-- tools/sched_ext/scx_qmap.bpf.c 334
1 files changed, 270 insertions, 64 deletions
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 3a20bb0c014a..aad698fe294b 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -11,8 +11,6 @@
*
* - BPF-side queueing using PIDs.
* - Sleepable per-task storage allocation using ops.prep_enable().
- * - Using ops.cpu_release() to handle a higher priority scheduling class taking
- * the CPU away.
* - Core-sched support.
*
* This scheduler is primarily for demonstration and testing of sched_ext
@@ -26,8 +24,11 @@
enum consts {
ONE_SEC_IN_NS = 1000000000,
+ ONE_MSEC_IN_NS = 1000000,
+ LOWPRI_INTV_NS = 10 * ONE_MSEC_IN_NS,
SHARED_DSQ = 0,
HIGHPRI_DSQ = 1,
+ LOWPRI_DSQ = 2,
HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */
};
@@ -39,13 +40,20 @@ const volatile u32 stall_kernel_nth;
const volatile u32 dsp_inf_loop_after;
const volatile u32 dsp_batch;
const volatile bool highpri_boosting;
-const volatile bool print_shared_dsq;
+const volatile bool print_dsqs_and_events;
+const volatile bool print_msgs;
+const volatile u64 sub_cgroup_id;
const volatile s32 disallow_tgid;
const volatile bool suppress_dump;
+const volatile bool always_enq_immed;
+const volatile u32 immed_stress_nth;
u64 nr_highpri_queued;
u32 test_error_cnt;
+#define MAX_SUB_SCHEDS 8
+u64 sub_sched_cgroup_ids[MAX_SUB_SCHEDS];
+
UEI_DEFINE(uei);
struct qmap {
@@ -56,7 +64,8 @@ struct qmap {
queue1 SEC(".maps"),
queue2 SEC(".maps"),
queue3 SEC(".maps"),
- queue4 SEC(".maps");
+ queue4 SEC(".maps"),
+ dump_store SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
@@ -125,7 +134,7 @@ struct {
} cpu_ctx_stor SEC(".maps");
/* Statistics */
-u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
+u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_reenqueued_cpu0, nr_dequeued, nr_ddsp_from_enq;
u64 nr_core_sched_execed;
u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
u32 cpuperf_min, cpuperf_avg, cpuperf_max;
@@ -135,8 +144,10 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
{
s32 cpu;
- if (p->nr_cpus_allowed == 1 ||
- scx_bpf_test_and_clear_cpu_idle(prev_cpu))
+ if (!always_enq_immed && p->nr_cpus_allowed == 1)
+ return prev_cpu;
+
+ if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
return prev_cpu;
cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
@@ -148,13 +159,7 @@ static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
static struct task_ctx *lookup_task_ctx(struct task_struct *p)
{
- struct task_ctx *tctx;
-
- if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
- scx_bpf_error("task_ctx lookup failed");
- return NULL;
- }
- return tctx;
+ return bpf_task_storage_get(&task_ctx_stor, p, 0, 0); /* a failed lookup no longer calls scx_bpf_error(); the result is handed back as-is for the caller to check */
}
s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
@@ -164,7 +169,10 @@ s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
s32 cpu;
if (!(tctx = lookup_task_ctx(p)))
- return -ESRCH;
+ return prev_cpu;
+
+ if (p->scx.weight < 2 && !(p->flags & PF_KTHREAD))
+ return prev_cpu;
cpu = pick_direct_dispatch_cpu(p, prev_cpu);
@@ -200,6 +208,12 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
void *ring;
s32 cpu;
+ if (enq_flags & SCX_ENQ_REENQ) {
+ __sync_fetch_and_add(&nr_reenqueued, 1);
+ if (scx_bpf_task_cpu(p) == 0)
+ __sync_fetch_and_add(&nr_reenqueued_cpu0, 1);
+ }
+
if (p->flags & PF_KTHREAD) {
if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
return;
@@ -221,6 +235,22 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
/*
+ * IMMED stress testing: Every immed_stress_nth'th enqueue, dispatch
+ * directly to prev_cpu's local DSQ even when busy to force dsq->nr > 1
+ * and exercise the kernel IMMED reenqueue trigger paths.
+ */
+ if (immed_stress_nth && !(enq_flags & SCX_ENQ_REENQ)) {
+ static u32 immed_stress_cnt;
+
+ if (!(++immed_stress_cnt % immed_stress_nth)) {
+ tctx->force_local = false;
+ scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | scx_bpf_task_cpu(p),
+ slice_ns, enq_flags);
+ return;
+ }
+ }
+
+ /*
* If qmap_select_cpu() is telling us to or this is the last runnable
* task on the CPU, enqueue locally.
*/
@@ -230,8 +260,15 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
return;
}
+ /* see lowpri_timerfn() */
+ if (__COMPAT_has_generic_reenq() &&
+ p->scx.weight < 2 && !(p->flags & PF_KTHREAD) && !(enq_flags & SCX_ENQ_REENQ)) {
+ scx_bpf_dsq_insert(p, LOWPRI_DSQ, slice_ns, enq_flags);
+ return;
+ }
+
/* if select_cpu() wasn't called, try direct dispatch */
- if (!(enq_flags & SCX_ENQ_CPU_SELECTED) &&
+ if (!__COMPAT_is_enq_cpu_selected(enq_flags) &&
(cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
__sync_fetch_and_add(&nr_ddsp_from_enq, 1);
scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
@@ -318,12 +355,9 @@ static bool dispatch_highpri(bool from_timer)
if (tctx->highpri) {
/* exercise the set_*() and vtime interface too */
- __COMPAT_scx_bpf_dsq_move_set_slice(
- BPF_FOR_EACH_ITER, slice_ns * 2);
- __COMPAT_scx_bpf_dsq_move_set_vtime(
- BPF_FOR_EACH_ITER, highpri_seq++);
- __COMPAT_scx_bpf_dsq_move_vtime(
- BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
+ scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
+ scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
+ scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
}
}
@@ -340,9 +374,8 @@ static bool dispatch_highpri(bool from_timer)
else
cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
- if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
- SCX_DSQ_LOCAL_ON | cpu,
- SCX_ENQ_PREEMPT)) {
+ if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
+ SCX_ENQ_PREEMPT)) {
if (cpu == this_cpu) {
dispatched = true;
__sync_fetch_and_add(&nr_expedited_local, 1);
@@ -374,7 +407,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
if (dispatch_highpri(false))
return;
- if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ))
+ if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ, 0))
return;
if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
@@ -432,6 +465,46 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
__sync_fetch_and_add(&nr_dispatched, 1);
scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0);
+
+ /*
+ * scx_qmap uses a global BPF queue that any CPU's
+ * dispatch can pop from. If this CPU popped a task that
+ * can't run here, it gets stranded on SHARED_DSQ after
+ * consume_dispatch_q() skips it. Kick the task's home
+ * CPU so it drains SHARED_DSQ.
+ *
+ * There's a race between the pop and the flush of the
+ * buffered dsq_insert:
+ *
+ * CPU 0 (dispatching) CPU 1 (home, idle)
+ * ~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~
+ * pop from BPF queue
+ * dsq_insert(buffered)
+ * balance:
+ * SHARED_DSQ empty
+ * BPF queue empty
+ * -> goes idle
+ * flush -> on SHARED
+ * kick CPU 1
+ * wakes, drains task
+ *
+ * The kick prevents indefinite stalls but a per-CPU
+ * kthread like ksoftirqd can be briefly stranded when
+ * its home CPU enters idle with softirq pending,
+ * triggering:
+ *
+ * "NOHZ tick-stop error: local softirq work is pending, handler #N!!!"
+ *
+ * from report_idle_softirq(). The kick lands shortly
+ * after and the home CPU drains the task. This could be
+ * avoided by e.g. dispatching pinned tasks to local or
+ * global DSQs, but the current code is left as-is to
+ * document this class of issue -- other schedulers
+ * seeing similar warnings can use this as a reference.
+ */
+ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
+ scx_bpf_kick_cpu(scx_bpf_task_cpu(p), 0);
+
bpf_task_release(p);
batch--;
@@ -439,7 +512,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
if (!batch || !scx_bpf_dispatch_nr_slots()) {
if (dispatch_highpri(false))
return;
- scx_bpf_dsq_move_to_local(SHARED_DSQ);
+ scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
return;
}
if (!cpuc->dsp_cnt)
@@ -449,19 +522,21 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
cpuc->dsp_cnt = 0;
}
+ for (i = 0; i < MAX_SUB_SCHEDS; i++) {
+ if (sub_sched_cgroup_ids[i] &&
+ scx_bpf_sub_dispatch(sub_sched_cgroup_ids[i]))
+ return;
+ }
+
/*
* No other tasks. @prev will keep running. Update its core_sched_seq as
* if the task were enqueued and dispatched immediately.
*/
if (prev) {
tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
- if (!tctx) {
- scx_bpf_error("task_ctx lookup failed");
- return;
- }
-
- tctx->core_sched_seq =
- core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
+ if (tctx)
+ tctx->core_sched_seq =
+ core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
}
}
@@ -499,10 +574,8 @@ static s64 task_qdist(struct task_struct *p)
s64 qdist;
tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
- if (!tctx) {
- scx_bpf_error("task_ctx lookup failed");
+ if (!tctx)
return 0;
- }
qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
@@ -531,21 +604,11 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
return task_qdist(a) > task_qdist(b);
}
-void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
-{
- u32 cnt;
-
- /*
- * Called when @cpu is taken by a higher priority scheduling class. This
- * makes @cpu no longer available for executing sched_ext tasks. As we
- * don't want the tasks in @cpu's local dsq to sit there until @cpu
- * becomes available again, re-enqueue them into the global dsq. See
- * %SCX_ENQ_REENQ handling in qmap_enqueue().
- */
- cnt = scx_bpf_reenqueue_local();
- if (cnt)
- __sync_fetch_and_add(&nr_reenqueued, cnt);
-}
+/*
+ * sched_switch tracepoint and cpu_release handlers are no longer needed.
+ * With SCX_OPS_ALWAYS_ENQ_IMMED, wakeup_preempt_scx() reenqueues IMMED
+ * tasks when a higher-priority scheduling class takes the CPU.
+ */
s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
struct scx_init_task_args *args)
@@ -578,11 +641,26 @@ void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
return;
scx_bpf_dump("QMAP FIFO[%d]:", i);
+
+ /*
+ * Dump can be invoked anytime and there is no way to iterate in
+ * a non-destructive way. Pop and store in dump_store and then
+ * restore afterwards. If racing against new enqueues, ordering
+ * can get mixed up.
+ */
bpf_repeat(4096) {
if (bpf_map_pop_elem(fifo, &pid))
break;
+ bpf_map_push_elem(&dump_store, &pid, 0);
scx_bpf_dump(" %d", pid);
}
+
+ bpf_repeat(4096) {
+ if (bpf_map_pop_elem(&dump_store, &pid))
+ break;
+ bpf_map_push_elem(fifo, &pid, 0);
+ }
+
scx_bpf_dump("\n");
}
}
@@ -615,6 +693,29 @@ void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struc
taskc->force_local, taskc->core_sched_seq);
}
+s32 BPF_STRUCT_OPS(qmap_cgroup_init, struct cgroup *cgrp, struct scx_cgroup_init_args *args)
+{
+ if (print_msgs) /* logging only: nothing from @cgrp or @args is stored here */
+ bpf_printk("CGRP INIT %llu weight=%u period=%lu quota=%ld burst=%lu",
+ cgrp->kn->id, args->weight, args->bw_period_us,
+ args->bw_quota_us, args->bw_burst_us);
+ return 0; /* unconditionally returns 0 */
+}
+
+void BPF_STRUCT_OPS(qmap_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
+{
+ if (print_msgs) /* logging only: @weight is printed, not stored or acted on */
+ bpf_printk("CGRP SET %llu weight=%u", cgrp->kn->id, weight);
+}
+
+void BPF_STRUCT_OPS(qmap_cgroup_set_bandwidth, struct cgroup *cgrp,
+ u64 period_us, u64 quota_us, u64 burst_us)
+{
+ if (print_msgs) /* logging only: the bandwidth parameters are printed, not stored or acted on */
+ bpf_printk("CGRP SET %llu period=%lu quota=%ld burst=%lu", /* NOTE(review): quota_us is u64 but is printed with %ld (signed) */
+ cgrp->kn->id, period_us, quota_us, burst_us);
+}
+
/*
* Print out the online and possible CPU map using bpf_printk() as a
* demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
@@ -656,16 +757,20 @@ static void print_cpus(void)
void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
{
- bpf_printk("CPU %d coming online", cpu);
- /* @cpu is already online at this point */
- print_cpus();
+ if (print_msgs) { /* the online message and CPU map printout now happen only when print_msgs is set */
+ bpf_printk("CPU %d coming online", cpu);
+ /* @cpu is already online at this point */
+ print_cpus();
+ }
}
void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
{
- bpf_printk("CPU %d going offline", cpu);
- /* @cpu is still online at this point */
- print_cpus();
+ if (print_msgs) { /* the offline message and CPU map printout now happen only when print_msgs is set */
+ bpf_printk("CPU %d going offline", cpu);
+ /* @cpu is still online at this point */
+ print_cpus();
+ }
}
struct monitor_timer {
@@ -769,37 +874,104 @@ static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
monitor_cpuperf();
- if (print_shared_dsq)
+ if (print_dsqs_and_events) {
+ struct scx_event_stats events;
+
dump_shared_dsq();
+ __COMPAT_scx_bpf_events(&events, sizeof(events));
+
+ bpf_printk("%35s: %lld", "SCX_EV_SELECT_CPU_FALLBACK",
+ scx_read_event(&events, SCX_EV_SELECT_CPU_FALLBACK));
+ bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE",
+ scx_read_event(&events, SCX_EV_DISPATCH_LOCAL_DSQ_OFFLINE));
+ bpf_printk("%35s: %lld", "SCX_EV_DISPATCH_KEEP_LAST",
+ scx_read_event(&events, SCX_EV_DISPATCH_KEEP_LAST));
+ bpf_printk("%35s: %lld", "SCX_EV_ENQ_SKIP_EXITING",
+ scx_read_event(&events, SCX_EV_ENQ_SKIP_EXITING));
+ bpf_printk("%35s: %lld", "SCX_EV_REFILL_SLICE_DFL",
+ scx_read_event(&events, SCX_EV_REFILL_SLICE_DFL));
+ bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DURATION",
+ scx_read_event(&events, SCX_EV_BYPASS_DURATION));
+ bpf_printk("%35s: %lld", "SCX_EV_BYPASS_DISPATCH",
+ scx_read_event(&events, SCX_EV_BYPASS_DISPATCH));
+ bpf_printk("%35s: %lld", "SCX_EV_BYPASS_ACTIVATE",
+ scx_read_event(&events, SCX_EV_BYPASS_ACTIVATE));
+ }
+
bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
return 0;
}
+struct lowpri_timer { /* value type of the lowpri_timer map */
+ struct bpf_timer timer; /* initialised and started in qmap_init(), re-armed by lowpri_timerfn() */
+};
+
+struct { /* map holding the timer that drives lowpri_timerfn() */
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1); /* single slot; qmap_init() looks it up with key 0 */
+ __type(key, u32);
+ __type(value, struct lowpri_timer);
+} lowpri_timer SEC(".maps");
+
+/*
+ * Nice 19 tasks (weight < 2, non-kthread; see qmap_enqueue()) are put into the lowpri DSQ. Every
+ * 10ms (LOWPRI_INTV_NS), reenq is triggered and the tasks are transferred to SHARED_DSQ.
+ */
+static int lowpri_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+ scx_bpf_dsq_reenq(LOWPRI_DSQ, 0); /* presumably the tasks come back through qmap_enqueue() with SCX_ENQ_REENQ set, which skips its LOWPRI_DSQ branch -- confirm */
+ bpf_timer_start(timer, LOWPRI_INTV_NS, 0); /* re-arm for the next interval; the return value is not checked */
+ return 0;
+}
+
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
u32 key = 0;
struct bpf_timer *timer;
s32 ret;
- print_cpus();
+ if (print_msgs && !sub_cgroup_id) /* CPU map is printed only when print_msgs is set and sub_cgroup_id is 0 */
+ print_cpus();
ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
- if (ret)
+ if (ret) {
+ scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret);
return ret;
+ }
ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
+ if (ret) {
+ scx_bpf_error("failed to create DSQ %d (%d)", HIGHPRI_DSQ, ret);
+ return ret;
+ }
+
+ ret = scx_bpf_create_dsq(LOWPRI_DSQ, -1); /* NOTE(review): unlike the two DSQs above, a failure here returns without calling scx_bpf_error() */
if (ret)
return ret;
timer = bpf_map_lookup_elem(&monitor_timer, &key);
if (!timer)
return -ESRCH;
-
bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
bpf_timer_set_callback(timer, monitor_timerfn);
+ ret = bpf_timer_start(timer, ONE_SEC_IN_NS, 0); /* started here, no longer as the final return, so the lowpri timer can be set up afterwards */
+ if (ret)
+ return ret;
+
+ if (__COMPAT_has_generic_reenq()) { /* the lowpri timer is only set up when this check passes */
+ /* see lowpri_timerfn() */
+ timer = bpf_map_lookup_elem(&lowpri_timer, &key);
+ if (!timer)
+ return -ESRCH;
+ bpf_timer_init(timer, &lowpri_timer, CLOCK_MONOTONIC);
+ bpf_timer_set_callback(timer, lowpri_timerfn);
+ ret = bpf_timer_start(timer, LOWPRI_INTV_NS, 0); /* armed with LOWPRI_INTV_NS (10ms); lowpri_timerfn() re-arms itself with the same interval */
+ if (ret)
+ return ret;
+ }
- return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
+ return 0;
}
void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
@@ -807,6 +979,36 @@ void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
UEI_RECORD(uei, ei);
}
+s32 BPF_STRUCT_OPS(qmap_sub_attach, struct scx_sub_attach_args *args)
+{
+ s32 i;
+
+ for (i = 0; i < MAX_SUB_SCHEDS; i++) { /* take the first free slot; a zero entry marks a slot as unused */
+ if (!sub_sched_cgroup_ids[i]) {
+ sub_sched_cgroup_ids[i] = args->ops->sub_cgroup_id; /* qmap_dispatch() calls scx_bpf_sub_dispatch() for each non-zero entry */
+ bpf_printk("attaching sub-sched[%d] on %s", /* NOTE(review): printed unconditionally, not gated on print_msgs */
+ i, args->cgroup_path);
+ return 0;
+ }
+ }
+
+ return -ENOSPC; /* all MAX_SUB_SCHEDS slots are already in use */
+}
+
+void BPF_STRUCT_OPS(qmap_sub_detach, struct scx_sub_detach_args *args)
+{
+ s32 i;
+
+ for (i = 0; i < MAX_SUB_SCHEDS; i++) { /* find the slot that qmap_sub_attach() filled for this sub-scheduler */
+ if (sub_sched_cgroup_ids[i] == args->ops->sub_cgroup_id) {
+ sub_sched_cgroup_ids[i] = 0; /* zero marks the slot as free again */
+ bpf_printk("detaching sub-sched[%d] on %s", /* NOTE(review): printed unconditionally, not gated on print_msgs */
+ i, args->cgroup_path);
+ break; /* only the first matching slot is cleared; no match is a silent no-op */
+ }
+ }
+}
+
SCX_OPS_DEFINE(qmap_ops,
.select_cpu = (void *)qmap_select_cpu,
.enqueue = (void *)qmap_enqueue,
@@ -814,11 +1016,15 @@ SCX_OPS_DEFINE(qmap_ops,
.dispatch = (void *)qmap_dispatch,
.tick = (void *)qmap_tick,
.core_sched_before = (void *)qmap_core_sched_before,
- .cpu_release = (void *)qmap_cpu_release,
.init_task = (void *)qmap_init_task,
.dump = (void *)qmap_dump,
.dump_cpu = (void *)qmap_dump_cpu,
.dump_task = (void *)qmap_dump_task,
+ .cgroup_init = (void *)qmap_cgroup_init,
+ .cgroup_set_weight = (void *)qmap_cgroup_set_weight,
+ .cgroup_set_bandwidth = (void *)qmap_cgroup_set_bandwidth,
+ .sub_attach = (void *)qmap_sub_attach,
+ .sub_detach = (void *)qmap_sub_detach,
.cpu_online = (void *)qmap_cpu_online,
.cpu_offline = (void *)qmap_cpu_offline,
.init = (void *)qmap_init,