summaryrefslogtreecommitdiff
path: root/tools/testing/selftests/sched_ext/dequeue.bpf.c
blob: 624e2ccb06884e86140702740923437dd528d20a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
// SPDX-License-Identifier: GPL-2.0
/*
 * A scheduler that validates ops.dequeue() is called correctly:
 * - Tasks dispatched to terminal DSQs (local, global) bypass the BPF
 *   scheduler entirely: no ops.dequeue() should be called
 * - Tasks dispatched to user DSQs from ops.enqueue() enter BPF custody:
 *   ops.dequeue() must be called when they leave custody
 * - Every ops.enqueue() dispatch to non-terminal DSQs is followed by
 *   exactly one ops.dequeue() (validate 1:1 pairing and state machine)
 *
 * Copyright (c) 2026 NVIDIA Corporation.
 */

#include <scx/common.bpf.h>

#define SHARED_DSQ	0

/*
 * BPF internal FIFO queue of task PIDs, used by scenario 6 only.
 *
 * ops.enqueue() pushes PIDs here and ops.dispatch() pops and dispatches
 * them, validating that tasks parked in internal BPF data structures
 * still get ops.dequeue() when they leave BPF custody.
 */
struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 32768);
	__type(value, s32);
} global_queue SEC(".maps");

char _license[] SEC("license") = "GPL";

UEI_DEFINE(uei);

/*
 * Counters tracking the lifecycle of tasks, updated atomically with
 * __sync_fetch_and_add() (NOTE(review): presumably read back by the
 * userspace half of this selftest to validate 1:1 pairing — confirm
 * against the companion .c file):
 * - enqueue_cnt: dispatches that entered BPF custody, i.e. inserts into
 *   the shared user DSQ or successful internal-queue pushes (terminal
 *   DSQ dispatches are intentionally not counted)
 * - dequeue_cnt: total number of ops.dequeue() invocations (any type)
 * - dispatch_dequeue_cnt: dequeues without SCX_DEQ_SCHED_CHANGE set
 * - change_dequeue_cnt: property change dequeues (SCX_DEQ_SCHED_CHANGE)
 * - bpf_queue_full: failed pushes to the internal BPF queue (scenario 6)
 */
u64 enqueue_cnt, dequeue_cnt, dispatch_dequeue_cnt, change_dequeue_cnt, bpf_queue_full;

/*
 * Selects which dispatch path the callbacks below exercise.
 * NOTE(review): presumably written by the userspace test before the
 * scheduler is attached — confirm against the companion .c file.
 *
 * Test scenarios:
 * 0) Dispatch to local DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
 *    scheduler, no dequeue callbacks)
 * 1) Dispatch to global DSQ from ops.select_cpu() (terminal DSQ, bypasses BPF
 *    scheduler, no dequeue callbacks)
 * 2) Dispatch to shared user DSQ from ops.select_cpu() (enters BPF scheduler,
 *    dequeue callbacks expected)
 * 3) Dispatch to local DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
 *    scheduler, no dequeue callbacks)
 * 4) Dispatch to global DSQ from ops.enqueue() (terminal DSQ, bypasses BPF
 *    scheduler, no dequeue callbacks)
 * 5) Dispatch to shared user DSQ from ops.enqueue() (enters BPF scheduler,
 *    dequeue callbacks expected)
 * 6) BPF internal queue from ops.enqueue(): store task PIDs in ops.enqueue(),
 *    consume in ops.dispatch() and dispatch to local DSQ (validates dequeue
 *    for tasks stored in internal BPF data structures)
 */
u32 test_scenario;

/*
 * Per-task state to track lifecycle and validate workflow semantics.
 * State transitions:
 *   NONE -> ENQUEUED (on enqueue)
 *   NONE -> DISPATCHED (on direct dispatch to terminal DSQ)
 *   ENQUEUED -> DISPATCHED (on dispatch dequeue)
 *   DISPATCHED -> NONE (on property change dequeue or re-enqueue)
 *   ENQUEUED -> NONE (on property change dequeue before dispatch)
 */
enum task_state {
	TASK_NONE = 0,		/* outside scheduler control */
	TASK_ENQUEUED,		/* in BPF custody (user DSQ or internal queue) */
	TASK_DISPATCHED,	/* handed to a terminal DSQ */
};

/* Per-task tracking data, kept in the task_ctx_stor task-storage map. */
struct task_ctx {
	enum task_state state; /* Current state in the workflow */
	u64 enqueue_seq;       /* Sequence number for debugging */
};

/*
 * Task-local storage holding one struct task_ctx per task, created in
 * ops.init_task() and looked up from every callback below.
 */
struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

/*
 * Fetch the per-task context created in ops.init_task().
 * Returns NULL when no storage exists for @p.
 */
static struct task_ctx *try_lookup_task_ctx(struct task_struct *p)
{
	struct task_ctx *tctx;

	tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);

	return tctx;
}

/*
 * ops.select_cpu(): exercise the direct-dispatch paths for scenarios
 * 0-2. Always returns @prev_cpu; only the dispatch target differs.
 */
s32 BPF_STRUCT_OPS(dequeue_select_cpu, struct task_struct *p,
		   s32 prev_cpu, u64 wake_flags)
{
	u32 scenario = test_scenario;
	struct task_ctx *ctx = try_lookup_task_ctx(p);

	if (!ctx)
		return prev_cpu;

	if (scenario == 0) {
		/*
		 * Scenario 0: direct dispatch to the local DSQ, a terminal
		 * DSQ. The task bypasses the BPF scheduler entirely: no
		 * enqueue tracking, no ops.dequeue() callbacks.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		ctx->state = TASK_DISPATCHED;
	} else if (scenario == 1) {
		/*
		 * Scenario 1: direct dispatch to the global DSQ, also a
		 * terminal DSQ: no enqueue tracking, no ops.dequeue()
		 * callbacks.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);
		ctx->state = TASK_DISPATCHED;
	} else if (scenario == 2) {
		/*
		 * Scenario 2: dispatch to the shared user DSQ. The task
		 * enters BPF scheduler management: track the
		 * enqueue/dequeue lifecycle and validate state
		 * transitions.
		 */
		if (ctx->state == TASK_ENQUEUED)
			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
				      p->pid, p->comm, ctx->enqueue_seq);

		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, 0);

		__sync_fetch_and_add(&enqueue_cnt, 1);

		ctx->state = TASK_ENQUEUED;
		ctx->enqueue_seq++;
	}

	return prev_cpu;
}

/*
 * ops.enqueue(): exercise the enqueue-time dispatch paths for scenarios
 * 3-6; any other scenario falls through to the terminal global DSQ.
 * Finishes by kicking the task's CPU if it is idle.
 */
void BPF_STRUCT_OPS(dequeue_enqueue, struct task_struct *p, u64 enq_flags)
{
	u32 scenario = test_scenario;
	struct task_ctx *ctx;
	s32 pid = p->pid;

	ctx = try_lookup_task_ctx(p);
	if (!ctx)
		return;

	if (scenario == 3) {
		/*
		 * Scenario 3: direct dispatch to the local DSQ, a terminal
		 * DSQ. The task bypasses the BPF scheduler entirely: no
		 * enqueue tracking, no ops.dequeue() callbacks.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
		ctx->state = TASK_DISPATCHED;
	} else if (scenario == 4) {
		/*
		 * Scenario 4: direct dispatch to the global DSQ, also a
		 * terminal DSQ: no enqueue tracking, no ops.dequeue()
		 * callbacks.
		 */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
		ctx->state = TASK_DISPATCHED;
	} else if (scenario == 5) {
		/*
		 * Scenario 5: dispatch to the shared user DSQ. The task
		 * enters BPF scheduler management: track the
		 * enqueue/dequeue lifecycle and validate state
		 * transitions.
		 */
		if (ctx->state == TASK_ENQUEUED)
			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
				      p->pid, p->comm, ctx->enqueue_seq);

		scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);

		__sync_fetch_and_add(&enqueue_cnt, 1);

		ctx->state = TASK_ENQUEUED;
		ctx->enqueue_seq++;
	} else if (scenario == 6) {
		/*
		 * Scenario 6: park the task's PID in the BPF internal
		 * queue; ops.dispatch() consumes it later. Validates that
		 * tasks stored in internal BPF data structures still get
		 * ops.dequeue() when they leave BPF custody.
		 */
		if (ctx->state == TASK_ENQUEUED)
			scx_bpf_error("%d (%s): enqueue while in ENQUEUED state seq=%llu",
				      p->pid, p->comm, ctx->enqueue_seq);

		if (bpf_map_push_elem(&global_queue, &pid, 0)) {
			/* Queue full: fall back to the terminal global DSQ */
			scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
			__sync_fetch_and_add(&bpf_queue_full, 1);

			ctx->state = TASK_DISPATCHED;
		} else {
			__sync_fetch_and_add(&enqueue_cnt, 1);

			ctx->state = TASK_ENQUEUED;
			ctx->enqueue_seq++;
		}
	} else {
		/* All other scenarios: dispatch to the global DSQ */
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
		ctx->state = TASK_DISPATCHED;
	}

	scx_bpf_kick_cpu(scx_bpf_task_cpu(p), SCX_KICK_IDLE);
}

/*
 * ops.dequeue(): called when a task leaves BPF scheduler custody.
 *
 * Counts every invocation, flags invocations that should be impossible
 * (terminal-DSQ scenarios 0/1/3/4), then validates the per-task state
 * machine for the two dequeue flavors: property change
 * (SCX_DEQ_SCHED_CHANGE) vs. regular dispatch dequeue.
 */
void BPF_STRUCT_OPS(dequeue_dequeue, struct task_struct *p, u64 deq_flags)
{
	struct task_ctx *tctx;

	/* Count every dequeue, even ones we can't attribute to a task ctx */
	__sync_fetch_and_add(&dequeue_cnt, 1);

	tctx = try_lookup_task_ctx(p);
	if (!tctx)
		return;

	/*
	 * For scenarios 0, 1, 3, and 4 (terminal DSQs: local and global),
	 * ops.dequeue() should never be called because tasks bypass the
	 * BPF scheduler entirely. If we get here, it's a kernel bug.
	 */
	if (test_scenario == 0 || test_scenario == 3) {
		scx_bpf_error("%d (%s): dequeue called for local DSQ scenario",
			      p->pid, p->comm);
		return;
	}

	if (test_scenario == 1 || test_scenario == 4) {
		scx_bpf_error("%d (%s): dequeue called for global DSQ scenario",
			      p->pid, p->comm);
		return;
	}

	if (deq_flags & SCX_DEQ_SCHED_CHANGE) {
		/*
		 * Property change interrupting the workflow. Valid from
		 * both ENQUEUED and DISPATCHED states. Transitions task
		 * back to NONE state.
		 */
		__sync_fetch_and_add(&change_dequeue_cnt, 1);

		/* Validate state transition */
		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_DISPATCHED)
			scx_bpf_error("%d (%s): invalid property change dequeue state=%d seq=%llu",
				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);

		/*
		 * Transition back to NONE: task outside scheduler control.
		 *
		 * Scenario 6: dispatch() checks tctx->state after popping a
		 * PID, if the task is in state NONE, it was dequeued by
		 * property change and must not be dispatched (this
		 * prevents "target CPU not allowed").
		 */
		tctx->state = TASK_NONE;
	} else {
		/*
		 * Regular dispatch dequeue: kernel is moving the task from
		 * BPF custody to a terminal DSQ. Normally we come from
		 * ENQUEUED state. We can also see TASK_NONE if the task
		 * was dequeued by property change (SCX_DEQ_SCHED_CHANGE)
		 * while it was already on a DSQ (dispatched but not yet
		 * consumed); in that case we just leave state as NONE.
		 */
		__sync_fetch_and_add(&dispatch_dequeue_cnt, 1);

		/*
		 * Must be ENQUEUED (normal path) or NONE (already dequeued
		 * by property change while on a DSQ).
		 */
		if (tctx->state != TASK_ENQUEUED && tctx->state != TASK_NONE)
			scx_bpf_error("%d (%s): dispatch dequeue from state %d seq=%llu",
				      p->pid, p->comm, tctx->state, tctx->enqueue_seq);

		if (tctx->state == TASK_ENQUEUED)
			tctx->state = TASK_DISPATCHED;

		/* NONE: leave as-is, task was already property-change dequeued */
	}
}

/*
 * ops.dispatch(): for scenario 6, pop one PID from the internal queue
 * and dispatch the task; for every other scenario, consume from the
 * shared user DSQ.
 */
void BPF_STRUCT_OPS(dequeue_dispatch, s32 cpu, struct task_struct *prev)
{
	struct task_ctx *ctx;
	struct task_struct *p;
	s32 pid;

	if (test_scenario != 6) {
		/* Move the first task of the shared user DSQ to this CPU */
		scx_bpf_dsq_move_to_local(SHARED_DSQ, 0);
		return;
	}

	if (bpf_map_pop_elem(&global_queue, &pid))
		return;

	p = bpf_task_from_pid(pid);
	if (!p)
		return;

	/*
	 * Skip dispatch if the task was dequeued by property change
	 * (ops.dequeue() reset its state to TASK_NONE) while parked in
	 * the internal queue.
	 */
	ctx = try_lookup_task_ctx(p);
	if (!ctx || ctx->state == TASK_NONE) {
		bpf_task_release(p);
		return;
	}

	/*
	 * Prefer this CPU's local DSQ when the task's affinity allows it,
	 * otherwise fall back to the global DSQ.
	 */
	if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr))
		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, 0);
	else
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0);

	bpf_task_release(p);
}

/*
 * ops.init_task(): create the per-task context entry up front so every
 * other callback can simply look it up.
 */
s32 BPF_STRUCT_OPS(dequeue_init_task, struct task_struct *p,
		   struct scx_init_task_args *args)
{
	if (!bpf_task_storage_get(&task_ctx_stor, p, 0,
				  BPF_LOCAL_STORAGE_GET_F_CREATE))
		return -ENOMEM;

	return 0;
}

/*
 * ops.init(): create the shared user DSQ used by scenarios 2 and 5.
 * Returns 0 on success or the error from scx_bpf_create_dsq().
 */
s32 BPF_STRUCT_OPS_SLEEPABLE(dequeue_init)
{
	return scx_bpf_create_dsq(SHARED_DSQ, -1);
}

/*
 * ops.exit(): record the exit info into @uei (UEI_DEFINE above) —
 * presumably inspected by the userspace side of the test to determine
 * how the scheduler exited.
 */
void BPF_STRUCT_OPS(dequeue_exit, struct scx_exit_info *ei)
{
	UEI_RECORD(uei, ei);
}

/* Scheduler definition wiring the callbacks above into sched_ext. */
SEC(".struct_ops.link")
struct sched_ext_ops dequeue_ops = {
	.select_cpu		= (void *)dequeue_select_cpu,
	.enqueue		= (void *)dequeue_enqueue,
	.dequeue		= (void *)dequeue_dequeue,
	.dispatch		= (void *)dequeue_dispatch,
	.init_task		= (void *)dequeue_init_task,
	.init			= (void *)dequeue_init,
	.exit			= (void *)dequeue_exit,
	/*
	 * NOTE(review): SCX_OPS_ENQ_LAST presumably routes even the last
	 * runnable task through ops.enqueue() — confirm against the
	 * sched_ext documentation.
	 */
	.flags			= SCX_OPS_ENQ_LAST,
	.name			= "dequeue_test",
};