// SPDX-License-Identifier: GPL-2.0 /* * Test to verify that total_bw value remains consistent across all CPUs * in different BPF program states. * * Copyright (C) 2025 NVIDIA Corporation. */ #include #include #include #include #include #include #include #include #include #include "minimal.bpf.skel.h" #include "scx_test.h" #define MAX_CPUS 512 #define STRESS_DURATION_SEC 5 struct total_bw_ctx { struct minimal *skel; long baseline_bw[MAX_CPUS]; int nr_cpus; }; static void *cpu_stress_thread(void *arg) { volatile int i; time_t end_time = time(NULL) + STRESS_DURATION_SEC; while (time(NULL) < end_time) for (i = 0; i < 1000000; i++) ; return NULL; } /* * The first enqueue on a CPU causes the DL server to start, for that * reason run stressor threads in the hopes it schedules on all CPUs. */ static int run_cpu_stress(int nr_cpus) { pthread_t *threads; int i, ret = 0; threads = calloc(nr_cpus, sizeof(pthread_t)); if (!threads) return -ENOMEM; /* Create threads to run on each CPU */ for (i = 0; i < nr_cpus; i++) { if (pthread_create(&threads[i], NULL, cpu_stress_thread, NULL)) { ret = -errno; fprintf(stderr, "Failed to create thread %d: %s\n", i, strerror(-ret)); break; } } /* Wait for all threads to complete */ for (i = 0; i < nr_cpus; i++) { if (threads[i]) pthread_join(threads[i], NULL); } free(threads); return ret; } static int read_total_bw_values(long *bw_values, int max_cpus) { FILE *fp; char line[256]; int cpu_count = 0; fp = fopen("/sys/kernel/debug/sched/debug", "r"); if (!fp) { SCX_ERR("Failed to open debug file"); return -1; } while (fgets(line, sizeof(line), fp)) { char *bw_str = strstr(line, "total_bw"); if (bw_str) { bw_str = strchr(bw_str, ':'); if (bw_str) { /* Only store up to max_cpus values */ if (cpu_count < max_cpus) bw_values[cpu_count] = atol(bw_str + 1); cpu_count++; } } } fclose(fp); return cpu_count; } /* * Read a per-CPU dl_server param (runtime or period) from debugfs. * Returns the value in nanoseconds, or -1 on failure. */ static long read_server_param(const char *server, const char *param, int cpu) { char path[128]; long value = -1; FILE *fp; snprintf(path, sizeof(path), "/sys/kernel/debug/sched/%s_server/cpu%d/%s", server, cpu, param); fp = fopen(path, "r"); if (!fp) return -1; if (fscanf(fp, "%ld", &value) != 1) value = -1; fclose(fp); return value; } /* * Write a per-CPU dl_server param to debugfs. Returns 0 on success. */ static int write_server_param(const char *server, const char *param, int cpu, long value) { char path[128]; FILE *fp; int ret = 0; snprintf(path, sizeof(path), "/sys/kernel/debug/sched/%s_server/cpu%d/%s", server, cpu, param); fp = fopen(path, "w"); if (!fp) return -1; if (fprintf(fp, "%ld", value) < 0) ret = -1; if (fclose(fp) != 0) ret = -1; return ret; } static int read_fair_runtime_all(int nr_cpus, long *runtimes) { int i; for (i = 0; i < nr_cpus; i++) { runtimes[i] = read_server_param("fair", "runtime", i); if (runtimes[i] <= 0) return -1; } return 0; } static int write_fair_runtime_all(int nr_cpus, long value) { int i; for (i = 0; i < nr_cpus; i++) { if (write_server_param("fair", "runtime", i, value) < 0) { SCX_ERR("Failed to write fair_server runtime on CPU %d", i); return -1; } } return 0; } /* * Restore per-CPU fair_server runtimes. */ static int restore_fair_runtime_all(int nr_cpus, const long *runtimes) { int ret = 0; int i; for (i = 0; i < nr_cpus; i++) { if (write_server_param("fair", "runtime", i, runtimes[i]) < 0) { SCX_ERR("Failed to restore fair_server runtime on CPU %d", i); ret = -1; } } return ret; } static bool verify_total_bw_consistency(long *bw_values, int count) { int i; long first_value; if (count <= 0) return false; first_value = bw_values[0]; for (i = 1; i < count; i++) { if (bw_values[i] != first_value) { SCX_ERR("Inconsistent total_bw: CPU0=%ld, CPU%d=%ld", first_value, i, bw_values[i]); return false; } } return true; } static int fetch_verify_total_bw(long *bw_values, int nr_cpus) { int attempts = 0; int max_attempts = 10; int count; /* * The first enqueue on a CPU causes the DL server to start, for that * reason run stressor threads in the hopes it schedules on all CPUs. */ if (run_cpu_stress(nr_cpus) < 0) { SCX_ERR("Failed to run CPU stress"); return -1; } /* Try multiple times to get stable values */ while (attempts < max_attempts) { count = read_total_bw_values(bw_values, nr_cpus); fprintf(stderr, "Read %d total_bw values (testing %d CPUs)\n", count, nr_cpus); /* If system has more CPUs than we're testing, that's OK */ if (count < nr_cpus) { SCX_ERR("Expected at least %d CPUs, got %d", nr_cpus, count); attempts++; sleep(1); continue; } /* Only verify the CPUs we're testing */ if (verify_total_bw_consistency(bw_values, nr_cpus)) { fprintf(stderr, "Values are consistent: %ld\n", bw_values[0]); return 0; } attempts++; sleep(1); } return -1; } static enum scx_test_status setup(void **ctx) { struct total_bw_ctx *test_ctx; if (access("/sys/kernel/debug/sched/debug", R_OK) != 0) { fprintf(stderr, "Skipping test: debugfs sched/debug not accessible\n"); return SCX_TEST_SKIP; } test_ctx = calloc(1, sizeof(*test_ctx)); if (!test_ctx) return SCX_TEST_FAIL; test_ctx->nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); if (test_ctx->nr_cpus <= 0) { free(test_ctx); return SCX_TEST_FAIL; } /* If system has more CPUs than MAX_CPUS, just test the first MAX_CPUS */ if (test_ctx->nr_cpus > MAX_CPUS) test_ctx->nr_cpus = MAX_CPUS; /* Test scenario 1: BPF program not loaded */ /* Read and verify baseline total_bw before loading BPF program */ fprintf(stderr, "BPF prog initially not loaded, reading total_bw values\n"); if (fetch_verify_total_bw(test_ctx->baseline_bw, test_ctx->nr_cpus) < 0) { SCX_ERR("Failed to get stable baseline values"); free(test_ctx); return SCX_TEST_FAIL; } /* Load the BPF skeleton */ test_ctx->skel = minimal__open(); if (!test_ctx->skel) { free(test_ctx); return SCX_TEST_FAIL; } SCX_ENUM_INIT(test_ctx->skel); if (minimal__load(test_ctx->skel)) { minimal__destroy(test_ctx->skel); free(test_ctx); return SCX_TEST_FAIL; } *ctx = test_ctx; return SCX_TEST_PASS; } static enum scx_test_status run(void *ctx) { struct total_bw_ctx *test_ctx = ctx; struct bpf_link *link; long loaded_bw[MAX_CPUS]; long unloaded_bw[MAX_CPUS]; long doubled_bw[MAX_CPUS]; long original_runtime[MAX_CPUS], doubled_runtime; enum scx_test_status ret; int i; /* Test scenario 2: BPF program loaded */ link = bpf_map__attach_struct_ops(test_ctx->skel->maps.minimal_ops); if (!link) { SCX_ERR("Failed to attach scheduler"); return SCX_TEST_FAIL; } fprintf(stderr, "BPF program loaded, reading total_bw values\n"); if (fetch_verify_total_bw(loaded_bw, test_ctx->nr_cpus) < 0) { SCX_ERR("Failed to get stable values with BPF loaded"); bpf_link__destroy(link); return SCX_TEST_FAIL; } bpf_link__destroy(link); /* Test scenario 3: BPF program unloaded */ fprintf(stderr, "BPF program unloaded, reading total_bw values\n"); if (fetch_verify_total_bw(unloaded_bw, test_ctx->nr_cpus) < 0) { SCX_ERR("Failed to get stable values after BPF unload"); return SCX_TEST_FAIL; } /* Verify all three scenarios have the same total_bw values */ for (i = 0; i < test_ctx->nr_cpus; i++) { if (test_ctx->baseline_bw[i] != loaded_bw[i]) { SCX_ERR("CPU%d: baseline_bw=%ld != loaded_bw=%ld", i, test_ctx->baseline_bw[i], loaded_bw[i]); return SCX_TEST_FAIL; } if (test_ctx->baseline_bw[i] != unloaded_bw[i]) { SCX_ERR("CPU%d: baseline_bw=%ld != unloaded_bw=%ld", i, test_ctx->baseline_bw[i], unloaded_bw[i]); return SCX_TEST_FAIL; } } fprintf(stderr, "All total_bw values are consistent across all scenarios\n"); /* * Validate auto-register/unregister of dl_server bandwidth reservations. * * Doubling fair_server's runtime doubles its bw contribution. With a * full-mode BPF scheduler (minimal_ops), the kernel should detach * fair_server and attach ext_server, dropping total_bw back to its * pre-customization (default ext_server-only) value. On unload, the * fair_server reservation should come back with its customized runtime * preserved, so total_bw doubles again. */ if (read_fair_runtime_all(test_ctx->nr_cpus, original_runtime) < 0) { fprintf(stderr, "Skipping attach/detach validation: debugfs not accessible\n"); return SCX_TEST_PASS; } doubled_runtime = original_runtime[0] * 2; fprintf(stderr, "Setting fair_server runtime to %ld ns on all CPUs (orig %ld)\n", doubled_runtime, original_runtime[0]); if (write_fair_runtime_all(test_ctx->nr_cpus, doubled_runtime) < 0) { ret = SCX_TEST_FAIL; goto restore; } if (fetch_verify_total_bw(doubled_bw, test_ctx->nr_cpus) < 0) { SCX_ERR("Failed to get stable values after doubling fair runtime"); ret = SCX_TEST_FAIL; goto restore; } /* * After doubling the runtime, fair_server's bw contribution must grow. * We don't assert exactly 2x, because the kernel's to_ratio() truncates * the value, so 2 * to_ratio(period, runtime) and * to_ratio(period, 2 * runtime) can differ. */ for (i = 0; i < test_ctx->nr_cpus; i++) { if (doubled_bw[i] <= test_ctx->baseline_bw[i]) { SCX_ERR("CPU%d: fair did not increase total_bw (baseline=%ld, doubled=%ld)", i, test_ctx->baseline_bw[i], doubled_bw[i]); ret = SCX_TEST_FAIL; goto restore; } } link = bpf_map__attach_struct_ops(test_ctx->skel->maps.minimal_ops); if (!link) { SCX_ERR("Failed to attach scheduler for detach test"); ret = SCX_TEST_FAIL; goto restore; } if (fetch_verify_total_bw(loaded_bw, test_ctx->nr_cpus) < 0) { SCX_ERR("Failed to get stable values with BPF loaded (detach test)"); bpf_link__destroy(link); ret = SCX_TEST_FAIL; goto restore; } /* * In full mode the customized fair_server is detached and ext_server is * attached at its default runtime, total_bw must match baseline. */ for (i = 0; i < test_ctx->nr_cpus; i++) { if (loaded_bw[i] != test_ctx->baseline_bw[i]) { SCX_ERR("CPU%d: expected bw %ld (fair detached, ext default), got %ld", i, test_ctx->baseline_bw[i], loaded_bw[i]); bpf_link__destroy(link); ret = SCX_TEST_FAIL; goto restore; } } bpf_link__destroy(link); if (fetch_verify_total_bw(unloaded_bw, test_ctx->nr_cpus) < 0) { SCX_ERR("Failed to get stable values after BPF unload (detach test)"); ret = SCX_TEST_FAIL; goto restore; } /* * After unload, fair_server is re-attached with its preserved 2x * runtime, so total_bw should return to the doubled value. */ for (i = 0; i < test_ctx->nr_cpus; i++) { if (unloaded_bw[i] != doubled_bw[i]) { SCX_ERR("CPU%d: BPF unloaded: expected %ld (fair restored at 2x), got %ld", i, doubled_bw[i], unloaded_bw[i]); ret = SCX_TEST_FAIL; goto restore; } } fprintf(stderr, "dl_server attach/detach with customized fair runtime verified\n"); ret = SCX_TEST_PASS; restore: if (restore_fair_runtime_all(test_ctx->nr_cpus, original_runtime) < 0) SCX_ERR("Failed to fully restore per-CPU fair_server runtimes"); return ret; } static void cleanup(void *ctx) { struct total_bw_ctx *test_ctx = ctx; if (test_ctx) { if (test_ctx->skel) minimal__destroy(test_ctx->skel); free(test_ctx); } } struct scx_test total_bw = { .name = "total_bw", .description = "Verify total_bw consistency across BPF program states", .setup = setup, .run = run, .cleanup = cleanup, }; REGISTER_SCX_TEST(&total_bw)