diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-15 10:32:08 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-15 10:32:08 -0700 |
| commit | 7de6b4a246330fe29fa2fd144b4724ca35d60d6c (patch) | |
| tree | e9ef69537787ded9bf98dba3e31ca1d50a829889 /lib | |
| parent | b71f0be2d23d876648758d57bc6761500e3b9c70 (diff) | |
| parent | 76af54648899abbd6b449c035583e47fd407078a (diff) | |
| download | lwn-7de6b4a246330fe29fa2fd144b4724ca35d60d6c.tar.gz lwn-7de6b4a246330fe29fa2fd144b4724ca35d60d6c.zip | |
Merge tag 'wq-for-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq
Pull workqueue updates from Tejun Heo:
- New default WQ_AFFN_CACHE_SHARD affinity scope subdivides LLCs into
smaller shards to improve scalability on machines with many CPUs per
LLC
- Misc:
- system_dfl_long_wq for long unbound works
- devm_alloc_workqueue() for device-managed allocation
- sysfs exposure for ordered workqueues and the EFI workqueue
- removal of HK_TYPE_WQ from wq_unbound_cpumask
- various small fixes
* tag 'wq-for-7.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq: (21 commits)
workqueue: validate cpumask_first() result in llc_populate_cpu_shard_id()
workqueue: use NR_STD_WORKER_POOLS instead of hardcoded value
workqueue: avoid unguarded 64-bit division
docs: workqueue: document WQ_AFFN_CACHE_SHARD affinity scope
workqueue: add test_workqueue benchmark module
tools/workqueue: add CACHE_SHARD support to wq_dump.py
workqueue: set WQ_AFFN_CACHE_SHARD as the default affinity scope
workqueue: add WQ_AFFN_CACHE_SHARD affinity scope
workqueue: fix typo in WQ_AFFN_SMT comment
workqueue: Remove HK_TYPE_WQ from affecting wq_unbound_cpumask
workqueue: unlink pwqs from wq->pwqs list in alloc_and_link_pwqs() error path
workqueue: Remove NULL wq WARN in __queue_delayed_work()
workqueue: fix parse_affn_scope() prefix matching bug
workqueue: devres: Add device-managed allocate workqueue
workqueue: Add system_dfl_long_wq for long unbound works
tools/workqueue/wq_dump.py: add NODE prefix to all node columns
tools/workqueue/wq_dump.py: fix column alignment in node_nr/max_active section
tools/workqueue/wq_dump.py: remove backslash separator from node_nr/max_active header
efi: Allow to expose the workqueue via sysfs
workqueue: Allow to expose ordered workqueues via sysfs
...
Diffstat (limited to 'lib')
| -rw-r--r-- | lib/Kconfig.debug | 10 | ||||
| -rw-r--r-- | lib/Makefile | 1 | ||||
| -rw-r--r-- | lib/test_workqueue.c | 294 |
3 files changed, 305 insertions, 0 deletions
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index aac60b6cfa4b..77c3774c1c49 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2636,6 +2636,16 @@ config TEST_VMALLOC If unsure, say N. +config TEST_WORKQUEUE + tristate "Test module for stress/performance analysis of workqueue" + default n + help + This builds the "test_workqueue" module for benchmarking + workqueue throughput under contention. Useful for evaluating + affinity scope changes (e.g., cache_shard vs cache). + + If unsure, say N. + config TEST_BPF tristate "Test BPF filter functionality" depends on m && NET diff --git a/lib/Makefile b/lib/Makefile index 1b9ee167517f..ea660cca04f4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -79,6 +79,7 @@ UBSAN_SANITIZE_test_ubsan.o := y obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o obj-$(CONFIG_TEST_LKM) += test_module.o obj-$(CONFIG_TEST_VMALLOC) += test_vmalloc.o +obj-$(CONFIG_TEST_WORKQUEUE) += test_workqueue.o obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_keys.o obj-$(CONFIG_TEST_STATIC_KEYS) += test_static_key_base.o diff --git a/lib/test_workqueue.c b/lib/test_workqueue.c new file mode 100644 index 000000000000..99e160bd5ad1 --- /dev/null +++ b/lib/test_workqueue.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Test module for stress and performance analysis of workqueue. + * + * Benchmarks queue_work() throughput on an unbound workqueue to measure + * pool->lock contention under different affinity scope configurations + * (e.g., cache vs cache_shard). + * + * The affinity scope is changed between runs via the workqueue's sysfs + * affinity_scope attribute (WQ_SYSFS). + * + * Copyright (c) 2026 Meta Platforms, Inc. and affiliates + * Copyright (c) 2026 Breno Leitao <leitao@debian.org> + * + */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/moduleparam.h> +#include <linux/completion.h> +#include <linux/atomic.h> +#include <linux/slab.h> +#include <linux/ktime.h> +#include <linux/cpumask.h> +#include <linux/sched.h> +#include <linux/sort.h> +#include <linux/fs.h> + +#define WQ_NAME "bench_wq" +#define SCOPE_PATH "/sys/bus/workqueue/devices/" WQ_NAME "/affinity_scope" + +static int nr_threads; +module_param(nr_threads, int, 0444); +MODULE_PARM_DESC(nr_threads, + "Number of threads to spawn (default: 0 = num_online_cpus())"); + +static int wq_items = 50000; +module_param(wq_items, int, 0444); +MODULE_PARM_DESC(wq_items, + "Number of work items each thread queues (default: 50000)"); + +static struct workqueue_struct *bench_wq; +static atomic_t threads_done; +static DECLARE_COMPLETION(start_comp); +static DECLARE_COMPLETION(all_done_comp); + +struct thread_ctx { + struct completion work_done; + struct work_struct work; + u64 *latencies; + int cpu; + int items; +}; + +static void bench_work_fn(struct work_struct *work) +{ + struct thread_ctx *ctx = container_of(work, struct thread_ctx, work); + + complete(&ctx->work_done); +} + +static int bench_kthread_fn(void *data) +{ + struct thread_ctx *ctx = data; + ktime_t t_start, t_end; + int i; + + /* Wait for all threads to be ready */ + wait_for_completion(&start_comp); + + if (kthread_should_stop()) + return 0; + + for (i = 0; i < ctx->items; i++) { + reinit_completion(&ctx->work_done); + INIT_WORK(&ctx->work, bench_work_fn); + + t_start = ktime_get(); + queue_work(bench_wq, &ctx->work); + t_end = ktime_get(); + + ctx->latencies[i] = ktime_to_ns(ktime_sub(t_end, t_start)); + wait_for_completion(&ctx->work_done); + } + + if (atomic_dec_and_test(&threads_done)) + complete(&all_done_comp); + + /* + * Wait for kthread_stop() so the module text isn't freed + * while we're still executing. + */ + while (!kthread_should_stop()) + schedule(); + + return 0; +} + +static int cmp_u64(const void *a, const void *b) +{ + u64 va = *(const u64 *)a; + u64 vb = *(const u64 *)b; + + if (va < vb) + return -1; + if (va > vb) + return 1; + return 0; +} + +static int __init set_affn_scope(const char *scope) +{ + struct file *f; + loff_t pos = 0; + ssize_t ret; + + f = filp_open(SCOPE_PATH, O_WRONLY, 0); + if (IS_ERR(f)) { + pr_err("test_workqueue: open %s failed: %ld\n", + SCOPE_PATH, PTR_ERR(f)); + return PTR_ERR(f); + } + + ret = kernel_write(f, scope, strlen(scope), &pos); + filp_close(f, NULL); + + if (ret < 0) { + pr_err("test_workqueue: write '%s' failed: %zd\n", scope, ret); + return ret; + } + + return 0; +} + +static int __init run_bench(int n_threads, const char *scope, const char *label) +{ + struct task_struct **tasks; + unsigned long total_items; + struct thread_ctx *ctxs; + u64 *all_latencies; + ktime_t start, end; + int cpu, i, j, ret; + s64 elapsed_us; + + ret = set_affn_scope(scope); + if (ret) + return ret; + + ctxs = kcalloc(n_threads, sizeof(*ctxs), GFP_KERNEL); + if (!ctxs) + return -ENOMEM; + + tasks = kcalloc(n_threads, sizeof(*tasks), GFP_KERNEL); + if (!tasks) { + kfree(ctxs); + return -ENOMEM; + } + + total_items = (unsigned long)n_threads * wq_items; + all_latencies = kvmalloc_array(total_items, sizeof(u64), GFP_KERNEL); + if (!all_latencies) { + kfree(tasks); + kfree(ctxs); + return -ENOMEM; + } + + /* Allocate per-thread latency arrays */ + for (i = 0; i < n_threads; i++) { + ctxs[i].latencies = kvmalloc_array(wq_items, sizeof(u64), + GFP_KERNEL); + if (!ctxs[i].latencies) { + while (--i >= 0) + kvfree(ctxs[i].latencies); + kvfree(all_latencies); + kfree(tasks); + kfree(ctxs); + return -ENOMEM; + } + } + + atomic_set(&threads_done, n_threads); + reinit_completion(&all_done_comp); + reinit_completion(&start_comp); + + /* Create kthreads, each bound to a different online CPU */ + i = 0; + for_each_online_cpu(cpu) { + if (i >= n_threads) + break; + + ctxs[i].cpu = cpu; + ctxs[i].items = wq_items; + init_completion(&ctxs[i].work_done); + + tasks[i] = kthread_create(bench_kthread_fn, &ctxs[i], + "wq_bench/%d", cpu); + if (IS_ERR(tasks[i])) { + ret = PTR_ERR(tasks[i]); + pr_err("test_workqueue: failed to create kthread %d: %d\n", + i, ret); + /* Unblock threads waiting on start_comp before stopping them */ + complete_all(&start_comp); + while (--i >= 0) + kthread_stop(tasks[i]); + goto out_free; + } + + kthread_bind(tasks[i], cpu); + wake_up_process(tasks[i]); + i++; + } + + /* Start timing and release all threads */ + start = ktime_get(); + complete_all(&start_comp); + + /* Wait for all threads to finish the benchmark */ + wait_for_completion(&all_done_comp); + + /* Drain any remaining work */ + flush_workqueue(bench_wq); + + /* Ensure all kthreads have fully exited before module memory is freed */ + for (i = 0; i < n_threads; i++) + kthread_stop(tasks[i]); + + end = ktime_get(); + elapsed_us = ktime_us_delta(end, start); + + /* Merge all per-thread latencies and sort for percentile calculation */ + j = 0; + for (i = 0; i < n_threads; i++) { + memcpy(&all_latencies[j], ctxs[i].latencies, + wq_items * sizeof(u64)); + j += wq_items; + } + + sort(all_latencies, total_items, sizeof(u64), cmp_u64, NULL); + + pr_info("test_workqueue: %-16s %llu items/sec\tp50=%llu\tp90=%llu\tp95=%llu ns\n", + label, + elapsed_us ? div_u64(total_items * 1000000ULL, elapsed_us) : 0, + all_latencies[total_items * 50 / 100], + all_latencies[total_items * 90 / 100], + all_latencies[total_items * 95 / 100]); + + ret = 0; +out_free: + for (i = 0; i < n_threads; i++) + kvfree(ctxs[i].latencies); + kvfree(all_latencies); + kfree(tasks); + kfree(ctxs); + + return ret; +} + +static const char * const bench_scopes[] = { + "cpu", "smt", "cache_shard", "cache", "numa", "system", +}; + +static int __init test_workqueue_init(void) +{ + int n_threads = min(nr_threads ?: num_online_cpus(), num_online_cpus()); + int i; + + if (wq_items <= 0) { + pr_err("test_workqueue: wq_items must be > 0\n"); + return -EINVAL; + } + + bench_wq = alloc_workqueue(WQ_NAME, WQ_UNBOUND | WQ_SYSFS, 0); + if (!bench_wq) + return -ENOMEM; + + pr_info("test_workqueue: running %d threads, %d items/thread\n", + n_threads, wq_items); + + for (i = 0; i < ARRAY_SIZE(bench_scopes); i++) + run_bench(n_threads, bench_scopes[i], bench_scopes[i]); + + destroy_workqueue(bench_wq); + + /* Return -EAGAIN so the module doesn't stay loaded after the benchmark */ + return -EAGAIN; +} + +module_init(test_workqueue_init); +MODULE_AUTHOR("Breno Leitao <leitao@debian.org>"); +MODULE_DESCRIPTION("Stress/performance benchmark for workqueue subsystem"); +MODULE_LICENSE("GPL"); |
