From 7f7dc377a3b2bbaa8cf8941587c228eab4bd82ec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:24 -1000 Subject: workqueue: Add tools/workqueue/wq_dump.py which prints out workqueue configuration Lack of visibility has always been a pain point for workqueues. While the recently added wq_monitor.py improved the situation, it's still difficult to understand what worker pools are active in the system, how workqueues map to them and why. The lack of visibility into how workqueues are configured is going to become more noticeable as workqueue improves locality awareness and provides more mechanisms to customize locality related behaviors. Now that the basic framework for more flexible locality support is in place, this is a good time to improve the situation. This patch adds tools/workqueues/wq_dump.py which prints out the topology configuration, worker pools and how workqueues are mapped to pools. Read the command's help message for more details. Signed-off-by: Tejun Heo --- tools/workqueue/wq_dump.py | 166 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 tools/workqueue/wq_dump.py (limited to 'tools') diff --git a/tools/workqueue/wq_dump.py b/tools/workqueue/wq_dump.py new file mode 100644 index 000000000000..ddd0bb4395ea --- /dev/null +++ b/tools/workqueue/wq_dump.py @@ -0,0 +1,166 @@ +#!/usr/bin/env drgn +# +# Copyright (C) 2023 Tejun Heo +# Copyright (C) 2023 Meta Platforms, Inc. and affiliates. + +desc = """ +This is a drgn script to show the current workqueue configuration. For more +info on drgn, visit https://github.com/osandov/drgn. + +Affinity Scopes +=============== + +Shows the CPUs that can be used for unbound workqueues and how they will be +grouped by each available affinity type. For each type: + + nr_pods number of CPU pods in the affinity type + pod_cpus CPUs in each pod + pod_node NUMA node for memory allocation for each pod + cpu_pod pod that each CPU is associated to + +Worker Pools +============ + +Lists all worker pools indexed by their ID. For each pool: + + ref number of pool_workqueue's associated with this pool + nice nice value of the worker threads in the pool + idle number of idle workers + workers number of all workers + cpu CPU the pool is associated with (per-cpu pool) + cpus CPUs the workers in the pool can run on (unbound pool) + +Workqueue CPU -> pool +===================== + +Lists all workqueues along with their type and worker pool association. For +each workqueue: + + NAME TYPE POOL_ID... + + NAME name of the workqueue + TYPE percpu, unbound or ordered + POOL_ID worker pool ID associated with each possible CPU +""" + +import sys + +import drgn +from drgn.helpers.linux.list import list_for_each_entry,list_empty +from drgn.helpers.linux.percpu import per_cpu_ptr +from drgn.helpers.linux.cpumask import for_each_cpu,for_each_possible_cpu +from drgn.helpers.linux.idr import idr_for_each + +import argparse +parser = argparse.ArgumentParser(description=desc, + formatter_class=argparse.RawTextHelpFormatter) +args = parser.parse_args() + +def err(s): + print(s, file=sys.stderr, flush=True) + sys.exit(1) + +def cpumask_str(cpumask): + output = "" + base = 0 + v = 0 + for cpu in for_each_cpu(cpumask[0]): + while cpu - base >= 32: + output += f'{hex(v)} ' + base += 32 + v = 0 + v |= 1 << (cpu - base) + if v > 0: + output += f'{v:08x}' + return output.strip() + +worker_pool_idr = prog['worker_pool_idr'] +workqueues = prog['workqueues'] +wq_unbound_cpumask = prog['wq_unbound_cpumask'] +wq_pod_types = prog['wq_pod_types'] + +WQ_UNBOUND = prog['WQ_UNBOUND'] +WQ_ORDERED = prog['__WQ_ORDERED'] +WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM'] + +WQ_AFFN_NUMA = prog['WQ_AFFN_NUMA'] +WQ_AFFN_SYSTEM = prog['WQ_AFFN_SYSTEM'] + +print('Affinity Scopes') +print('===============') + +print(f'wq_unbound_cpumask={cpumask_str(wq_unbound_cpumask)}') + +def print_pod_type(pt): + print(f' nr_pods {pt.nr_pods.value_()}') + + print(' pod_cpus', end='') + for pod in range(pt.nr_pods): + print(f' [{pod}]={cpumask_str(pt.pod_cpus[pod])}', end='') + print('') + + print(' pod_node', end='') + for pod in range(pt.nr_pods): + print(f' [{pod}]={pt.pod_node[pod].value_()}', end='') + print('') + + print(f' cpu_pod ', end='') + for cpu in for_each_possible_cpu(prog): + print(f' [{cpu}]={pt.cpu_pod[cpu].value_()}', end='') + print('') + +print('') +print('NUMA') +print_pod_type(wq_pod_types[WQ_AFFN_NUMA]) +print('') +print('SYSTEM') +print_pod_type(wq_pod_types[WQ_AFFN_SYSTEM]) + +print('') +print('Worker Pools') +print('============') + +max_pool_id_len = 0 +max_ref_len = 0 +for pi, pool in idr_for_each(worker_pool_idr): + pool = drgn.Object(prog, 'struct worker_pool', address=pool) + max_pool_id_len = max(max_pool_id_len, len(f'{pi}')) + max_ref_len = max(max_ref_len, len(f'{pool.refcnt.value_()}')) + +for pi, pool in idr_for_each(worker_pool_idr): + pool = drgn.Object(prog, 'struct worker_pool', address=pool) + print(f'pool[{pi:0{max_pool_id_len}}] ref={pool.refcnt.value_():{max_ref_len}} nice={pool.attrs.nice.value_():3} ', end='') + print(f'idle/workers={pool.nr_idle.value_():3}/{pool.nr_workers.value_():3} ', end='') + if pool.cpu >= 0: + print(f'cpu={pool.cpu.value_():3}', end='') + else: + print(f'cpus={cpumask_str(pool.attrs.cpumask)}', end='') + print('') + +print('') +print('Workqueue CPU -> pool') +print('=====================') + +print('[ workqueue \ CPU ', end='') +for cpu in for_each_possible_cpu(prog): + print(f' {cpu:{max_pool_id_len}}', end='') +print(' dfl]') + +for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_(), 'list'): + print(f'{wq.name.string_().decode()[-24:]:24}', end='') + if wq.flags & WQ_UNBOUND: + if wq.flags & WQ_ORDERED: + print(' ordered', end='') + else: + print(' unbound', end='') + else: + print(' percpu ', end='') + + for cpu in for_each_possible_cpu(prog): + pool_id = per_cpu_ptr(wq.cpu_pwq, cpu)[0].pool.id.value_() + field_len = max(len(str(cpu)), max_pool_id_len) + print(f' {pool_id:{field_len}}', end='') + + if wq.flags & WQ_UNBOUND: + print(f' {wq.dfl_pwq.pool.id.value_():{max_pool_id_len}}', end='') + print('') -- cgit v1.2.3 From 63c5484e74952f60f5810256bd69814d167b8d22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:24 -1000 Subject: workqueue: Add multiple affinity scopes and interface to select them Add three more affinity scopes - WQ_AFFN_CPU, SMT and CACHE - and make CACHE the default. The code changes to actually add the additional scopes are trivial. Also add module parameter "workqueue.default_affinity_scope" to override the default scope and "affinity_scope" sysfs file to configure it per workqueue. wq_dump.py and documentations are updated accordingly. This enables significant flexibility in configuring how unbound workqueues behave. If affinity scope is set to "cpu", it'll behave close to a per-cpu workqueue. On the other hand, "system" removes all locality boundaries. Many modern machines have multiple L3 caches often while being mostly uniform in terms of memory access. Thus, workqueue's previous behavior of spreading work items in each NUMA node had negative performance implications from unncessarily crossing L3 boundaries between issue and execution. However, picking a finer grained affinity scope also has a downside in that an issuer in one group can't utilize CPUs in other groups. While dependent on the specifics of workload, there's usually a noticeable penalty in crossing L3 boundaries, so let's default to CACHE. This issue will be further addressed and documented with examples in future patches. Signed-off-by: Tejun Heo --- Documentation/admin-guide/kernel-parameters.txt | 12 +++ Documentation/core-api/workqueue.rst | 63 ++++++++++++++ include/linux/workqueue.h | 5 +- kernel/workqueue.c | 110 ++++++++++++++++++++++-- tools/workqueue/wq_dump.py | 15 ++-- 5 files changed, 193 insertions(+), 12 deletions(-) (limited to 'tools') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b89cbc39713..732c5c7e3fa5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7007,6 +7007,18 @@ The default value of this parameter is determined by the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT. + workqueue.default_affinity_scope= + Select the default affinity scope to use for unbound + workqueues. Can be one of "cpu", "smt", "cache", + "numa" and "system". Default is "cache". For more + information, see the Affinity Scopes section in + Documentation/core-api/workqueue.rst. + + This can be updated after boot through the matching + file under /sys/module/workqueue/parameters. + However, the changed default will only apply to + unbound workqueues created afterwards. + workqueue.debug_force_rr_cpu Workqueue used to implicitly guarantee that work items queued without explicit CPU specified are put diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index c9e46acd339b..56af317508c9 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -347,6 +347,51 @@ Guidelines level of locality in wq operations and work item execution. +Affinity Scopes +=============== + +An unbound workqueue groups CPUs according to its affinity scope to improve +cache locality. For example, if a workqueue is using the default affinity +scope of "cache", it will group CPUs according to last level cache +boundaries. A work item queued on the workqueue will be processed by a +worker running on one of the CPUs which share the last level cache with the +issuing CPU. + +Workqueue currently supports the following five affinity scopes. + +``cpu`` + CPUs are not grouped. A work item issued on one CPU is processed by a + worker on the same CPU. This makes unbound workqueues behave as per-cpu + workqueues without concurrency management. + +``smt`` + CPUs are grouped according to SMT boundaries. This usually means that the + logical threads of each physical CPU core are grouped together. + +``cache`` + CPUs are grouped according to cache boundaries. Which specific cache + boundary is used is determined by the arch code. L3 is used in a lot of + cases. This is the default affinity scope. + +``numa`` + CPUs are grouped according to NUMA bounaries. + +``system`` + All CPUs are put in the same group. Workqueue makes no effort to process a + work item on a CPU close to the issuing CPU. + +The default affinity scope can be changed with the module parameter +``workqueue.default_affinity_scope`` and a specific workqueue's affinity +scope can be changed using ``apply_workqueue_attrs()``. + +If ``WQ_SYSFS`` is set, the workqueue will have the following affinity scope +related interface files under its ``/sys/devices/virtual/WQ_NAME/`` +directory. + +``affinity_scope`` + Read to see the current affinity scope. Write to change. + + Examining Configuration ======================= @@ -358,6 +403,24 @@ configuration, worker pools and how workqueues map to the pools: :: =============== wq_unbound_cpumask=0000000f + CPU + nr_pods 4 + pod_cpus [0]=00000001 [1]=00000002 [2]=00000004 [3]=00000008 + pod_node [0]=0 [1]=0 [2]=1 [3]=1 + cpu_pod [0]=0 [1]=1 [2]=2 [3]=3 + + SMT + nr_pods 4 + pod_cpus [0]=00000001 [1]=00000002 [2]=00000004 [3]=00000008 + pod_node [0]=0 [1]=0 [2]=1 [3]=1 + cpu_pod [0]=0 [1]=1 [2]=2 [3]=3 + + CACHE (default) + nr_pods 2 + pod_cpus [0]=00000003 [1]=0000000c + pod_node [0]=0 [1]=1 + cpu_pod [0]=0 [1]=0 [2]=1 [3]=1 + NUMA nr_pods 2 pod_cpus [0]=00000003 [1]=0000000c diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 180491ee6706..568cfbc24bc0 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -126,12 +126,15 @@ struct rcu_work { }; enum wq_affn_scope { + WQ_AFFN_CPU, /* one pod per CPU */ + WQ_AFFN_SMT, /* one pod poer SMT */ + WQ_AFFN_CACHE, /* one pod per LLC */ WQ_AFFN_NUMA, /* one pod per NUMA node */ WQ_AFFN_SYSTEM, /* one pod across the whole system */ WQ_AFFN_NR_TYPES, - WQ_AFFN_DFL = WQ_AFFN_NUMA, + WQ_AFFN_DFL = WQ_AFFN_CACHE, }; /** diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a2cc0432abd5..8e3a499c3f00 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -338,6 +338,15 @@ struct wq_pod_type { }; static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; +static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_DFL; + +static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { + [WQ_AFFN_CPU] = "cpu", + [WQ_AFFN_SMT] = "smt", + [WQ_AFFN_CACHE] = "cache", + [WQ_AFFN_NUMA] = "numa", + [WQ_AFFN_SYSTEM] = "system", +}; /* * Per-cpu work items which run for longer than the following threshold are @@ -3664,7 +3673,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void) goto fail; cpumask_copy(attrs->cpumask, cpu_possible_mask); - attrs->affn_scope = WQ_AFFN_DFL; + attrs->affn_scope = wq_affn_dfl; return attrs; fail: free_workqueue_attrs(attrs); @@ -5777,19 +5786,55 @@ out_unlock: return ret; } +static int parse_affn_scope(const char *val) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(wq_affn_names); i++) { + if (!strncasecmp(val, wq_affn_names[i], strlen(wq_affn_names[i]))) + return i; + } + return -EINVAL; +} + +static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp) +{ + int affn; + + affn = parse_affn_scope(val); + if (affn < 0) + return affn; + + wq_affn_dfl = affn; + return 0; +} + +static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp) +{ + return scnprintf(buffer, PAGE_SIZE, "%s\n", wq_affn_names[wq_affn_dfl]); +} + +static const struct kernel_param_ops wq_affn_dfl_ops = { + .set = wq_affn_dfl_set, + .get = wq_affn_dfl_get, +}; + +module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); + #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the * following attributes. * - * per_cpu RO bool : whether the workqueue is per-cpu or unbound - * max_active RW int : maximum number of in-flight work items + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items * * Unbound workqueues have the following extra attributes. * - * nice RW int : nice value of the workers - * cpumask RW mask : bitmask of allowed CPUs for the workers + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) */ struct wq_device { struct workqueue_struct *wq; @@ -5932,9 +5977,47 @@ out_unlock: return ret ?: count; } +static ssize_t wq_affn_scope_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%s\n", + wq_affn_names[wq->unbound_attrs->affn_scope]); + mutex_unlock(&wq->mutex); + + return written; +} + +static ssize_t wq_affn_scope_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int affn, ret = -ENOMEM; + + affn = parse_affn_scope(buf); + if (affn < 0) + return affn; + + apply_wqattrs_lock(); + attrs = wq_sysfs_prep_attrs(wq); + if (attrs) { + attrs->affn_scope = affn; + ret = apply_workqueue_attrs_locked(wq, attrs); + } + apply_wqattrs_unlock(); + free_workqueue_attrs(attrs); + return ret ?: count; +} + static struct device_attribute wq_sysfs_unbound_attrs[] = { __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), __ATTR_NULL, }; @@ -6537,6 +6620,20 @@ static void __init init_pod_type(struct wq_pod_type *pt, } } +static bool __init cpus_dont_share(int cpu0, int cpu1) +{ + return false; +} + +static bool __init cpus_share_smt(int cpu0, int cpu1) +{ +#ifdef CONFIG_SCHED_SMT + return cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1)); +#else + return false; +#endif +} + static bool __init cpus_share_numa(int cpu0, int cpu1) { return cpu_to_node(cpu0) == cpu_to_node(cpu1); @@ -6554,6 +6651,9 @@ void __init workqueue_init_topology(void) struct workqueue_struct *wq; int cpu; + init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share); + init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt); + init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache); init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa); mutex_lock(&wq_pool_mutex); diff --git a/tools/workqueue/wq_dump.py b/tools/workqueue/wq_dump.py index ddd0bb4395ea..43ab71a193b8 100644 --- a/tools/workqueue/wq_dump.py +++ b/tools/workqueue/wq_dump.py @@ -78,11 +78,16 @@ worker_pool_idr = prog['worker_pool_idr'] workqueues = prog['workqueues'] wq_unbound_cpumask = prog['wq_unbound_cpumask'] wq_pod_types = prog['wq_pod_types'] +wq_affn_dfl = prog['wq_affn_dfl'] +wq_affn_names = prog['wq_affn_names'] WQ_UNBOUND = prog['WQ_UNBOUND'] WQ_ORDERED = prog['__WQ_ORDERED'] WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM'] +WQ_AFFN_CPU = prog['WQ_AFFN_CPU'] +WQ_AFFN_SMT = prog['WQ_AFFN_SMT'] +WQ_AFFN_CACHE = prog['WQ_AFFN_CACHE'] WQ_AFFN_NUMA = prog['WQ_AFFN_NUMA'] WQ_AFFN_SYSTEM = prog['WQ_AFFN_SYSTEM'] @@ -109,12 +114,10 @@ def print_pod_type(pt): print(f' [{cpu}]={pt.cpu_pod[cpu].value_()}', end='') print('') -print('') -print('NUMA') -print_pod_type(wq_pod_types[WQ_AFFN_NUMA]) -print('') -print('SYSTEM') -print_pod_type(wq_pod_types[WQ_AFFN_SYSTEM]) +for affn in [WQ_AFFN_CPU, WQ_AFFN_SMT, WQ_AFFN_CACHE, WQ_AFFN_NUMA, WQ_AFFN_SYSTEM]: + print('') + print(f'{wq_affn_names[affn].string_().decode().upper()}{" (default)" if affn == wq_affn_dfl else ""}') + print_pod_type(wq_pod_types[affn]) print('') print('Worker Pools') -- cgit v1.2.3 From 8639ecebc9b1796d7074751a350462f5e1c61cd4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:25 -1000 Subject: workqueue: Implement non-strict affinity scope for unbound workqueues An unbound workqueue can be served by multiple worker_pools to improve locality. The segmentation is achieved by grouping CPUs into pods. By default, the cache boundaries according to cpus_share_cache() define the CPUs are grouped. Let's a workqueue is allowed to run on all CPUs and the system has two L3 caches. The workqueue would be mapped to two worker_pools each serving one L3 cache domains. While this improves locality, because the pod boundaries are strict, it limits the total bandwidth a given issuer can consume. For example, let's say there is a thread pinned to a CPU issuing enough work items to saturate the whole machine. With the machine segmented into two pods, no matter how many work items it issues, it can only use half of the CPUs on the system. While this limitation has existed for a very long time, it wasn't very pronounced because the affinity grouping used to be always by NUMA nodes. With cache boundaries as the default and support for even finer grained scopes (smt and cpu), it is now an a lot more pressing problem. This patch implements non-strict affinity scope where the pod boundaries aren't enforced strictly. Going back to the previous example, the workqueue would still be mapped to two worker_pools; however, the affinity enforcement would be soft. The workers in both pools would have their cpus_allowed set to the whole machine thus allowing the scheduler to migrate them anywhere on the machine. However, whenever an idle worker is woken up, the workqueue code asks the scheduler to bring back the task within the pod if the worker is outside. ie. work items start executing within its affinity scope but can be migrated outside as the scheduler sees fit. This removes the hard cap on utilization while maintaining the benefits of affinity scopes. After the earlier ->__pod_cpumask changes, the implementation is pretty simple. When non-strict which is the new default: * pool_allowed_cpus() returns @pool->attrs->cpumask instead of ->__pod_cpumask so that the workers are allowed to run on any CPU that the associated workqueues allow. * If the idle worker task's ->wake_cpu is outside the pod, kick_pool() sets the field to a CPU within the pod. This would be the first use of task_struct->wake_cpu outside scheduler proper, so it isn't clear whether this would be acceptable. However, other methods of migrating tasks are significantly more expensive and are likely prohibitively so if we want to do this on every work item. This needs discussion with scheduler folks. There is also a race window where setting ->wake_cpu wouldn't be effective as the target task is still on CPU. However, the window is pretty small and this being a best-effort optimization, it doesn't seem to warrant more complexity at the moment. While the non-strict cache affinity scopes seem to be the best option, the performance picture interacts with the affinity scope and is a bit complicated to fully discuss in this patch, so the behavior is made easily selectable through wqattrs and sysfs and the next patch will add documentation to discuss performance implications. v2: pool->attrs->affn_strict is set to true for per-cpu worker_pools. Signed-off-by: Tejun Heo Cc: Peter Zijlstra Cc: Linus Torvalds --- Documentation/core-api/workqueue.rst | 30 +++++++++++---- include/linux/workqueue.h | 11 ++++++ kernel/workqueue.c | 74 +++++++++++++++++++++++++++++++++++- tools/workqueue/wq_dump.py | 16 ++++++-- tools/workqueue/wq_monitor.py | 21 ++++++---- 5 files changed, 132 insertions(+), 20 deletions(-) (limited to 'tools') diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index 56af317508c9..c73a6df6a118 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -353,9 +353,10 @@ Affinity Scopes An unbound workqueue groups CPUs according to its affinity scope to improve cache locality. For example, if a workqueue is using the default affinity scope of "cache", it will group CPUs according to last level cache -boundaries. A work item queued on the workqueue will be processed by a -worker running on one of the CPUs which share the last level cache with the -issuing CPU. +boundaries. A work item queued on the workqueue will be assigned to a worker +on one of the CPUs which share the last level cache with the issuing CPU. +Once started, the worker may or may not be allowed to move outside the scope +depending on the ``affinity_strict`` setting of the scope. Workqueue currently supports the following five affinity scopes. @@ -391,6 +392,21 @@ directory. ``affinity_scope`` Read to see the current affinity scope. Write to change. +``affinity_strict`` + 0 by default indicating that affinity scopes are not strict. When a work + item starts execution, workqueue makes a best-effort attempt to ensure + that the worker is inside its affinity scope, which is called + repatriation. Once started, the scheduler is free to move the worker + anywhere in the system as it sees fit. This enables benefiting from scope + locality while still being able to utilize other CPUs if necessary and + available. + + If set to 1, all workers of the scope are guaranteed always to be in the + scope. This may be useful when crossing affinity scopes has other + implications, for example, in terms of power consumption or workload + isolation. Strict NUMA scope can also be used to match the workqueue + behavior of older kernels. + Examining Configuration ======================= @@ -475,21 +491,21 @@ Monitoring Use tools/workqueue/wq_monitor.py to monitor workqueue operations: :: $ tools/workqueue/wq_monitor.py events - total infl CPUtime CPUhog CMwake mayday rescued + total infl CPUtime CPUhog CMW/RPR mayday rescued events 18545 0 6.1 0 5 - - events_highpri 8 0 0.0 0 0 - - events_long 3 0 0.0 0 0 - - - events_unbound 38306 0 0.1 - - - - + events_unbound 38306 0 0.1 - 7 - - events_freezable 0 0 0.0 0 0 - - events_power_efficient 29598 0 0.2 0 0 - - events_freezable_power_ 10 0 0.0 0 0 - - sock_diag_events 0 0 0.0 0 0 - - - total infl CPUtime CPUhog CMwake mayday rescued + total infl CPUtime CPUhog CMW/RPR mayday rescued events 18548 0 6.1 0 5 - - events_highpri 8 0 0.0 0 0 - - events_long 3 0 0.0 0 0 - - - events_unbound 38322 0 0.1 - - - - + events_unbound 38322 0 0.1 - 7 - - events_freezable 0 0 0.0 0 0 - - events_power_efficient 29603 0 0.2 0 0 - - events_freezable_power_ 10 0 0.0 0 0 - - diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index fe53976e088e..0c1cad38f9db 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -169,6 +169,17 @@ struct workqueue_attrs { */ cpumask_var_t __pod_cpumask; + /** + * @affn_strict: affinity scope is strict + * + * If clear, workqueue will make a best-effort attempt at starting the + * worker inside @__pod_cpumask but the scheduler is free to migrate it + * outside. + * + * If set, workers are only allowed to run inside @__pod_cpumask. + */ + bool affn_strict; + /* * Below fields aren't properties of a worker_pool. They only modify how * :c:func:`apply_workqueue_attrs` select pools and thus don't diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e61b4291bec8..6f6f4f37ceb3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -211,6 +211,7 @@ enum pool_workqueue_stats { PWQ_STAT_CPU_TIME, /* total CPU time consumed */ PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ + PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */ PWQ_STAT_MAYDAY, /* maydays to rescuer */ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ @@ -1103,13 +1104,41 @@ static bool assign_work(struct work_struct *work, struct worker *worker, static bool kick_pool(struct worker_pool *pool) { struct worker *worker = first_idle_worker(pool); + struct task_struct *p; lockdep_assert_held(&pool->lock); if (!need_more_worker(pool) || !worker) return false; - wake_up_process(worker->task); + p = worker->task; + +#ifdef CONFIG_SMP + /* + * Idle @worker is about to execute @work and waking up provides an + * opportunity to migrate @worker at a lower cost by setting the task's + * wake_cpu field. Let's see if we want to move @worker to improve + * execution locality. + * + * We're waking the worker that went idle the latest and there's some + * chance that @worker is marked idle but hasn't gone off CPU yet. If + * so, setting the wake_cpu won't do anything. As this is a best-effort + * optimization and the race window is narrow, let's leave as-is for + * now. If this becomes pronounced, we can skip over workers which are + * still on cpu when picking an idle worker. + * + * If @pool has non-strict affinity, @worker might have ended up outside + * its affinity scope. Repatriate. + */ + if (!pool->attrs->affn_strict && + !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) { + struct work_struct *work = list_first_entry(&pool->worklist, + struct work_struct, entry); + p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask); + get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++; + } +#endif + wake_up_process(p); return true; } @@ -2051,7 +2080,10 @@ static struct worker *alloc_worker(int node) static cpumask_t *pool_allowed_cpus(struct worker_pool *pool) { - return pool->attrs->__pod_cpumask; + if (pool->cpu < 0 && pool->attrs->affn_strict) + return pool->attrs->__pod_cpumask; + else + return pool->attrs->cpumask; } /** @@ -3715,6 +3747,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, to->nice = from->nice; cpumask_copy(to->cpumask, from->cpumask); cpumask_copy(to->__pod_cpumask, from->__pod_cpumask); + to->affn_strict = from->affn_strict; /* * Unlike hash and equality test, copying shouldn't ignore wq-only @@ -3745,6 +3778,7 @@ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); hash = jhash(cpumask_bits(attrs->__pod_cpumask), BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + hash = jhash_1word(attrs->affn_strict, hash); return hash; } @@ -3758,6 +3792,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, return false; if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask)) return false; + if (a->affn_strict != b->affn_strict) + return false; return true; } @@ -5847,6 +5883,7 @@ module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644); * nice RW int : nice value of the workers * cpumask RW mask : bitmask of allowed CPUs for the workers * affinity_scope RW str : worker CPU affinity scope (cache, numa, none) + * affinity_strict RW bool : worker CPU affinity is strict */ struct wq_device { struct workqueue_struct *wq; @@ -6026,10 +6063,42 @@ static ssize_t wq_affn_scope_store(struct device *dev, return ret ?: count; } +static ssize_t wq_affinity_strict_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", + wq->unbound_attrs->affn_strict); +} + +static ssize_t wq_affinity_strict_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret = -ENOMEM; + + if (sscanf(buf, "%d", &v) != 1) + return -EINVAL; + + apply_wqattrs_lock(); + attrs = wq_sysfs_prep_attrs(wq); + if (attrs) { + attrs->affn_strict = (bool)v; + ret = apply_workqueue_attrs_locked(wq, attrs); + } + apply_wqattrs_unlock(); + free_workqueue_attrs(attrs); + return ret ?: count; +} + static struct device_attribute wq_sysfs_unbound_attrs[] = { __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store), + __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store), __ATTR_NULL, }; @@ -6452,6 +6521,7 @@ void __init workqueue_init_early(void) cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); cpumask_copy(pool->attrs->__pod_cpumask, cpumask_of(cpu)); pool->attrs->nice = std_nice[i++]; + pool->attrs->affn_strict = true; pool->node = cpu_to_node(cpu); /* alloc pool ID */ diff --git a/tools/workqueue/wq_dump.py b/tools/workqueue/wq_dump.py index 43ab71a193b8..d0df5833f2c1 100644 --- a/tools/workqueue/wq_dump.py +++ b/tools/workqueue/wq_dump.py @@ -36,10 +36,11 @@ Workqueue CPU -> pool Lists all workqueues along with their type and worker pool association. For each workqueue: - NAME TYPE POOL_ID... + NAME TYPE[,FLAGS] POOL_ID... NAME name of the workqueue TYPE percpu, unbound or ordered + FLAGS S: strict affinity scope POOL_ID worker pool ID associated with each possible CPU """ @@ -138,13 +139,16 @@ for pi, pool in idr_for_each(worker_pool_idr): print(f'cpu={pool.cpu.value_():3}', end='') else: print(f'cpus={cpumask_str(pool.attrs.cpumask)}', end='') + print(f' pod_cpus={cpumask_str(pool.attrs.__pod_cpumask)}', end='') + if pool.attrs.affn_strict: + print(' strict', end='') print('') print('') print('Workqueue CPU -> pool') print('=====================') -print('[ workqueue \ CPU ', end='') +print('[ workqueue \ type CPU', end='') for cpu in for_each_possible_cpu(prog): print(f' {cpu:{max_pool_id_len}}', end='') print(' dfl]') @@ -153,11 +157,15 @@ for wq in list_for_each_entry('struct workqueue_struct', workqueues.address_of_( print(f'{wq.name.string_().decode()[-24:]:24}', end='') if wq.flags & WQ_UNBOUND: if wq.flags & WQ_ORDERED: - print(' ordered', end='') + print(' ordered ', end='') else: print(' unbound', end='') + if wq.unbound_attrs.affn_strict: + print(',S ', end='') + else: + print(' ', end='') else: - print(' percpu ', end='') + print(' percpu ', end='') for cpu in for_each_possible_cpu(prog): pool_id = per_cpu_ptr(wq.cpu_pwq, cpu)[0].pool.id.value_() diff --git a/tools/workqueue/wq_monitor.py b/tools/workqueue/wq_monitor.py index 6e258d123e8c..a8856a9c45dc 100644 --- a/tools/workqueue/wq_monitor.py +++ b/tools/workqueue/wq_monitor.py @@ -20,8 +20,11 @@ https://github.com/osandov/drgn. and got excluded from concurrency management to avoid stalling other work items. - CMwake The number of concurrency-management wake-ups while executing a - work item of the workqueue. + CMW/RPR For per-cpu workqueues, the number of concurrency-management + wake-ups while executing a work item of the workqueue. For + unbound workqueues, the number of times a worker was repatriated + to its affinity scope after being migrated to an off-scope CPU by + the scheduler. mayday The number of times the rescuer was requested while waiting for new worker creation. @@ -65,6 +68,7 @@ PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED'] # work items completed exec PWQ_STAT_CPU_TIME = prog['PWQ_STAT_CPU_TIME'] # total CPU time consumed PWQ_STAT_CPU_INTENSIVE = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP'] # concurrency-management worker wakeups +PWQ_STAT_REPATRIATED = prog['PWQ_STAT_REPATRIATED'] # unbound workers brought back into scope PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY'] # maydays to rescuer PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED'] # linked work items executed by rescuer PWQ_NR_STATS = prog['PWQ_NR_STATS'] @@ -89,22 +93,25 @@ class WqStats: 'cpu_time' : self.stats[PWQ_STAT_CPU_TIME], 'cpu_intensive' : self.stats[PWQ_STAT_CPU_INTENSIVE], 'cm_wakeup' : self.stats[PWQ_STAT_CM_WAKEUP], + 'repatriated' : self.stats[PWQ_STAT_REPATRIATED], 'mayday' : self.stats[PWQ_STAT_MAYDAY], 'rescued' : self.stats[PWQ_STAT_RESCUED], } def table_header_str(): return f'{"":>24} {"total":>8} {"infl":>5} {"CPUtime":>8} '\ - f'{"CPUitsv":>7} {"CMwake":>7} {"mayday":>7} {"rescued":>7}' + f'{"CPUitsv":>7} {"CMW/RPR":>7} {"mayday":>7} {"rescued":>7}' def table_row_str(self): cpu_intensive = '-' - cm_wakeup = '-' + cmw_rpr = '-' mayday = '-' rescued = '-' - if not self.unbound: + if self.unbound: + cmw_rpr = str(self.stats[PWQ_STAT_REPATRIATED]); + else: cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE]) - cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP]) + cmw_rpr = str(self.stats[PWQ_STAT_CM_WAKEUP]) if self.mem_reclaim: mayday = str(self.stats[PWQ_STAT_MAYDAY]) @@ -115,7 +122,7 @@ class WqStats: f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \ f'{self.stats[PWQ_STAT_CPU_TIME] / 1000000:8.1f} ' \ f'{cpu_intensive:>7} ' \ - f'{cm_wakeup:>7} ' \ + f'{cmw_rpr:>7} ' \ f'{mayday:>7} ' \ f'{rescued:>7} ' return out.rstrip(':') -- cgit v1.2.3