summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-12-29 10:57:20 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2018-12-29 10:57:20 -0800
commit6f9d71c9c759b1e7d31189a4de228983192c7dc7 (patch)
treeb8372cc73cbac4701c85ca1d441c4ba8a53b4c1f
parent55db91fbaad9ea769d516e6867195808b4399894 (diff)
parent3fc9c12d27b4ded4f1f761a800558dab2e6bbac5 (diff)
downloadlwn-6f9d71c9c759b1e7d31189a4de228983192c7dc7.tar.gz
lwn-6f9d71c9c759b1e7d31189a4de228983192c7dc7.zip
Merge branch 'for-4.21' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: - Waiman's cgroup2 cpuset support has been finally merged closing one of the last remaining feature gaps. - cgroup.procs could show non-leader threads when cgroup2 threaded mode was used in certain ways. I forgot to push the fix during the last cycle. - A patch to fix mount option parsing when all mount options have been consumed by someone else (LSM). - cgroup_no_v1 boot param can now block named cgroup1 hierarchies too. * 'for-4.21' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: cgroup: Add named hierarchy disabling to cgroup_no_v1 boot param cgroup: fix parsing empty mount option string cpuset: Remove set but not used variable 'cs' cgroup: fix CSS_TASK_ITER_PROCS cgroup: Add .__DEBUG__. prefix to debug file names cpuset: Minor cgroup2 interface updates cpuset: Expose cpuset.cpus.subpartitions with cgroup_debug cpuset: Add documentation about the new "cpuset.sched.partition" flag cpuset: Use descriptive text when reading/writing cpuset.sched.partition cpuset: Expose cpus.effective and mems.effective on cgroup v2 root cpuset: Make generate_sched_domains() work with partition cpuset: Make CPU hotplug work with partition cpuset: Track cpusets that use parent's effective_cpus cpuset: Add an error state to cpuset.sched.partition cpuset: Add new v2 cpuset.sched.partition flag cpuset: Simply allocation and freeing of cpumasks cpuset: Define data structures to support scheduling partition cpuset: Enable cpuset controller in default hierarchy cgroup: remove unnecessary unlikely()
-rw-r--r--Documentation/admin-guide/cgroup-v2.rst182
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt8
-rw-r--r--include/linux/cgroup-defs.h1
-rw-r--r--kernel/cgroup/cgroup-internal.h2
-rw-r--r--kernel/cgroup/cgroup-v1.c14
-rw-r--r--kernel/cgroup/cgroup.c60
-rw-r--r--kernel/cgroup/cpuset.c944
-rw-r--r--kernel/cgroup/debug.c4
8 files changed, 1115 insertions, 100 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index baf19bf28385..7bf3f129c68b 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -56,11 +56,13 @@ v1 is available under Documentation/cgroup-v1/.
5-3-3-2. IO Latency Interface Files
5-4. PID
5-4-1. PID Interface Files
- 5-5. Device
- 5-6. RDMA
- 5-6-1. RDMA Interface Files
- 5-7. Misc
- 5-7-1. perf_event
+ 5-5. Cpuset
+ 5.5-1. Cpuset Interface Files
+ 5-6. Device
+ 5-7. RDMA
+ 5-7-1. RDMA Interface Files
+ 5-8. Misc
+ 5-8-1. perf_event
5-N. Non-normative information
5-N-1. CPU controller root cgroup process behaviour
5-N-2. IO controller root cgroup process behaviour
@@ -1610,6 +1612,176 @@ through fork() or clone(). These will return -EAGAIN if the creation
of a new process would cause a cgroup policy to be violated.
+Cpuset
+------
+
+The "cpuset" controller provides a mechanism for constraining
+the CPU and memory node placement of tasks to only the resources
+specified in the cpuset interface files in a task's current cgroup.
+This is especially valuable on large NUMA systems where placing jobs
+on properly sized subsets of the systems with careful processor and
+memory placement to reduce cross-node memory access and contention
+can improve overall system performance.
+
+The "cpuset" controller is hierarchical. That means the controller
+cannot use CPUs or memory nodes not allowed in its parent.
+
+
+Cpuset Interface Files
+~~~~~~~~~~~~~~~~~~~~~~
+
+ cpuset.cpus
+ A read-write multiple values file which exists on non-root
+ cpuset-enabled cgroups.
+
+ It lists the requested CPUs to be used by tasks within this
+ cgroup. The actual list of CPUs to be granted, however, is
+ subjected to constraints imposed by its parent and can differ
+ from the requested CPUs.
+
+ The CPU numbers are comma-separated numbers or ranges.
+ For example:
+
+ # cat cpuset.cpus
+ 0-4,6,8-10
+
+ An empty value indicates that the cgroup is using the same
+ setting as the nearest cgroup ancestor with a non-empty
+ "cpuset.cpus" or all the available CPUs if none is found.
+
+ The value of "cpuset.cpus" stays constant until the next update
+ and won't be affected by any CPU hotplug events.
+
+ cpuset.cpus.effective
+ A read-only multiple values file which exists on all
+ cpuset-enabled cgroups.
+
+ It lists the onlined CPUs that are actually granted to this
+ cgroup by its parent. These CPUs are allowed to be used by
+ tasks within the current cgroup.
+
+ If "cpuset.cpus" is empty, the "cpuset.cpus.effective" file shows
+ all the CPUs from the parent cgroup that can be available to
+ be used by this cgroup. Otherwise, it should be a subset of
+ "cpuset.cpus" unless none of the CPUs listed in "cpuset.cpus"
+ can be granted. In this case, it will be treated just like an
+ empty "cpuset.cpus".
+
+ Its value will be affected by CPU hotplug events.
+
+ cpuset.mems
+ A read-write multiple values file which exists on non-root
+ cpuset-enabled cgroups.
+
+ It lists the requested memory nodes to be used by tasks within
+ this cgroup. The actual list of memory nodes granted, however,
+ is subjected to constraints imposed by its parent and can differ
+ from the requested memory nodes.
+
+ The memory node numbers are comma-separated numbers or ranges.
+ For example:
+
+ # cat cpuset.mems
+ 0-1,3
+
+ An empty value indicates that the cgroup is using the same
+ setting as the nearest cgroup ancestor with a non-empty
+ "cpuset.mems" or all the available memory nodes if none
+ is found.
+
+ The value of "cpuset.mems" stays constant until the next update
+ and won't be affected by any memory nodes hotplug events.
+
+ cpuset.mems.effective
+ A read-only multiple values file which exists on all
+ cpuset-enabled cgroups.
+
+ It lists the onlined memory nodes that are actually granted to
+ this cgroup by its parent. These memory nodes are allowed to
+ be used by tasks within the current cgroup.
+
+ If "cpuset.mems" is empty, it shows all the memory nodes from the
+ parent cgroup that will be available to be used by this cgroup.
+ Otherwise, it should be a subset of "cpuset.mems" unless none of
+ the memory nodes listed in "cpuset.mems" can be granted. In this
+ case, it will be treated just like an empty "cpuset.mems".
+
+ Its value will be affected by memory nodes hotplug events.
+
+ cpuset.cpus.partition
+ A read-write single value file which exists on non-root
+ cpuset-enabled cgroups. This flag is owned by the parent cgroup
+ and is not delegatable.
+
+ It accepts only the following input values when written to.
+
+ "root" - a paritition root
+ "member" - a non-root member of a partition
+
+ When set to be a partition root, the current cgroup is the
+ root of a new partition or scheduling domain that comprises
+ itself and all its descendants except those that are separate
+ partition roots themselves and their descendants. The root
+ cgroup is always a partition root.
+
+ There are constraints on where a partition root can be set.
+ It can only be set in a cgroup if all the following conditions
+ are true.
+
+ 1) The "cpuset.cpus" is not empty and the list of CPUs are
+ exclusive, i.e. they are not shared by any of its siblings.
+ 2) The parent cgroup is a partition root.
+ 3) The "cpuset.cpus" is also a proper subset of the parent's
+ "cpuset.cpus.effective".
+ 4) There is no child cgroups with cpuset enabled. This is for
+ eliminating corner cases that have to be handled if such a
+ condition is allowed.
+
+ Setting it to partition root will take the CPUs away from the
+ effective CPUs of the parent cgroup. Once it is set, this
+ file cannot be reverted back to "member" if there are any child
+ cgroups with cpuset enabled.
+
+ A parent partition cannot distribute all its CPUs to its
+ child partitions. There must be at least one cpu left in the
+ parent partition.
+
+ Once becoming a partition root, changes to "cpuset.cpus" is
+ generally allowed as long as the first condition above is true,
+ the change will not take away all the CPUs from the parent
+ partition and the new "cpuset.cpus" value is a superset of its
+ children's "cpuset.cpus" values.
+
+ Sometimes, external factors like changes to ancestors'
+ "cpuset.cpus" or cpu hotplug can cause the state of the partition
+ root to change. On read, the "cpuset.sched.partition" file
+ can show the following values.
+
+ "member" Non-root member of a partition
+ "root" Partition root
+ "root invalid" Invalid partition root
+
+ It is a partition root if the first 2 partition root conditions
+ above are true and at least one CPU from "cpuset.cpus" is
+ granted by the parent cgroup.
+
+ A partition root can become invalid if none of CPUs requested
+ in "cpuset.cpus" can be granted by the parent cgroup or the
+ parent cgroup is no longer a partition root itself. In this
+ case, it is not a real partition even though the restriction
+ of the first partition root condition above will still apply.
+ The cpu affinity of all the tasks in the cgroup will then be
+ associated with CPUs in the nearest ancestor partition.
+
+ An invalid partition root can be transitioned back to a
+ real partition root if at least one of the requested CPUs
+ can now be granted by its parent. In this case, the cpu
+ affinity of all the tasks in the formerly invalid partition
+ will be associated to the CPUs of the newly formed partition.
+ Changing the partition state of an invalid partition root to
+ "member" is always allowed even if child cpusets are present.
+
+
Device controller
-----------------
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index ff4daa780ae8..b7c9040f547e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -486,10 +486,14 @@
cut the overhead, others just disable the usage. So
only cgroup_disable=memory is actually worthy}
- cgroup_no_v1= [KNL] Disable one, multiple, all cgroup controllers in v1
- Format: { controller[,controller...] | "all" }
+ cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1
+ Format: { { controller | "all" | "named" }
+ [,{ controller | "all" | "named" }...] }
Like cgroup_disable, but only applies to cgroup v1;
the blacklisted controllers remain available in cgroup2.
+ "all" blacklists all controllers and "named" disables
+ named mounts. Specifying both "all" and "named" disables
+ all v1 hierarchies.
cgroup.memory= [KNL] Pass options to the cgroup memory controller.
Format: <string>
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 5e1694fe035b..8fcbae1b8db0 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -92,6 +92,7 @@ enum {
CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */
+ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */
/* internal flags, do not use outside cgroup core proper */
__CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 75568fcf2180..c950864016e2 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -11,6 +11,8 @@
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+extern bool cgroup_debug;
+extern void __init enable_debug_cgroup(void);
/*
* cgroup_path() takes a spin lock. It is good practice not to take
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 51063e7a93c2..583b969b0c0e 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -27,6 +27,9 @@
/* Controllers blocked by the commandline in v1 */
static u16 cgroup_no_v1_mask;
+/* disable named v1 mounts */
+static bool cgroup_no_v1_named;
+
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
@@ -963,6 +966,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
}
if (!strncmp(token, "name=", 5)) {
const char *name = token + 5;
+
+ /* blocked by boot param? */
+ if (cgroup_no_v1_named)
+ return -ENOENT;
/* Can't specify an empty name */
if (!strlen(name))
return -EINVAL;
@@ -1292,7 +1299,12 @@ static int __init cgroup_no_v1(char *str)
if (!strcmp(token, "all")) {
cgroup_no_v1_mask = U16_MAX;
- break;
+ continue;
+ }
+
+ if (!strcmp(token, "named")) {
+ cgroup_no_v1_named = true;
+ continue;
}
for_each_subsys(ss, i) {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 39eb36ba36ad..f31bd61c9466 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -86,6 +86,7 @@ EXPORT_SYMBOL_GPL(css_set_lock);
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
+bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
@@ -1429,12 +1430,15 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
struct cgroup_subsys *ss = cft->ss;
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
- !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
- snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
- cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+ !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
+ const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
+
+ snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
+ dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
- else
+ } else {
strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
+ }
return buf;
}
@@ -1774,7 +1778,7 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
*root_flags = 0;
- if (!data)
+ if (!data || *data == '\0')
return 0;
while ((token = strsep(&data, ",")) != NULL) {
@@ -3669,7 +3673,8 @@ restart:
continue;
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
continue;
-
+ if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
+ continue;
if (is_add) {
ret = cgroup_add_file(css, cgrp, cft);
if (ret) {
@@ -4232,20 +4237,25 @@ static void css_task_iter_advance(struct css_task_iter *it)
lockdep_assert_held(&css_set_lock);
repeat:
- /*
- * Advance iterator to find next entry. cset->tasks is consumed
- * first and then ->mg_tasks. After ->mg_tasks, we move onto the
- * next cset.
- */
- next = it->task_pos->next;
+ if (it->task_pos) {
+ /*
+ * Advance iterator to find next entry. cset->tasks is
+ * consumed first and then ->mg_tasks. After ->mg_tasks,
+ * we move onto the next cset.
+ */
+ next = it->task_pos->next;
- if (next == it->tasks_head)
- next = it->mg_tasks_head->next;
+ if (next == it->tasks_head)
+ next = it->mg_tasks_head->next;
- if (next == it->mg_tasks_head)
+ if (next == it->mg_tasks_head)
+ css_task_iter_advance_css_set(it);
+ else
+ it->task_pos = next;
+ } else {
+ /* called from start, proceed to the first cset */
css_task_iter_advance_css_set(it);
- else
- it->task_pos = next;
+ }
/* if PROCS, skip over tasks which aren't group leaders */
if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
@@ -4285,7 +4295,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
it->cset_head = it->cset_pos;
- css_task_iter_advance_css_set(it);
+ css_task_iter_advance(it);
spin_unlock_irq(&css_set_lock);
}
@@ -5773,6 +5783,16 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
+void __init __weak enable_debug_cgroup(void) { }
+
+static int __init enable_cgroup_debug(char *str)
+{
+ cgroup_debug = true;
+ enable_debug_cgroup();
+ return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -6008,10 +6028,8 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
- if (unlikely(ret >= size)) {
- WARN_ON(1);
+ if (WARN_ON(ret >= size))
break;
- }
}
return ret;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 9510a5b32eaf..479743db6c37 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -110,6 +110,16 @@ struct cpuset {
nodemask_t effective_mems;
/*
+ * CPUs allocated to child sub-partitions (default hierarchy only)
+ * - CPUs granted by the parent = effective_cpus U subparts_cpus
+ * - effective_cpus and subparts_cpus are mutually exclusive.
+ *
+ * effective_cpus contains only onlined CPUs, but subparts_cpus
+ * may have offlined ones.
+ */
+ cpumask_var_t subparts_cpus;
+
+ /*
* This is old Memory Nodes tasks took on.
*
* - top_cpuset.old_mems_allowed is initialized to mems_allowed.
@@ -134,6 +144,47 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
+
+ /* number of CPUs in subparts_cpus */
+ int nr_subparts_cpus;
+
+ /* partition root state */
+ int partition_root_state;
+
+ /*
+ * Default hierarchy only:
+ * use_parent_ecpus - set if using parent's effective_cpus
+ * child_ecpus_count - # of children with use_parent_ecpus set
+ */
+ int use_parent_ecpus;
+ int child_ecpus_count;
+};
+
+/*
+ * Partition root states:
+ *
+ * 0 - not a partition root
+ *
+ * 1 - partition root
+ *
+ * -1 - invalid partition root
+ * None of the cpus in cpus_allowed can be put into the parent's
+ * subparts_cpus. In this case, the cpuset is not a real partition
+ * root anymore. However, the CPU_EXCLUSIVE bit will still be set
+ * and the cpuset can be restored back to a partition root if the
+ * parent cpuset can give more CPUs back to this child cpuset.
+ */
+#define PRS_DISABLED 0
+#define PRS_ENABLED 1
+#define PRS_ERROR -1
+
+/*
+ * Temporary cpumasks for working with partitions that are passed among
+ * functions to avoid memory allocation in inner functions.
+ */
+struct tmpmasks {
+ cpumask_var_t addmask, delmask; /* For partition root */
+ cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
};
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
@@ -218,9 +269,15 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
+static inline int is_partition_root(const struct cpuset *cs)
+{
+ return cs->partition_root_state > 0;
+}
+
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
+ .partition_root_state = PRS_ENABLED,
};
/**
@@ -419,6 +476,65 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
}
/**
+ * alloc_cpumasks - allocate three cpumasks for cpuset
+ * @cs: the cpuset that have cpumasks to be allocated.
+ * @tmp: the tmpmasks structure pointer
+ * Return: 0 if successful, -ENOMEM otherwise.
+ *
+ * Only one of the two input arguments should be non-NULL.
+ */
+static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ cpumask_var_t *pmask1, *pmask2, *pmask3;
+
+ if (cs) {
+ pmask1 = &cs->cpus_allowed;
+ pmask2 = &cs->effective_cpus;
+ pmask3 = &cs->subparts_cpus;
+ } else {
+ pmask1 = &tmp->new_cpus;
+ pmask2 = &tmp->addmask;
+ pmask3 = &tmp->delmask;
+ }
+
+ if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!zalloc_cpumask_var(pmask2, GFP_KERNEL))
+ goto free_one;
+
+ if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
+ goto free_two;
+
+ return 0;
+
+free_two:
+ free_cpumask_var(*pmask2);
+free_one:
+ free_cpumask_var(*pmask1);
+ return -ENOMEM;
+}
+
+/**
+ * free_cpumasks - free cpumasks in a tmpmasks structure
+ * @cs: the cpuset that have cpumasks to be free.
+ * @tmp: the tmpmasks structure pointer
+ */
+static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
+{
+ if (cs) {
+ free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->effective_cpus);
+ free_cpumask_var(cs->subparts_cpus);
+ }
+ if (tmp) {
+ free_cpumask_var(tmp->new_cpus);
+ free_cpumask_var(tmp->addmask);
+ free_cpumask_var(tmp->delmask);
+ }
+}
+
+/**
* alloc_trial_cpuset - allocate a trial cpuset
* @cs: the cpuset that the trial cpuset duplicates
*/
@@ -430,31 +546,24 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
if (!trial)
return NULL;
- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
- goto free_cs;
- if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+ if (alloc_cpumasks(trial, NULL)) {
+ kfree(trial);
+ return NULL;
+ }
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
-
-free_cpus:
- free_cpumask_var(trial->cpus_allowed);
-free_cs:
- kfree(trial);
- return NULL;
}
/**
- * free_trial_cpuset - free the trial cpuset
- * @trial: the trial cpuset to be freed
+ * free_cpuset - free the cpuset
+ * @cs: the cpuset to be freed
*/
-static void free_trial_cpuset(struct cpuset *trial)
+static inline void free_cpuset(struct cpuset *cs)
{
- free_cpumask_var(trial->effective_cpus);
- free_cpumask_var(trial->cpus_allowed);
- kfree(trial);
+ free_cpumasks(cs, NULL);
+ kfree(cs);
}
/*
@@ -660,13 +769,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
+ bool root_load_balance = is_sched_load_balance(&top_cpuset);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (is_sched_load_balance(&top_cpuset)) {
+ if (root_load_balance && !top_cpuset.nr_subparts_cpus) {
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
@@ -689,6 +799,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
csn = 0;
rcu_read_lock();
+ if (root_load_balance)
+ csa[csn++] = &top_cpuset;
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
if (cp == &top_cpuset)
continue;
@@ -699,6 +811,9 @@ static int generate_sched_domains(cpumask_var_t **domains,
* parent's cpus, so just skip them, and then we call
* update_domain_attr_tree() to calc relax_domain_level of
* the corresponding sched domain.
+ *
+ * If root is load-balancing, we can skip @cp if it
+ * is a subset of the root's effective_cpus.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
@@ -706,11 +821,16 @@ static int generate_sched_domains(cpumask_var_t **domains,
housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
+ if (root_load_balance &&
+ cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
+ continue;
+
if (is_sched_load_balance(cp))
csa[csn++] = cp;
- /* skip @cp's subtree */
- pos_css = css_rightmost_descendant(pos_css);
+ /* skip @cp's subtree if not a partition root */
+ if (!is_partition_root(cp))
+ pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
@@ -838,7 +958,12 @@ static void rebuild_sched_domains_locked(void)
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
*/
- if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+ if (!top_cpuset.nr_subparts_cpus &&
+ !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+ goto out;
+
+ if (top_cpuset.nr_subparts_cpus &&
+ !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
goto out;
/* Generate domain masks and attrs */
@@ -881,10 +1006,248 @@ static void update_tasks_cpumask(struct cpuset *cs)
css_task_iter_end(&it);
}
+/**
+ * compute_effective_cpumask - Compute the effective cpumask of the cpuset
+ * @new_cpus: the temp variable for the new effective_cpus mask
+ * @cs: the cpuset the need to recompute the new effective_cpus mask
+ * @parent: the parent cpuset
+ *
+ * If the parent has subpartition CPUs, include them in the list of
+ * allowable CPUs in computing the new effective_cpus mask. Since offlined
+ * CPUs are not removed from subparts_cpus, we have to use cpu_active_mask
+ * to mask those out.
+ */
+static void compute_effective_cpumask(struct cpumask *new_cpus,
+ struct cpuset *cs, struct cpuset *parent)
+{
+ if (parent->nr_subparts_cpus) {
+ cpumask_or(new_cpus, parent->effective_cpus,
+ parent->subparts_cpus);
+ cpumask_and(new_cpus, new_cpus, cs->cpus_allowed);
+ cpumask_and(new_cpus, new_cpus, cpu_active_mask);
+ } else {
+ cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
+ }
+}
+
+/*
+ * Commands for update_parent_subparts_cpumask
+ */
+enum subparts_cmd {
+ partcmd_enable, /* Enable partition root */
+ partcmd_disable, /* Disable partition root */
+ partcmd_update, /* Update parent's subparts_cpus */
+};
+
+/**
+ * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
+ * @cpuset: The cpuset that requests change in partition root state
+ * @cmd: Partition root state change command
+ * @newmask: Optional new cpumask for partcmd_update
+ * @tmp: Temporary addmask and delmask
+ * Return: 0, 1 or an error code
+ *
+ * For partcmd_enable, the cpuset is being transformed from a non-partition
+ * root to a partition root. The cpus_allowed mask of the given cpuset will
+ * be put into parent's subparts_cpus and taken away from parent's
+ * effective_cpus. The function will return 0 if all the CPUs listed in
+ * cpus_allowed can be granted or an error code will be returned.
+ *
+ * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * root back to a non-partition root. any CPUs in cpus_allowed that are in
+ * parent's subparts_cpus will be taken away from that cpumask and put back
+ * into parent's effective_cpus. 0 should always be returned.
+ *
+ * For partcmd_update, if the optional newmask is specified, the cpu
+ * list is to be changed from cpus_allowed to newmask. Otherwise,
+ * cpus_allowed is assumed to remain the same. The cpuset should either
+ * be a partition root or an invalid partition root. The partition root
+ * state may change if newmask is NULL and none of the requested CPUs can
+ * be granted by the parent. The function will return 1 if changes to
+ * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
+ * Error code should only be returned when newmask is non-NULL.
+ *
+ * The partcmd_enable and partcmd_disable commands are used by
+ * update_prstate(). The partcmd_update command is used by
+ * update_cpumasks_hier() with newmask NULL and update_cpumask() with
+ * newmask set.
+ *
+ * The checking is more strict when enabling partition root than the
+ * other two commands.
+ *
+ * Because of the implicit cpu exclusive nature of a partition root,
+ * cpumask changes that violates the cpu exclusivity rule will not be
+ * permitted when checked by validate_change(). The validate_change()
+ * function will also prevent any changes to the cpu list if it is not
+ * a superset of children's cpu lists.
+ */
+static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+ struct cpumask *newmask,
+ struct tmpmasks *tmp)
+{
+ struct cpuset *parent = parent_cs(cpuset);
+ int adding; /* Moving cpus from effective_cpus to subparts_cpus */
+ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
+ bool part_error = false; /* Partition error? */
+
+ lockdep_assert_held(&cpuset_mutex);
+
+ /*
+ * The parent must be a partition root.
+ * The new cpumask, if present, or the current cpus_allowed must
+ * not be empty.
+ */
+ if (!is_partition_root(parent) ||
+ (newmask && cpumask_empty(newmask)) ||
+ (!newmask && cpumask_empty(cpuset->cpus_allowed)))
+ return -EINVAL;
+
+ /*
+ * Enabling/disabling partition root is not allowed if there are
+ * online children.
+ */
+ if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
+ return -EBUSY;
+
+ /*
+ * Enabling partition root is not allowed if not all the CPUs
+ * can be granted from parent's effective_cpus or at least one
+ * CPU will be left after that.
+ */
+ if ((cmd == partcmd_enable) &&
+ (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
+ cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
+ return -EINVAL;
+
+ /*
+ * A cpumask update cannot make parent's effective_cpus become empty.
+ */
+ adding = deleting = false;
+ if (cmd == partcmd_enable) {
+ cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+ adding = true;
+ } else if (cmd == partcmd_disable) {
+ deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ parent->subparts_cpus);
+ } else if (newmask) {
+ /*
+ * partcmd_update with newmask:
+ *
+ * delmask = cpus_allowed & ~newmask & parent->subparts_cpus
+ * addmask = newmask & parent->effective_cpus
+ * & ~parent->subparts_cpus
+ */
+ cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+ deleting = cpumask_and(tmp->delmask, tmp->delmask,
+ parent->subparts_cpus);
+
+ cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+ parent->subparts_cpus);
+ /*
+ * Return error if the new effective_cpus could become empty.
+ */
+ if (adding &&
+ cpumask_equal(parent->effective_cpus, tmp->addmask)) {
+ if (!deleting)
+ return -EINVAL;
+ /*
+ * As some of the CPUs in subparts_cpus might have
+ * been offlined, we need to compute the real delmask
+ * to confirm that.
+ */
+ if (!cpumask_and(tmp->addmask, tmp->delmask,
+ cpu_active_mask))
+ return -EINVAL;
+ cpumask_copy(tmp->addmask, parent->effective_cpus);
+ }
+ } else {
+ /*
+ * partcmd_update w/o newmask:
+ *
+ * addmask = cpus_allowed & parent->effectiveb_cpus
+ *
+ * Note that parent's subparts_cpus may have been
+ * pre-shrunk in case there is a change in the cpu list.
+ * So no deletion is needed.
+ */
+ adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
+ parent->effective_cpus);
+ part_error = cpumask_equal(tmp->addmask,
+ parent->effective_cpus);
+ }
+
+ if (cmd == partcmd_update) {
+ int prev_prs = cpuset->partition_root_state;
+
+ /*
+ * Check for possible transition between PRS_ENABLED
+ * and PRS_ERROR.
+ */
+ switch (cpuset->partition_root_state) {
+ case PRS_ENABLED:
+ if (part_error)
+ cpuset->partition_root_state = PRS_ERROR;
+ break;
+ case PRS_ERROR:
+ if (!part_error)
+ cpuset->partition_root_state = PRS_ENABLED;
+ break;
+ }
+ /*
+ * Set part_error if previously in invalid state.
+ */
+ part_error = (prev_prs == PRS_ERROR);
+ }
+
+ if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
+ return 0; /* Nothing need to be done */
+
+ if (cpuset->partition_root_state == PRS_ERROR) {
+ /*
+ * Remove all its cpus from parent's subparts_cpus.
+ */
+ adding = false;
+ deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ parent->subparts_cpus);
+ }
+
+ if (!adding && !deleting)
+ return 0;
+
+ /*
+ * Change the parent's subparts_cpus.
+ * Newly added CPUs will be removed from effective_cpus and
+ * newly deleted ones will be added back to effective_cpus.
+ */
+ spin_lock_irq(&callback_lock);
+ if (adding) {
+ cpumask_or(parent->subparts_cpus,
+ parent->subparts_cpus, tmp->addmask);
+ cpumask_andnot(parent->effective_cpus,
+ parent->effective_cpus, tmp->addmask);
+ }
+ if (deleting) {
+ cpumask_andnot(parent->subparts_cpus,
+ parent->subparts_cpus, tmp->delmask);
+ /*
+ * Some of the CPUs in subparts_cpus might have been offlined.
+ */
+ cpumask_and(tmp->delmask, tmp->delmask, cpu_active_mask);
+ cpumask_or(parent->effective_cpus,
+ parent->effective_cpus, tmp->delmask);
+ }
+
+ parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+ spin_unlock_irq(&callback_lock);
+
+ return cmd == partcmd_update;
+}
+
/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
- * @cs: the cpuset to consider
- * @new_cpus: temp variable for calculating new effective_cpus
+ * @cs: the cpuset to consider
+ * @tmp: temp variables for calculating effective_cpus & partition setup
*
* When congifured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
@@ -893,7 +1256,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
*
* Called with cpuset_mutex held
*/
-static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -903,27 +1266,115 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
- cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+ compute_effective_cpumask(tmp->new_cpus, cp, parent);
/*
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (is_in_v2_mode() && cpumask_empty(new_cpus))
- cpumask_copy(new_cpus, parent->effective_cpus);
+ if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
+ cpumask_copy(tmp->new_cpus, parent->effective_cpus);
+ if (!cp->use_parent_ecpus) {
+ cp->use_parent_ecpus = true;
+ parent->child_ecpus_count++;
+ }
+ } else if (cp->use_parent_ecpus) {
+ cp->use_parent_ecpus = false;
+ WARN_ON_ONCE(!parent->child_ecpus_count);
+ parent->child_ecpus_count--;
+ }
- /* Skip the whole subtree if the cpumask remains the same. */
- if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+ /*
+ * Skip the whole subtree if the cpumask remains the same
+ * and has no partition root state.
+ */
+ if (!cp->partition_root_state &&
+ cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
+ /*
+ * update_parent_subparts_cpumask() should have been called
+ * for cs already in update_cpumask(). We should also call
+ * update_tasks_cpumask() again for tasks in the parent
+ * cpuset if the parent's subparts_cpus changes.
+ */
+ if ((cp != cs) && cp->partition_root_state) {
+ switch (parent->partition_root_state) {
+ case PRS_DISABLED:
+ /*
+ * If parent is not a partition root or an
+ * invalid partition root, clear the state
+ * state and the CS_CPU_EXCLUSIVE flag.
+ */
+ WARN_ON_ONCE(cp->partition_root_state
+ != PRS_ERROR);
+ cp->partition_root_state = 0;
+
+ /*
+ * clear_bit() is an atomic operation and
+ * readers aren't interested in the state
+ * of CS_CPU_EXCLUSIVE anyway. So we can
+ * just update the flag without holding
+ * the callback_lock.
+ */
+ clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
+ break;
+
+ case PRS_ENABLED:
+ if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
+ update_tasks_cpumask(parent);
+ break;
+
+ case PRS_ERROR:
+ /*
+ * When parent is invalid, it has to be too.
+ */
+ cp->partition_root_state = PRS_ERROR;
+ if (cp->nr_subparts_cpus) {
+ cp->nr_subparts_cpus = 0;
+ cpumask_clear(cp->subparts_cpus);
+ }
+ break;
+ }
+ }
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
spin_lock_irq(&callback_lock);
- cpumask_copy(cp->effective_cpus, new_cpus);
+
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ if (cp->nr_subparts_cpus &&
+ (cp->partition_root_state != PRS_ENABLED)) {
+ cp->nr_subparts_cpus = 0;
+ cpumask_clear(cp->subparts_cpus);
+ } else if (cp->nr_subparts_cpus) {
+ /*
+ * Make sure that effective_cpus & subparts_cpus
+ * are mutually exclusive.
+ *
+ * In the unlikely event that effective_cpus
+ * becomes empty. we clear cp->nr_subparts_cpus and
+ * let its child partition roots to compete for
+ * CPUs again.
+ */
+ cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
+ cp->subparts_cpus);
+ if (cpumask_empty(cp->effective_cpus)) {
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ cpumask_clear(cp->subparts_cpus);
+ cp->nr_subparts_cpus = 0;
+ } else if (!cpumask_subset(cp->subparts_cpus,
+ tmp->new_cpus)) {
+ cpumask_andnot(cp->subparts_cpus,
+ cp->subparts_cpus, tmp->new_cpus);
+ cp->nr_subparts_cpus
+ = cpumask_weight(cp->subparts_cpus);
+ }
+ }
spin_unlock_irq(&callback_lock);
WARN_ON(!is_in_v2_mode() &&
@@ -932,11 +1383,15 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
update_tasks_cpumask(cp);
/*
- * If the effective cpumask of any non-empty cpuset is changed,
- * we need to rebuild sched domains.
+ * On legacy hierarchy, if the effective cpumask of any non-
+ * empty cpuset is changed, we need to rebuild sched domains.
+ * On default hierarchy, the cpuset needs to be a partition
+ * root as well.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
- is_sched_load_balance(cp))
+ is_sched_load_balance(cp) &&
+ (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ is_partition_root(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
@@ -949,6 +1404,35 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
}
/**
+ * update_sibling_cpumasks - Update siblings cpumasks
+ * @parent: Parent cpuset
+ * @cs: Current cpuset
+ * @tmp: Temp variables
+ */
+static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
+ struct tmpmasks *tmp)
+{
+ struct cpuset *sibling;
+ struct cgroup_subsys_state *pos_css;
+
+ /*
+ * Check all its siblings and call update_cpumasks_hier()
+ * if their use_parent_ecpus flag is set in order for them
+ * to use the right effective_cpus value.
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(sibling, pos_css, parent) {
+ if (sibling == cs)
+ continue;
+ if (!sibling->use_parent_ecpus)
+ continue;
+
+ update_cpumasks_hier(sibling, tmp);
+ }
+ rcu_read_unlock();
+}
+
+/**
* update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
* @cs: the cpuset to consider
* @trialcs: trial cpuset
@@ -958,6 +1442,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
+ struct tmpmasks tmp;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -989,12 +1474,50 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ /*
+ * Use the cpumasks in trialcs for tmpmasks when they are pointers
+ * to allocated cpumasks.
+ */
+ tmp.addmask = trialcs->subparts_cpus;
+ tmp.delmask = trialcs->effective_cpus;
+ tmp.new_cpus = trialcs->cpus_allowed;
+#endif
+
+ if (cs->partition_root_state) {
+ /* Cpumask of a partition root cannot be empty */
+ if (cpumask_empty(trialcs->cpus_allowed))
+ return -EINVAL;
+ if (update_parent_subparts_cpumask(cs, partcmd_update,
+ trialcs->cpus_allowed, &tmp) < 0)
+ return -EINVAL;
+ }
+
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+
+ /*
+ * Make sure that subparts_cpus is a subset of cpus_allowed.
+ */
+ if (cs->nr_subparts_cpus) {
+ cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
+ cs->cpus_allowed);
+ cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ }
spin_unlock_irq(&callback_lock);
- /* use trialcs->cpus_allowed as a temp variable */
- update_cpumasks_hier(cs, trialcs->cpus_allowed);
+ update_cpumasks_hier(cs, &tmp);
+
+ if (cs->partition_root_state) {
+ struct cpuset *parent = parent_cs(cs);
+
+ /*
+ * For partition root, update the cpumasks of sibling
+ * cpusets if they use parent's effective_cpus.
+ */
+ if (parent->child_ecpus_count)
+ update_sibling_cpumasks(parent, cs, &tmp);
+ }
return 0;
}
@@ -1348,7 +1871,95 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
if (spread_flag_changed)
update_tasks_flags(cs);
out:
- free_trial_cpuset(trialcs);
+ free_cpuset(trialcs);
+ return err;
+}
+
+/*
+ * update_prstate - update partititon_root_state
+ * cs: the cpuset to update
+ * val: 0 - disabled, 1 - enabled
+ *
+ * Call with cpuset_mutex held.
+ */
+static int update_prstate(struct cpuset *cs, int val)
+{
+ int err;
+ struct cpuset *parent = parent_cs(cs);
+ struct tmpmasks tmp;
+
+ if ((val != 0) && (val != 1))
+ return -EINVAL;
+ if (val == cs->partition_root_state)
+ return 0;
+
+ /*
+ * Cannot force a partial or invalid partition root to a full
+ * partition root.
+ */
+ if (val && cs->partition_root_state)
+ return -EINVAL;
+
+ if (alloc_cpumasks(NULL, &tmp))
+ return -ENOMEM;
+
+ err = -EINVAL;
+ if (!cs->partition_root_state) {
+ /*
+ * Turning on partition root requires setting the
+ * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
+ * cannot be NULL.
+ */
+ if (cpumask_empty(cs->cpus_allowed))
+ goto out;
+
+ err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
+ if (err)
+ goto out;
+
+ err = update_parent_subparts_cpumask(cs, partcmd_enable,
+ NULL, &tmp);
+ if (err) {
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ goto out;
+ }
+ cs->partition_root_state = PRS_ENABLED;
+ } else {
+ /*
+ * Turning off partition root will clear the
+ * CS_CPU_EXCLUSIVE bit.
+ */
+ if (cs->partition_root_state == PRS_ERROR) {
+ cs->partition_root_state = 0;
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ err = 0;
+ goto out;
+ }
+
+ err = update_parent_subparts_cpumask(cs, partcmd_disable,
+ NULL, &tmp);
+ if (err)
+ goto out;
+
+ cs->partition_root_state = 0;
+
+ /* Turning off CS_CPU_EXCLUSIVE will not return error */
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+ }
+
+ /*
+ * Update cpumask of parent's tasks except when it is the top
+ * cpuset as some system daemons cannot be mapped to other CPUs.
+ */
+ if (parent != &top_cpuset)
+ update_tasks_cpumask(parent);
+
+ if (parent->child_ecpus_count)
+ update_sibling_cpumasks(parent, cs, &tmp);
+
+ rebuild_sched_domains_locked();
+out:
+ free_cpumasks(NULL, &tmp);
return err;
}
@@ -1498,10 +2109,8 @@ out_unlock:
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
- struct cpuset *cs;
cgroup_taskset_first(tset, &css);
- cs = css_cs(css);
mutex_lock(&cpuset_mutex);
css_cs(css)->attach_in_progress--;
@@ -1593,10 +2202,12 @@ typedef enum {
FILE_MEMLIST,
FILE_EFFECTIVE_CPULIST,
FILE_EFFECTIVE_MEMLIST,
+ FILE_SUBPARTS_CPULIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
FILE_SCHED_LOAD_BALANCE,
+ FILE_PARTITION_ROOT,
FILE_SCHED_RELAX_DOMAIN_LEVEL,
FILE_MEMORY_PRESSURE_ENABLED,
FILE_MEMORY_PRESSURE,
@@ -1732,7 +2343,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
break;
}
- free_trial_cpuset(trialcs);
+ free_cpuset(trialcs);
out_unlock:
mutex_unlock(&cpuset_mutex);
kernfs_unbreak_active_protection(of->kn);
@@ -1770,6 +2381,9 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_EFFECTIVE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
+ case FILE_SUBPARTS_CPULIST:
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
+ break;
default:
ret = -EINVAL;
}
@@ -1824,12 +2438,60 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
return 0;
}
+static int sched_partition_show(struct seq_file *seq, void *v)
+{
+ struct cpuset *cs = css_cs(seq_css(seq));
+
+ switch (cs->partition_root_state) {
+ case PRS_ENABLED:
+ seq_puts(seq, "root\n");
+ break;
+ case PRS_DISABLED:
+ seq_puts(seq, "member\n");
+ break;
+ case PRS_ERROR:
+ seq_puts(seq, "root invalid\n");
+ break;
+ }
+ return 0;
+}
+
+static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct cpuset *cs = css_cs(of_css(of));
+ int val;
+ int retval = -ENODEV;
+
+ buf = strstrip(buf);
+
+ /*
+ * Convert "root" to ENABLED, and convert "member" to DISABLED.
+ */
+ if (!strcmp(buf, "root"))
+ val = PRS_ENABLED;
+ else if (!strcmp(buf, "member"))
+ val = PRS_DISABLED;
+ else
+ return -EINVAL;
+
+ css_get(&cs->css);
+ mutex_lock(&cpuset_mutex);
+ if (!is_cpuset_online(cs))
+ goto out_unlock;
+
+ retval = update_prstate(cs, val);
+out_unlock:
+ mutex_unlock(&cpuset_mutex);
+ css_put(&cs->css);
+ return retval ?: nbytes;
+}
/*
* for the common functions, 'private' gives the type of file
*/
-static struct cftype files[] = {
+static struct cftype legacy_files[] = {
{
.name = "cpus",
.seq_show = cpuset_common_seq_show,
@@ -1932,6 +2594,60 @@ static struct cftype files[] = {
};
/*
+ * This is currently a minimal set for the default hierarchy. It can be
+ * expanded later on by migrating more features and control files from v1.
+ */
+static struct cftype dfl_files[] = {
+ {
+ .name = "cpus",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * NR_CPUS),
+ .private = FILE_CPULIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "mems",
+ .seq_show = cpuset_common_seq_show,
+ .write = cpuset_write_resmask,
+ .max_write_len = (100U + 6 * MAX_NUMNODES),
+ .private = FILE_MEMLIST,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "mems.effective",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
+ .name = "cpus.partition",
+ .seq_show = sched_partition_show,
+ .write = sched_partition_write,
+ .private = FILE_PARTITION_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+
+ {
+ .name = "cpus.subpartitions",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_SUBPARTS_CPULIST,
+ .flags = CFTYPE_DEBUG,
+ },
+
+ { } /* terminate */
+};
+
+
+/*
* cpuset_css_alloc - allocate a cpuset css
* cgrp: control group that the new cpuset will be part of
*/
@@ -1947,26 +2663,19 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
- goto free_cs;
- if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+
+ if (alloc_cpumasks(cs, NULL)) {
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
+ }
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
- cpumask_clear(cs->effective_cpus);
nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
return &cs->css;
-
-free_cpus:
- free_cpumask_var(cs->cpus_allowed);
-free_cs:
- kfree(cs);
- return ERR_PTR(-ENOMEM);
}
static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1993,6 +2702,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
+ cs->use_parent_ecpus = true;
+ parent->child_ecpus_count++;
}
spin_unlock_irq(&callback_lock);
@@ -2035,7 +2746,12 @@ out_unlock:
/*
* If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains_locked().
+ * will call rebuild_sched_domains_locked(). That is not needed
+ * in the default hierarchy where only changes in partition
+ * will cause repartitioning.
+ *
+ * If the cpuset has the 'sched.partition' flag enabled, simulate
+ * turning 'sched.partition" off.
*/
static void cpuset_css_offline(struct cgroup_subsys_state *css)
@@ -2044,9 +2760,20 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
mutex_lock(&cpuset_mutex);
- if (is_sched_load_balance(cs))
+ if (is_partition_root(cs))
+ update_prstate(cs, 0);
+
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ if (cs->use_parent_ecpus) {
+ struct cpuset *parent = parent_cs(cs);
+
+ cs->use_parent_ecpus = false;
+ parent->child_ecpus_count--;
+ }
+
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
@@ -2057,9 +2784,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- free_cpumask_var(cs->effective_cpus);
- free_cpumask_var(cs->cpus_allowed);
- kfree(cs);
+ free_cpuset(cs);
}
static void cpuset_bind(struct cgroup_subsys_state *root_css)
@@ -2105,8 +2830,10 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.post_attach = cpuset_post_attach,
.bind = cpuset_bind,
.fork = cpuset_fork,
- .legacy_cftypes = files,
+ .legacy_cftypes = legacy_files,
+ .dfl_cftypes = dfl_files,
.early_init = true,
+ .threaded = true,
};
/**
@@ -2121,6 +2848,7 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
+ BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
@@ -2227,20 +2955,29 @@ hotplug_update_tasks(struct cpuset *cs,
update_tasks_nodemask(cs);
}
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
+ * @tmp: the tmpmasks structure pointer
*
* Compare @cs's cpu and mem masks against top_cpuset and if some have gone
* offline, update @cs accordingly. If @cs ends up with no CPU or memory,
* all its tasks are moved to the nearest ancestor with both resources.
*/
-static void cpuset_hotplug_update_tasks(struct cpuset *cs)
+static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated;
bool mems_updated;
+ struct cpuset *parent;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -2255,9 +2992,60 @@ retry:
goto retry;
}
- cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
- nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
+ parent = parent_cs(cs);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
+
+ if (cs->nr_subparts_cpus)
+ /*
+ * Make sure that CPUs allocated to child partitions
+ * do not show up in effective_cpus.
+ */
+ cpumask_andnot(&new_cpus, &new_cpus, cs->subparts_cpus);
+
+ if (!tmp || !cs->partition_root_state)
+ goto update_tasks;
+
+ /*
+ * In the unlikely event that a partition root has empty
+ * effective_cpus or its parent becomes erroneous, we have to
+ * transition it to the erroneous state.
+ */
+ if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
+ (parent->partition_root_state == PRS_ERROR))) {
+ if (cs->nr_subparts_cpus) {
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ }
+
+ /*
+ * If the effective_cpus is empty because the child
+ * partitions take away all the CPUs, we can keep
+ * the current partition and let the child partitions
+ * fight for available CPUs.
+ */
+ if ((parent->partition_root_state == PRS_ERROR) ||
+ cpumask_empty(&new_cpus)) {
+ update_parent_subparts_cpumask(cs, partcmd_disable,
+ NULL, tmp);
+ cs->partition_root_state = PRS_ERROR;
+ }
+ cpuset_force_rebuild();
+ }
+
+ /*
+ * On the other hand, an erroneous partition root may be transitioned
+ * back to a regular one or a partition root with no CPU allocated
+ * from the parent may change to erroneous.
+ */
+ if (is_partition_root(parent) &&
+ ((cs->partition_root_state == PRS_ERROR) ||
+ !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
+ update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
+ cpuset_force_rebuild();
+update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
@@ -2271,13 +3059,6 @@ retry:
mutex_unlock(&cpuset_mutex);
}
-static bool force_rebuild;
-
-void cpuset_force_rebuild(void)
-{
- force_rebuild = true;
-}
-
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2300,6 +3081,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
bool on_dfl = is_in_v2_mode();
+ struct tmpmasks tmp, *ptmp = NULL;
+
+ if (on_dfl && !alloc_cpumasks(NULL, &tmp))
+ ptmp = &tmp;
mutex_lock(&cpuset_mutex);
@@ -2307,6 +3092,11 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
+ /*
+ * If subparts_cpus is populated, it is likely that the check below
+ * will produce a false positive on cpus_updated when the cpu list
+ * isn't changed. It is extra work, but it is better to be safe.
+ */
cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
@@ -2315,6 +3105,22 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ /*
+ * Make sure that CPUs allocated to child partitions
+ * do not show up in effective_cpus. If no CPU is left,
+ * we clear the subparts_cpus & let the child partitions
+ * fight for the CPUs again.
+ */
+ if (top_cpuset.nr_subparts_cpus) {
+ if (cpumask_subset(&new_cpus,
+ top_cpuset.subparts_cpus)) {
+ top_cpuset.nr_subparts_cpus = 0;
+ cpumask_clear(top_cpuset.subparts_cpus);
+ } else {
+ cpumask_andnot(&new_cpus, &new_cpus,
+ top_cpuset.subparts_cpus);
+ }
+ }
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
spin_unlock_irq(&callback_lock);
/* we don't mess with cpumasks of tasks in top_cpuset */
@@ -2343,7 +3149,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
continue;
rcu_read_unlock();
- cpuset_hotplug_update_tasks(cs);
+ cpuset_hotplug_update_tasks(cs, ptmp);
rcu_read_lock();
css_put(&cs->css);
@@ -2356,6 +3162,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
force_rebuild = false;
rebuild_sched_domains();
}
+
+ free_cpumasks(NULL, ptmp);
}
void cpuset_update_active_cpus(void)
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
index 9caeda610249..5f1b87330bee 100644
--- a/kernel/cgroup/debug.c
+++ b/kernel/cgroup/debug.c
@@ -373,11 +373,9 @@ struct cgroup_subsys debug_cgrp_subsys = {
* On v2, debug is an implicit controller enabled by "cgroup_debug" boot
* parameter.
*/
-static int __init enable_cgroup_debug(char *str)
+void __init enable_debug_cgroup(void)
{
debug_cgrp_subsys.dfl_cftypes = debug_files;
debug_cgrp_subsys.implicit_on_dfl = true;
debug_cgrp_subsys.threaded = true;
- return 1;
}
-__setup("cgroup_debug", enable_cgroup_debug);