Diffstat (limited to 'kernel/cgroup')
-rw-r--r--  kernel/cgroup/cgroup-internal.h |   1
-rw-r--r--  kernel/cgroup/cgroup-v1.c       |   9
-rw-r--r--  kernel/cgroup/cgroup.c          |  38
-rw-r--r--  kernel/cgroup/cpuset-internal.h |   1
-rw-r--r--  kernel/cgroup/cpuset-v1.c       |  49
-rw-r--r--  kernel/cgroup/cpuset.c          | 480
-rw-r--r--  kernel/cgroup/legacy_freezer.c  |   6
-rw-r--r--  kernel/cgroup/misc.c            |  16
-rw-r--r--  kernel/cgroup/rstat.c           | 117
9 files changed, 384 insertions(+), 333 deletions(-)
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index c964dd7ff967..95ab39e1ec8f 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -168,6 +168,7 @@ struct cgroup_mgctx {
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
+extern bool cgrp_dfl_visible;
/* iterate across the hierarchies */
#define for_each_root(root) \
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index e28d5f0d20ed..fa24c032ed6f 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -673,6 +673,7 @@ struct cftype cgroup1_base_files[] = {
int proc_cgroupstats_show(struct seq_file *m, void *v)
{
struct cgroup_subsys *ss;
+ bool cgrp_v1_visible = false;
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
@@ -684,12 +685,18 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
for_each_subsys(ss, i) {
if (cgroup1_subsys_absent(ss))
continue;
+ cgrp_v1_visible |= ss->root != &cgrp_dfl_root;
+
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
}
+ if (cgrp_dfl_visible && !cgrp_v1_visible)
+ pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n");
+
+
return 0;
}
@@ -844,7 +851,7 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
if (kernfs_type(kn) != KERNFS_DIR)
return -ENOTDIR;
- if (kn->parent != new_parent)
+ if (rcu_access_pointer(kn->__parent) != new_parent)
return -EIO;
/*
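The /proc/cgroups hint added above tells users to read cgroup.controllers of the root cgroup for v2 information. A minimal userspace sketch of that read, assuming cgroup2 is mounted at /sys/fs/cgroup (the mount point is an assumption, not part of the patch):

/*
 * Illustrative only: list the v2 controllers the new message points at.
 * Assumes cgroup2 is mounted at /sys/fs/cgroup.
 */
#include <stdio.h>

int main(void)
{
        char line[4096];
        FILE *f = fopen("/sys/fs/cgroup/cgroup.controllers", "r");

        if (!f) {
                perror("cgroup.controllers");
                return 1;
        }
        if (fgets(line, sizeof(line), f))
                printf("v2 controllers: %s", line);     /* space-separated list */
        fclose(f);
        return 0;
}

/proc/cgroups itself keeps listing only the v1 side, which is exactly what the pr_info_once above warns about.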
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index afc665b7b1fe..3caf2cd86e65 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
* The default hierarchy always exists but is hidden until mounted for the
* first time. This is for backward compatibility.
*/
-static bool cgrp_dfl_visible;
+bool cgrp_dfl_visible;
/* some controllers are not supported in the default hierarchy */
static u16 cgrp_dfl_inhibit_ss_mask;
@@ -633,9 +633,22 @@ int cgroup_task_count(const struct cgroup *cgrp)
return count;
}
+static struct cgroup *kn_priv(struct kernfs_node *kn)
+{
+ struct kernfs_node *parent;
+ /*
+ * The parent can not be replaced due to KERNFS_ROOT_INVARIANT_PARENT.
+ * Therefore it is always safe to dereference this pointer outside of a
+ * RCU section.
+ */
+ parent = rcu_dereference_check(kn->__parent,
+ kernfs_root_flags(kn) & KERNFS_ROOT_INVARIANT_PARENT);
+ return parent->priv;
+}
+
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
- struct cgroup *cgrp = of->kn->parent->priv;
+ struct cgroup *cgrp = kn_priv(of->kn);
struct cftype *cft = of_cft(of);
/*
@@ -1612,7 +1625,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn)
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
- cgrp = kn->parent->priv;
+ cgrp = kn_priv(kn);
cgroup_unlock();
@@ -1644,7 +1657,7 @@ struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
- cgrp = kn->parent->priv;
+ cgrp = kn_priv(kn);
/*
* We're gonna grab cgroup_mutex which nests outside kernfs
@@ -1682,7 +1695,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
cfile->kn = NULL;
spin_unlock_irq(&cgroup_file_kn_lock);
- del_timer_sync(&cfile->notify_timer);
+ timer_delete_sync(&cfile->notify_timer);
}
kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
@@ -2118,7 +2131,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED |
KERNFS_ROOT_SUPPORT_EXPORTOP |
- KERNFS_ROOT_SUPPORT_USER_XATTR,
+ KERNFS_ROOT_SUPPORT_USER_XATTR |
+ KERNFS_ROOT_INVARIANT_PARENT,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
@@ -4115,7 +4129,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cgroup_file_ctx *ctx = of->priv;
- struct cgroup *cgrp = of->kn->parent->priv;
+ struct cgroup *cgrp = kn_priv(of->kn);
struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
int ret;
@@ -4447,7 +4461,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
-static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
@@ -5831,7 +5845,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
}
/*
- * This extra ref will be put in cgroup_free_fn() and guarantees
+ * This extra ref will be put in css_free_rwork_fn() and guarantees
* that @cgrp->kn is always accessible.
*/
kernfs_get(cgrp->kn);
@@ -5909,6 +5923,12 @@ static void kill_css(struct cgroup_subsys_state *css)
if (css->flags & CSS_DYING)
return;
+ /*
+ * Call css_killed(), if defined, before setting the CSS_DYING flag
+ */
+ if (css->ss->css_killed)
+ css->ss->css_killed(css);
+
css->flags |= CSS_DYING;
/*
diff --git a/kernel/cgroup/cpuset-internal.h b/kernel/cgroup/cpuset-internal.h
index 976a8bc3ff60..383963e28ac6 100644
--- a/kernel/cgroup/cpuset-internal.h
+++ b/kernel/cgroup/cpuset-internal.h
@@ -33,6 +33,7 @@ enum prs_errcode {
PERR_CPUSEMPTY,
PERR_HKEEPING,
PERR_ACCESS,
+ PERR_REMOTE,
};
/* bits in struct cpuset flags field */
diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c
index 25c1d7b77e2f..b69a7db67090 100644
--- a/kernel/cgroup/cpuset-v1.c
+++ b/kernel/cgroup/cpuset-v1.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include "cgroup-internal.h"
#include "cpuset-internal.h"
/*
@@ -175,6 +176,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = update_relax_domain_level(cs, val);
break;
default:
@@ -373,6 +375,46 @@ out:
return ret;
}
+#ifdef CONFIG_PROC_PID_CPUSET
+/*
+ * proc_cpuset_show()
+ * - Print tasks cpuset path into seq_file.
+ * - Used for /proc/<pid>/cpuset.
+ */
+int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
+{
+ char *buf;
+ struct cgroup_subsys_state *css;
+ int retval;
+
+ retval = -ENOMEM;
+ buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!buf)
+ goto out;
+
+ rcu_read_lock();
+ spin_lock_irq(&css_set_lock);
+ css = task_css(tsk, cpuset_cgrp_id);
+ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ spin_unlock_irq(&css_set_lock);
+ rcu_read_unlock();
+
+ if (retval == -E2BIG)
+ retval = -ENAMETOOLONG;
+ if (retval < 0)
+ goto out_free;
+ seq_puts(m, buf);
+ seq_putc(m, '\n');
+ retval = 0;
+out_free:
+ kfree(buf);
+out:
+ return retval;
+}
+#endif /* CONFIG_PROC_PID_CPUSET */
+
static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
struct cpuset *cs = css_cs(css);
@@ -424,24 +466,31 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
break;
case FILE_MEM_EXCLUSIVE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
break;
case FILE_MEM_HARDWALL:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
break;
case FILE_SCHED_LOAD_BALANCE:
+ pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name);
retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
break;
case FILE_MEMORY_MIGRATE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
break;
case FILE_MEMORY_PRESSURE_ENABLED:
+ pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name);
cpuset_memory_pressure_enabled = !!val;
break;
case FILE_SPREAD_PAGE:
+ pr_info_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
break;
case FILE_SPREAD_SLAB:
+ pr_warn_once("cpuset.%s is deprecated\n", cft->name);
retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
break;
default:
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f910c828973..306b60430091 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -21,7 +21,6 @@
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
-#include "cgroup-internal.h"
#include "cpuset-internal.h"
#include <linux/init.h>
@@ -62,10 +61,17 @@ static const char * const perr_strings[] = {
[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
[PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
[PERR_ACCESS] = "Enable partition not permitted",
+ [PERR_REMOTE] = "Have remote partition underneath",
};
/*
- * Exclusive CPUs distributed out to sub-partitions of top_cpuset
+ * For local partitions, update to subpartitions_cpus & isolated_cpus is done
+ * in update_parent_effective_cpumask(). For remote partitions, it is done in
+ * the remote_partition_*() and remote_cpus_update() helpers.
+ */
+/*
+ * Exclusive CPUs distributed out to local or remote sub-partitions of
+ * top_cpuset
*/
static cpumask_var_t subpartitions_cpus;
@@ -87,7 +93,6 @@ static struct list_head remote_children;
* A flag to force sched domain rebuild at the end of an operation.
* It can be set in
* - update_partition_sd_lb()
- * - remote_partition_check()
* - update_cpumasks_hier()
* - cpuset_update_flag()
* - cpuset_hotplug_update_tasks()
@@ -954,10 +959,12 @@ static void dl_update_tasks_root_domain(struct cpuset *cs)
css_task_iter_end(&it);
}
-static void dl_rebuild_rd_accounting(void)
+void dl_rebuild_rd_accounting(void)
{
struct cpuset *cs = NULL;
struct cgroup_subsys_state *pos_css;
+ int cpu;
+ u64 cookie = ++dl_cookie;
lockdep_assert_held(&cpuset_mutex);
lockdep_assert_cpus_held();
@@ -965,11 +972,12 @@ static void dl_rebuild_rd_accounting(void)
rcu_read_lock();
- /*
- * Clear default root domain DL accounting, it will be computed again
- * if a task belongs to it.
- */
- dl_clear_root_domain(&def_root_domain);
+ for_each_possible_cpu(cpu) {
+ if (dl_bw_visited(cpu, cookie))
+ continue;
+
+ dl_clear_root_domain_cpu(cpu);
+ }
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
@@ -990,16 +998,6 @@ static void dl_rebuild_rd_accounting(void)
rcu_read_unlock();
}
-static void
-partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
- struct sched_domain_attr *dattr_new)
-{
- mutex_lock(&sched_domains_mutex);
- partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
- dl_rebuild_rd_accounting();
- mutex_unlock(&sched_domains_mutex);
-}
-
/*
* Rebuild scheduler domains.
*
@@ -1061,7 +1059,7 @@ void rebuild_sched_domains_locked(void)
ndoms = generate_sched_domains(&doms, &attr);
/* Have scheduler rebuild the domains */
- partition_and_rebuild_sched_domains(ndoms, doms, attr);
+ partition_sched_domains(ndoms, doms, attr);
}
#else /* !CONFIG_SMP */
void rebuild_sched_domains_locked(void)
@@ -1083,6 +1081,13 @@ void rebuild_sched_domains(void)
cpus_read_unlock();
}
+void cpuset_reset_sched_domains(void)
+{
+ mutex_lock(&cpuset_mutex);
+ partition_sched_domains(1, NULL, NULL);
+ mutex_unlock(&cpuset_mutex);
+}
+
/**
* cpuset_update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -1090,9 +1095,14 @@ void rebuild_sched_domains(void)
*
* Iterate through each task of @cs updating its cpus_allowed to the
* effective cpuset's. As this function is called with cpuset_mutex held,
- * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
- * is used instead of effective_cpus to make sure all offline CPUs are also
- * included as hotplug code won't update cpumasks for tasks in top_cpuset.
+ * cpuset membership stays stable.
+ *
+ * For top_cpuset, task_cpu_possible_mask() is used instead of effective_cpus
+ * to make sure all offline CPUs are also included as hotplug code won't
+ * update cpumasks for tasks in top_cpuset.
+ *
+ * As task_cpu_possible_mask() can be task dependent in arm64, we have to
+ * do cpu masking per task instead of doing it once for all.
*/
void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
@@ -1152,7 +1162,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
*
* Return: 0 if successful, an error code otherwise
*/
-static int update_partition_exclusive(struct cpuset *cs, int new_prs)
+static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs)
{
bool exclusive = (new_prs > PRS_MEMBER);
@@ -1235,12 +1245,12 @@ static void reset_partition_data(struct cpuset *cs)
}
/*
- * partition_xcpus_newstate - Exclusive CPUs state change
+ * isolated_cpus_update - Update the isolated_cpus mask
* @old_prs: old partition_root_state
* @new_prs: new partition_root_state
* @xcpus: exclusive CPUs with state change
*/
-static void partition_xcpus_newstate(int old_prs, int new_prs, struct cpumask *xcpus)
+static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
{
WARN_ON_ONCE(old_prs == new_prs);
if (new_prs == PRS_ISOLATED)
@@ -1274,8 +1284,8 @@ static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
isolcpus_updated = (new_prs != parent->partition_root_state);
if (isolcpus_updated)
- partition_xcpus_newstate(parent->partition_root_state, new_prs,
- xcpus);
+ isolated_cpus_update(parent->partition_root_state, new_prs,
+ xcpus);
cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
return isolcpus_updated;
@@ -1305,8 +1315,8 @@ static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
isolcpus_updated = (old_prs != parent->partition_root_state);
if (isolcpus_updated)
- partition_xcpus_newstate(old_prs, parent->partition_root_state,
- xcpus);
+ isolated_cpus_update(old_prs, parent->partition_root_state,
+ xcpus);
cpumask_and(xcpus, xcpus, cpu_active_mask);
cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
@@ -1341,20 +1351,57 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
* compute_effective_exclusive_cpumask - compute effective exclusive CPUs
* @cs: cpuset
* @xcpus: effective exclusive CPUs value to be set
- * Return: true if xcpus is not empty, false otherwise.
+ * @real_cs: the real cpuset (can be NULL)
+ * Return: 0 if there is no sibling conflict, > 0 otherwise
*
- * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
- * it must be a subset of parent's effective_xcpus.
+ * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to
+ * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus
+ * as well. The provision of real_cs means that a cpumask is being changed and
+ * the given cs is a trial one.
*/
-static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
- struct cpumask *xcpus)
+static int compute_effective_exclusive_cpumask(struct cpuset *cs,
+ struct cpumask *xcpus,
+ struct cpuset *real_cs)
{
+ struct cgroup_subsys_state *css;
struct cpuset *parent = parent_cs(cs);
+ struct cpuset *sibling;
+ int retval = 0;
if (!xcpus)
xcpus = cs->effective_xcpus;
- return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
+ cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
+
+ if (!real_cs) {
+ if (!cpumask_empty(cs->exclusive_cpus))
+ return 0;
+ } else {
+ cs = real_cs;
+ }
+
+ /*
+ * Exclude exclusive CPUs from siblings
+ */
+ rcu_read_lock();
+ cpuset_for_each_child(sibling, css, parent) {
+ if (sibling == cs)
+ continue;
+
+ if (!cpumask_empty(sibling->exclusive_cpus) &&
+ cpumask_intersects(xcpus, sibling->exclusive_cpus)) {
+ cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus);
+ retval++;
+ continue;
+ }
+ if (!cpumask_empty(sibling->effective_xcpus) &&
+ cpumask_intersects(xcpus, sibling->effective_xcpus)) {
+ cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus);
+ retval++;
+ }
+ }
+ rcu_read_unlock();
+ return retval;
}
static inline bool is_remote_partition(struct cpuset *cs)
@@ -1396,7 +1443,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
* remote partition root underneath it, its exclusive_cpus must
* have overlapped with subpartitions_cpus.
*/
- compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
+ compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL);
if (cpumask_empty(tmp->new_cpus) ||
cpumask_intersects(tmp->new_cpus, subpartitions_cpus) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
@@ -1405,8 +1452,11 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
spin_lock_irq(&callback_lock);
isolcpus_updated = partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
list_add(&cs->remote_sibling, &remote_children);
+ cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
+ cpuset_force_rebuild();
+ cs->prs_err = 0;
/*
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1429,20 +1479,24 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{
bool isolcpus_updated;
- compute_effective_exclusive_cpumask(cs, tmp->new_cpus);
WARN_ON_ONCE(!is_remote_partition(cs));
- WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus));
+ WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
spin_lock_irq(&callback_lock);
list_del_init(&cs->remote_sibling);
isolcpus_updated = partition_xcpus_del(cs->partition_root_state,
- NULL, tmp->new_cpus);
- cs->partition_root_state = -cs->partition_root_state;
- if (!cs->prs_err)
- cs->prs_err = PERR_INVCPUS;
+ NULL, cs->effective_xcpus);
+ if (cs->prs_err)
+ cs->partition_root_state = -cs->partition_root_state;
+ else
+ cs->partition_root_state = PRS_MEMBER;
+
+ /* effective_xcpus may need to be changed */
+ compute_effective_exclusive_cpumask(cs, NULL, NULL);
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
+ cpuset_force_rebuild();
/*
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1454,14 +1508,15 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
/*
* remote_cpus_update - cpus_exclusive change of remote partition
* @cs: the cpuset to be updated
- * @newmask: the new effective_xcpus mask
+ * @xcpus: the new exclusive_cpus mask, if non-NULL
+ * @excpus: the new effective_xcpus mask
* @tmp: temporary masks
*
* top_cpuset and subpartitions_cpus will be updated or partition can be
* invalidated.
*/
-static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
- struct tmpmasks *tmp)
+static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
+ struct cpumask *excpus, struct tmpmasks *tmp)
{
bool adding, deleting;
int prs = cs->partition_root_state;
@@ -1472,29 +1527,45 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
- if (cpumask_empty(newmask))
+ if (cpumask_empty(excpus)) {
+ cs->prs_err = PERR_CPUSEMPTY;
goto invalidate;
+ }
- adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus);
- deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask);
+ adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus);
+ deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus);
/*
* Additions of remote CPUs is only allowed if those CPUs are
* not allocated to other partitions and there are effective_cpus
* left in the top cpuset.
*/
- if (adding && (!capable(CAP_SYS_ADMIN) ||
- cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
- cpumask_subset(top_cpuset.effective_cpus, tmp->addmask)))
- goto invalidate;
+ if (adding) {
+ if (!capable(CAP_SYS_ADMIN))
+ cs->prs_err = PERR_ACCESS;
+ else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
+ cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))
+ cs->prs_err = PERR_NOCPUS;
+ if (cs->prs_err)
+ goto invalidate;
+ }
spin_lock_irq(&callback_lock);
if (adding)
isolcpus_updated += partition_xcpus_add(prs, NULL, tmp->addmask);
if (deleting)
isolcpus_updated += partition_xcpus_del(prs, NULL, tmp->delmask);
+ /*
+ * Need to update effective_xcpus and exclusive_cpus now as
+ * update_sibling_cpumasks() below may iterate back to the same cs.
+ */
+ cpumask_copy(cs->effective_xcpus, excpus);
+ if (xcpus)
+ cpumask_copy(cs->exclusive_cpus, xcpus);
spin_unlock_irq(&callback_lock);
update_unbound_workqueue_cpumask(isolcpus_updated);
+ if (adding || deleting)
+ cpuset_force_rebuild();
/*
* Propagate changes in top_cpuset's effective_cpus down the hierarchy.
@@ -1508,47 +1579,6 @@ invalidate:
}
/*
- * remote_partition_check - check if a child remote partition needs update
- * @cs: the cpuset to be updated
- * @newmask: the new effective_xcpus mask
- * @delmask: temporary mask for deletion (not in tmp)
- * @tmp: temporary masks
- *
- * This should be called before the given cs has updated its cpus_allowed
- * and/or effective_xcpus.
- */
-static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
- struct cpumask *delmask, struct tmpmasks *tmp)
-{
- struct cpuset *child, *next;
- int disable_cnt = 0;
-
- /*
- * Compute the effective exclusive CPUs that will be deleted.
- */
- if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) ||
- !cpumask_intersects(delmask, subpartitions_cpus))
- return; /* No deletion of exclusive CPUs in partitions */
-
- /*
- * Searching the remote children list to look for those that will
- * be impacted by the deletion of exclusive CPUs.
- *
- * Since a cpuset must be removed from the remote children list
- * before it can go offline and holding cpuset_mutex will prevent
- * any change in cpuset status. RCU read lock isn't needed.
- */
- lockdep_assert_held(&cpuset_mutex);
- list_for_each_entry_safe(child, next, &remote_children, remote_sibling)
- if (cpumask_intersects(child->effective_cpus, delmask)) {
- remote_partition_disable(child, tmp);
- disable_cnt++;
- }
- if (disable_cnt)
- cpuset_force_rebuild();
-}
-
-/*
* prstate_housekeeping_conflict - check for partition & housekeeping conflicts
* @prstate: partition root state to be checked
* @new_cpus: cpu mask
@@ -1602,7 +1632,7 @@ static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
* The partcmd_update command is used by update_cpumasks_hier() with newmask
* NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
* by update_cpumask() with NULL newmask. In both cases, the callers won't
- * check for error and so partition_root_state and prs_error will be updated
+ * check for error and so partition_root_state and prs_err will be updated
* directly.
*/
static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
@@ -1615,11 +1645,12 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
int old_prs, new_prs;
int part_error = PERR_NONE; /* Partition error? */
int subparts_delta = 0;
- struct cpumask *xcpus; /* cs effective_xcpus */
int isolcpus_updated = 0;
+ struct cpumask *xcpus = user_xcpus(cs);
bool nocpu;
lockdep_assert_held(&cpuset_mutex);
+ WARN_ON_ONCE(is_remote_partition(cs));
/*
* new_prs will only be changed for the partcmd_update and
@@ -1627,7 +1658,6 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
*/
adding = deleting = false;
old_prs = new_prs = cs->partition_root_state;
- xcpus = user_xcpus(cs);
if (cmd == partcmd_invalidate) {
if (is_prs_invalid(old_prs))
@@ -1662,12 +1692,19 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
/*
+ * Need to call compute_effective_exclusive_cpumask() in case
+ * exclusive_cpus not set. Sibling conflict should only happen
+ * if exclusive_cpus isn't set.
+ */
+ xcpus = tmp->new_cpus;
+ if (compute_effective_exclusive_cpumask(cs, xcpus, NULL))
+ WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
+
+ /*
* Enabling partition root is not allowed if its
- * effective_xcpus is empty or doesn't overlap with
- * parent's effective_xcpus.
+ * effective_xcpus is empty.
*/
- if (cpumask_empty(xcpus) ||
- !cpumask_intersects(xcpus, parent->effective_xcpus))
+ if (cpumask_empty(xcpus))
return PERR_INVCPUS;
if (prstate_housekeeping_conflict(new_prs, xcpus))
@@ -1680,19 +1717,22 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
if (nocpu)
return PERR_NOCPUS;
- cpumask_copy(tmp->delmask, xcpus);
- deleting = true;
- subparts_delta++;
+ deleting = cpumask_and(tmp->delmask, xcpus, parent->effective_xcpus);
+ if (deleting)
+ subparts_delta++;
new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
} else if (cmd == partcmd_disable) {
/*
- * May need to add cpus to parent's effective_cpus for
- * valid partition root.
+ * May need to add cpus back to parent's effective_cpus
+ * (and maybe removed from subpartitions_cpus/isolated_cpus)
+ * for valid partition root. xcpus may contain CPUs that
+ * shouldn't be removed from the two global cpumasks.
*/
- adding = !is_prs_invalid(old_prs) &&
- cpumask_and(tmp->addmask, xcpus, parent->effective_xcpus);
- if (adding)
+ if (is_partition_valid(cs)) {
+ cpumask_copy(tmp->addmask, cs->effective_xcpus);
+ adding = true;
subparts_delta--;
+ }
new_prs = PRS_MEMBER;
} else if (newmask) {
/*
@@ -1702,6 +1742,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
part_error = PERR_CPUSEMPTY;
goto write_error;
}
+
/* Check newmask again, whether cpus are available for parent/cs */
nocpu |= tasks_nocpu_error(parent, cs, newmask);
@@ -1830,7 +1871,7 @@ write_error:
* CPU lists in cs haven't been updated yet. So defer it to later.
*/
if ((old_prs != new_prs) && (cmd != partcmd_update)) {
- int err = update_partition_exclusive(cs, new_prs);
+ int err = update_partition_exclusive_flag(cs, new_prs);
if (err)
return err;
@@ -1868,7 +1909,7 @@ write_error:
update_unbound_workqueue_cpumask(isolcpus_updated);
if ((old_prs != new_prs) && (cmd == partcmd_update))
- update_partition_exclusive(cs, new_prs);
+ update_partition_exclusive_flag(cs, new_prs);
if (adding || deleting) {
cpuset_update_tasks_cpumask(parent, tmp->addmask);
@@ -1918,7 +1959,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
* 2) All the effective_cpus will be used up and cp
* has tasks
*/
- compute_effective_exclusive_cpumask(cs, new_ecpus);
+ compute_effective_exclusive_cpumask(cs, new_ecpus, NULL);
cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
rcu_read_lock();
@@ -1926,6 +1967,11 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
if (!is_partition_valid(child))
continue;
+ /*
+ * There shouldn't be a remote partition underneath another
+ * partition root.
+ */
+ WARN_ON_ONCE(is_remote_partition(child));
child->prs_err = 0;
if (!cpumask_subset(child->effective_xcpus,
cs->effective_xcpus))
@@ -1981,32 +2027,39 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
bool remote = is_remote_partition(cp);
bool update_parent = false;
+ old_prs = new_prs = cp->partition_root_state;
+
/*
- * Skip descendent remote partition that acquires CPUs
- * directly from top cpuset unless it is cs.
+ * For child remote partition root (!= cs), we need to call
+ * remote_cpus_update() if effective_xcpus will be changed.
+ * Otherwise, we can skip the whole subtree.
+ *
+ * remote_cpus_update() will reuse tmp->new_cpus only after
+ * its value is being processed.
*/
if (remote && (cp != cs)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL);
+ if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ rcu_read_unlock();
+ remote_cpus_update(cp, NULL, tmp->new_cpus, tmp);
+ rcu_read_lock();
- /*
- * Update effective_xcpus if exclusive_cpus set.
- * The case when exclusive_cpus isn't set is handled later.
- */
- if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) {
- spin_lock_irq(&callback_lock);
- compute_effective_exclusive_cpumask(cp, NULL);
- spin_unlock_irq(&callback_lock);
+ /* Remote partition may be invalidated */
+ new_prs = cp->partition_root_state;
+ remote = (new_prs == old_prs);
}
- old_prs = new_prs = cp->partition_root_state;
- if (remote || (is_partition_valid(parent) &&
- is_partition_valid(cp)))
+ if (remote || (is_partition_valid(parent) && is_partition_valid(cp)))
compute_partition_effective_cpumask(cp, tmp->new_cpus);
else
compute_effective_cpumask(tmp->new_cpus, cp, parent);
+ if (remote)
+ goto get_css; /* Ready to update cpuset data */
+
/*
* A partition with no effective_cpus is allowed as long as
* there is no task associated with it. Call
@@ -2026,9 +2079,6 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))
cpumask_copy(tmp->new_cpus, parent->effective_cpus);
- if (remote)
- goto get_css;
-
/*
* Skip the whole subtree if
* 1) the cpumask remains the same,
@@ -2089,6 +2139,9 @@ get_css:
spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cp->partition_root_state = new_prs;
+ if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs))
+ compute_effective_exclusive_cpumask(cp, NULL, NULL);
+
/*
* Make sure effective_xcpus is properly set for a valid
* partition root.
@@ -2175,7 +2228,14 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
parent);
if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
continue;
+ } else if (is_remote_partition(sibling)) {
+ /*
+ * Change in a sibling cpuset won't affect a remote
+ * partition root.
+ */
+ continue;
}
+
if (!css_tryget_online(&sibling->css))
continue;
@@ -2232,8 +2292,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* trialcs->effective_xcpus is used as a temporary cpumask
* for checking validity of the partition root.
*/
+ trialcs->partition_root_state = PRS_MEMBER;
if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs))
- compute_effective_exclusive_cpumask(trialcs, NULL);
+ compute_effective_exclusive_cpumask(trialcs, NULL, cs);
}
/* Nothing to do if the cpus didn't change */
@@ -2306,19 +2367,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Call remote_cpus_update() to handle valid remote partition
*/
if (is_remote_partition(cs))
- remote_cpus_update(cs, xcpus, &tmp);
+ remote_cpus_update(cs, NULL, xcpus, &tmp);
else if (invalidate)
update_parent_effective_cpumask(cs, partcmd_invalidate,
NULL, &tmp);
else
update_parent_effective_cpumask(cs, partcmd_update,
xcpus, &tmp);
- } else if (!cpumask_empty(cs->exclusive_cpus)) {
- /*
- * Use trialcs->effective_cpus as a temp cpumask
- */
- remote_partition_check(cs, trialcs->effective_xcpus,
- trialcs->effective_cpus, &tmp);
}
spin_lock_irq(&callback_lock);
@@ -2370,8 +2425,15 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
return 0;
- if (*buf)
- compute_effective_exclusive_cpumask(trialcs, NULL);
+ if (*buf) {
+ trialcs->partition_root_state = PRS_MEMBER;
+ /*
+ * Reject the change if there is exclusive CPUs conflict with
+ * the siblings.
+ */
+ if (compute_effective_exclusive_cpumask(trialcs, NULL, cs))
+ return -EINVAL;
+ }
/*
* Check all the descendants in update_cpumasks_hier() if
@@ -2402,8 +2464,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (invalidate)
remote_partition_disable(cs, &tmp);
else
- remote_cpus_update(cs, trialcs->effective_xcpus,
- &tmp);
+ remote_cpus_update(cs, trialcs->exclusive_cpus,
+ trialcs->effective_xcpus, &tmp);
} else if (invalidate) {
update_parent_effective_cpumask(cs, partcmd_invalidate,
NULL, &tmp);
@@ -2411,12 +2473,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
update_parent_effective_cpumask(cs, partcmd_update,
trialcs->effective_xcpus, &tmp);
}
- } else if (!cpumask_empty(trialcs->exclusive_cpus)) {
- /*
- * Use trialcs->effective_cpus as a temp cpumask
- */
- remote_partition_check(cs, trialcs->effective_xcpus,
- trialcs->effective_cpus, &tmp);
}
spin_lock_irq(&callback_lock);
cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
@@ -2783,7 +2839,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
int err = PERR_NONE, old_prs = cs->partition_root_state;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
- bool new_xcpus_state = false;
+ bool isolcpus_updated = false;
if (old_prs == new_prs)
return 0;
@@ -2797,18 +2853,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
- /*
- * Setup effective_xcpus if not properly set yet, it will be cleared
- * later if partition becomes invalid.
- */
- if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) {
- spin_lock_irq(&callback_lock);
- cpumask_and(cs->effective_xcpus,
- cs->cpus_allowed, parent->effective_xcpus);
- spin_unlock_irq(&callback_lock);
- }
-
- err = update_partition_exclusive(cs, new_prs);
+ err = update_partition_exclusive_flag(cs, new_prs);
if (err)
goto out;
@@ -2822,6 +2867,19 @@ static int update_prstate(struct cpuset *cs, int new_prs)
}
/*
+ * We don't support the creation of a new local partition with
+ * a remote partition underneath it. This unsupported
+ * setting can happen only if parent is the top_cpuset because
+ * a remote partition cannot be created underneath an existing
+ * local or remote partition.
+ */
+ if ((parent == &top_cpuset) &&
+ cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) {
+ err = PERR_REMOTE;
+ goto out;
+ }
+
+ /*
* If parent is valid partition, enable local partition.
* Otherwise, enable a remote partition.
*/
@@ -2836,8 +2894,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
} else if (old_prs && new_prs) {
/*
* A change in load balance state only, no change in cpumasks.
+ * Need to update isolated_cpus.
*/
- new_xcpus_state = true;
+ isolcpus_updated = true;
} else {
/*
* Switching back to member is always allowed even if it
@@ -2861,7 +2920,7 @@ out:
*/
if (err) {
new_prs = -new_prs;
- update_partition_exclusive(cs, new_prs);
+ update_partition_exclusive_flag(cs, new_prs);
}
spin_lock_irq(&callback_lock);
@@ -2869,14 +2928,18 @@ out:
WRITE_ONCE(cs->prs_err, err);
if (!is_partition_valid(cs))
reset_partition_data(cs);
- else if (new_xcpus_state)
- partition_xcpus_newstate(old_prs, new_prs, cs->effective_xcpus);
+ else if (isolcpus_updated)
+ isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
- update_unbound_workqueue_cpumask(new_xcpus_state);
+ update_unbound_workqueue_cpumask(isolcpus_updated);
- /* Force update if switching back to member */
+ /* Force update if switching back to member & update effective_xcpus */
update_cpumasks_hier(cs, &tmpmask, !new_prs);
+ /* A newly created partition must have effective_xcpus set */
+ WARN_ON_ONCE(!old_prs && (new_prs > 0)
+ && cpumask_empty(cs->effective_xcpus));
+
/* Update sched domains and load balance flag */
update_partition_sd_lb(cs, old_prs);
@@ -3209,7 +3272,7 @@ int cpuset_common_seq_show(struct seq_file *sf, void *v)
return ret;
}
-static int sched_partition_show(struct seq_file *seq, void *v)
+static int cpuset_partition_show(struct seq_file *seq, void *v)
{
struct cpuset *cs = css_cs(seq_css(seq));
const char *err, *type = NULL;
@@ -3240,7 +3303,7 @@ static int sched_partition_show(struct seq_file *seq, void *v)
return 0;
}
-static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
+static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cpuset *cs = css_cs(of_css(of));
@@ -3261,11 +3324,8 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
css_get(&cs->css);
cpus_read_lock();
mutex_lock(&cpuset_mutex);
- if (!is_cpuset_online(cs))
- goto out_unlock;
-
- retval = update_prstate(cs, val);
-out_unlock:
+ if (is_cpuset_online(cs))
+ retval = update_prstate(cs, val);
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
css_put(&cs->css);
@@ -3309,8 +3369,8 @@ static struct cftype dfl_files[] = {
{
.name = "cpus.partition",
- .seq_show = sched_partition_show,
- .write = sched_partition_write,
+ .seq_show = cpuset_partition_show,
+ .write = cpuset_partition_write,
.private = FILE_PARTITION_ROOT,
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct cpuset, partition_file),
@@ -3476,9 +3536,6 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpus_read_lock();
mutex_lock(&cpuset_mutex);
- if (is_partition_valid(cs))
- update_prstate(cs, 0);
-
if (!cpuset_v2() && is_sched_load_balance(cs))
cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
@@ -3489,6 +3546,22 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpus_read_unlock();
}
+static void cpuset_css_killed(struct cgroup_subsys_state *css)
+{
+ struct cpuset *cs = css_cs(css);
+
+ cpus_read_lock();
+ mutex_lock(&cpuset_mutex);
+
+ /* Reset valid partition back to member */
+ if (is_partition_valid(cs))
+ update_prstate(cs, PRS_MEMBER);
+
+ mutex_unlock(&cpuset_mutex);
+ cpus_read_unlock();
+
+}
+
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
@@ -3610,6 +3683,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
.css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
.css_offline = cpuset_css_offline,
+ .css_killed = cpuset_css_killed,
.css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
@@ -3740,10 +3814,10 @@ retry:
if (remote && cpumask_empty(&new_cpus) &&
partition_is_populated(cs, NULL)) {
+ cs->prs_err = PERR_HOTPLUG;
remote_partition_disable(cs, tmp);
compute_effective_cpumask(&new_cpus, cs, parent);
remote = false;
- cpuset_force_rebuild();
}
/*
@@ -4244,50 +4318,6 @@ void cpuset_print_current_mems_allowed(void)
rcu_read_unlock();
}
-#ifdef CONFIG_PROC_PID_CPUSET
-/*
- * proc_cpuset_show()
- * - Print tasks cpuset path into seq_file.
- * - Used for /proc/<pid>/cpuset.
- * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
- * doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_mutex, keeping cpuset_attach() from changing it
- * anyway.
- */
-int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
- struct pid *pid, struct task_struct *tsk)
-{
- char *buf;
- struct cgroup_subsys_state *css;
- int retval;
-
- retval = -ENOMEM;
- buf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!buf)
- goto out;
-
- rcu_read_lock();
- spin_lock_irq(&css_set_lock);
- css = task_css(tsk, cpuset_cgrp_id);
- retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
- current->nsproxy->cgroup_ns);
- spin_unlock_irq(&css_set_lock);
- rcu_read_unlock();
-
- if (retval == -E2BIG)
- retval = -ENAMETOOLONG;
- if (retval < 0)
- goto out_free;
- seq_puts(m, buf);
- seq_putc(m, '\n');
- retval = 0;
-out_free:
- kfree(buf);
-out:
- return retval;
-}
-#endif /* CONFIG_PROC_PID_CPUSET */
-
/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
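Much of the cpuset.c rework above changes how cpuset.cpus.partition requests are validated and how failures are reported through prs_err (PERR_REMOTE, sibling exclusive-CPU conflicts, hotplug handling). A hedged sketch of driving that interface from userspace, assuming a v2 cgroup at /sys/fs/cgroup/test whose cpuset.cpus has already been populated; the path and prior setup are assumptions:

/*
 * Illustrative only: exercise cpuset.cpus.partition, whose write path this
 * patch reworks. Assumes /sys/fs/cgroup/test exists with the cpuset
 * controller enabled and cpuset.cpus already set (path and setup assumed).
 */
#include <stdio.h>

static const char *pfile = "/sys/fs/cgroup/test/cpuset.cpus.partition";

int main(void)
{
        char state[256];
        FILE *f = fopen(pfile, "w");

        if (!f || fputs("root", f) == EOF) {    /* "member", "root" or "isolated" */
                perror(pfile);
                return 1;
        }
        if (fclose(f)) {        /* sysfs may reject the request at flush time */
                perror(pfile);
                return 1;
        }

        f = fopen(pfile, "r");
        if (f && fgets(state, sizeof(state), f))
                printf("partition state: %s", state);   /* e.g. "root invalid (...)" */
        if (f)
                fclose(f);
        return 0;
}

Depending on which check fails, the kernel either rejects the write outright or accepts it and marks the partition invalid; reading the file back then shows the prs_err reason string in parentheses.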
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 074653f964c1..039d1eb2f215 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -430,9 +430,11 @@ static ssize_t freezer_write(struct kernfs_open_file *of,
if (strcmp(buf, freezer_state_strs(0)) == 0)
freeze = false;
- else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0)
+ else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) {
+ pr_info_once("Freezing with imperfect legacy cgroup freezer. "
+ "See cgroup.freeze of cgroup v2\n");
freeze = true;
- else
+ } else
return -EINVAL;
freezer_change_state(css_freezer(of_css(of)), freeze);
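The message added above steers users toward cgroup.freeze on the v2 hierarchy. A minimal sketch of that replacement, assuming a v2 cgroup at /sys/fs/cgroup/test (an assumed path):

/*
 * Illustrative only: the cgroup v2 replacement named in the message above.
 * Assumes a v2 cgroup at /sys/fs/cgroup/test.
 */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/sys/fs/cgroup/test/cgroup.freeze", "w");

        if (!f)
                return 1;
        fputs("1", f);          /* "1" freezes the subtree, "0" thaws it */
        return fclose(f) ? 1 : 0;
}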
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 0e26068995a6..2fa3a4fb2aaf 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -68,22 +68,6 @@ static inline bool valid_type(enum misc_res_type type)
}
/**
- * misc_cg_res_total_usage() - Get the current total usage of the resource.
- * @type: misc res type.
- *
- * Context: Any context.
- * Return: Current total usage of the resource.
- */
-u64 misc_cg_res_total_usage(enum misc_res_type type)
-{
- if (valid_type(type))
- return atomic64_read(&root_cg.res[type].usage);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
-
-/**
* misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
* @type: Type of the misc res.
* @capacity: Supported capacity of the misc res on the host.
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index aac91466279f..b2239156b7de 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -299,40 +299,6 @@ static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
spin_unlock_irq(&cgroup_rstat_lock);
}
-/* see cgroup_rstat_flush() */
-static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
- __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
-{
- int cpu;
-
- lockdep_assert_held(&cgroup_rstat_lock);
-
- for_each_possible_cpu(cpu) {
- struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu);
-
- for (; pos; pos = pos->rstat_flush_next) {
- struct cgroup_subsys_state *css;
-
- cgroup_base_stat_flush(pos, cpu);
- bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
-
- rcu_read_lock();
- list_for_each_entry_rcu(css, &pos->rstat_css_list,
- rstat_css_node)
- css->ss->css_rstat_flush(css, cpu);
- rcu_read_unlock();
- }
-
- /* play nice and yield if necessary */
- if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) {
- __cgroup_rstat_unlock(cgrp, cpu);
- if (!cond_resched())
- cpu_relax();
- __cgroup_rstat_lock(cgrp, cpu);
- }
- }
-}
-
/**
* cgroup_rstat_flush - flush stats in @cgrp's subtree
* @cgrp: target cgroup
@@ -348,38 +314,31 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp)
*/
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
+ int cpu;
+
might_sleep();
+ for_each_possible_cpu(cpu) {
+ struct cgroup *pos;
- __cgroup_rstat_lock(cgrp, -1);
- cgroup_rstat_flush_locked(cgrp);
- __cgroup_rstat_unlock(cgrp, -1);
-}
+ /* Reacquire for each CPU to avoid disabling IRQs too long */
+ __cgroup_rstat_lock(cgrp, cpu);
+ pos = cgroup_rstat_updated_list(cgrp, cpu);
+ for (; pos; pos = pos->rstat_flush_next) {
+ struct cgroup_subsys_state *css;
-/**
- * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
- * @cgrp: target cgroup
- *
- * Flush stats in @cgrp's subtree and prevent further flushes. Must be
- * paired with cgroup_rstat_flush_release().
- *
- * This function may block.
- */
-void cgroup_rstat_flush_hold(struct cgroup *cgrp)
- __acquires(&cgroup_rstat_lock)
-{
- might_sleep();
- __cgroup_rstat_lock(cgrp, -1);
- cgroup_rstat_flush_locked(cgrp);
-}
+ cgroup_base_stat_flush(pos, cpu);
+ bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
-/**
- * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold()
- * @cgrp: cgroup used by tracepoint
- */
-void cgroup_rstat_flush_release(struct cgroup *cgrp)
- __releases(&cgroup_rstat_lock)
-{
- __cgroup_rstat_unlock(cgrp, -1);
+ rcu_read_lock();
+ list_for_each_entry_rcu(css, &pos->rstat_css_list,
+ rstat_css_node)
+ css->ss->css_rstat_flush(css, cpu);
+ rcu_read_unlock();
+ }
+ __cgroup_rstat_unlock(cgrp, cpu);
+ if (!cond_resched())
+ cpu_relax();
+ }
}
int cgroup_rstat_init(struct cgroup *cgrp)
@@ -612,36 +571,34 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- u64 usage, utime, stime, ntime;
+ struct cgroup_base_stat bstat;
if (cgroup_parent(cgrp)) {
- cgroup_rstat_flush_hold(cgrp);
- usage = cgrp->bstat.cputime.sum_exec_runtime;
+ cgroup_rstat_flush(cgrp);
+ __cgroup_rstat_lock(cgrp, -1);
+ bstat = cgrp->bstat;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
- &utime, &stime);
- ntime = cgrp->bstat.ntime;
- cgroup_rstat_flush_release(cgrp);
+ &bstat.cputime.utime, &bstat.cputime.stime);
+ __cgroup_rstat_unlock(cgrp, -1);
} else {
- /* cgrp->bstat of root is not actually used, reuse it */
- root_cgroup_cputime(&cgrp->bstat);
- usage = cgrp->bstat.cputime.sum_exec_runtime;
- utime = cgrp->bstat.cputime.utime;
- stime = cgrp->bstat.cputime.stime;
- ntime = cgrp->bstat.ntime;
+ root_cgroup_cputime(&bstat);
}
- do_div(usage, NSEC_PER_USEC);
- do_div(utime, NSEC_PER_USEC);
- do_div(stime, NSEC_PER_USEC);
- do_div(ntime, NSEC_PER_USEC);
+ do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
+ do_div(bstat.cputime.utime, NSEC_PER_USEC);
+ do_div(bstat.cputime.stime, NSEC_PER_USEC);
+ do_div(bstat.ntime, NSEC_PER_USEC);
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n"
"nice_usec %llu\n",
- usage, utime, stime, ntime);
+ bstat.cputime.sum_exec_runtime,
+ bstat.cputime.utime,
+ bstat.cputime.stime,
+ bstat.ntime);
- cgroup_force_idle_show(seq, &cgrp->bstat);
+ cgroup_force_idle_show(seq, &bstat);
}
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
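The rewritten cgroup_base_stat_cputime_show() still emits usage_usec, user_usec, system_usec and nice_usec through cpu.stat; only the flushing and locking around the snapshot changed. A small reader sketch, assuming cgroup2 is mounted at /sys/fs/cgroup:

/*
 * Illustrative only: read back the cpu.stat keys printed by
 * cgroup_base_stat_cputime_show(). Assumes cgroup2 at /sys/fs/cgroup.
 */
#include <stdio.h>

int main(void)
{
        char key[64];
        unsigned long long val;
        FILE *f = fopen("/sys/fs/cgroup/cpu.stat", "r");

        if (!f)
                return 1;
        while (fscanf(f, "%63s %llu", key, &val) == 2)
                printf("%s = %llu\n", key, val);        /* usage_usec, user_usec, ... */
        fclose(f);
        return 0;
}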