summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLi Zefan <lizefan@huawei.com>2014-06-30 11:50:59 +0800
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2014-07-17 16:23:19 -0700
commit3fa0637a2a027db4a0d31314b0e9e6bd5526bb17 (patch)
tree6f7701c65a73348d7e7f18eb9345608842366ebf
parentf5fc1ec5e8d32bfb559a87800633b0a82a37e640 (diff)
downloadlwn-3fa0637a2a027db4a0d31314b0e9e6bd5526bb17.tar.gz
lwn-3fa0637a2a027db4a0d31314b0e9e6bd5526bb17.zip
cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()
commit 3a32bd72d77058d768dbb38183ad517f720dd1bc upstream. We've converted cgroup to kernfs so cgroup won't be intertwined with vfs objects and locking, but there are dark areas. Run two instances of this script concurrently: for ((; ;)) { mount -t cgroup -o cpuacct xxx /cgroup umount /cgroup } After a while, I saw two mount processes were stuck at retrying, because they were waiting for a subsystem to become free, but the root associated with this subsystem never got freed. This can happen, if thread A is in the process of killing superblock but hasn't called percpu_ref_kill(), and at this time thread B is mounting the same cgroup root and finds the root in the root list and performs percpu_ref_try_get(). To fix this, we try to increase both the refcnt of the superblock and the percpu refcnt of cgroup root. v2: - we should try to get both the superblock refcnt and cgroup_root refcnt, because cgroup_root may have no superblock assosiated with it. - adjust/add comments. tj: Updated comments. Renamed @sb to @pinned_sb. Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> [lizf: Backported to 3.15: - Adjust context - s/percpu_tryget_live/atomic_inc_not_zero/] Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--kernel/cgroup.c28
1 files changed, 27 insertions, 1 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 17a0a7ea692e..073226b3dfb0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1484,6 +1484,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
void *data)
{
+ struct super_block *pinned_sb = NULL;
struct cgroup_subsys *ss;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
@@ -1584,10 +1585,25 @@ retry:
* destruction to complete so that the subsystems are free.
* We can use wait_queue for the wait but this path is
* super cold. Let's just sleep for a bit and retry.
+
+ * We want to reuse @root whose lifetime is governed by its
+ * ->cgrp. Let's check whether @root is alive and keep it
+ * that way. As cgroup_kill_sb() can happen anytime, we
+ * want to block it by pinning the sb so that @root doesn't
+ * get killed before mount is complete.
+ *
+ * With the sb pinned, inc_not_zero can reliably indicate
+ * whether @root can be reused. If it's being killed,
+ * drain it. We can use wait_queue for the wait but this
+ * path is super cold. Let's just sleep a bit and retry.
*/
- if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
+ pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
+ if (IS_ERR(pinned_sb) ||
+ !atomic_inc_not_zero(&root->cgrp.refcnt)) {
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
+ if (!IS_ERR_OR_NULL(pinned_sb))
+ deactivate_super(pinned_sb);
msleep(10);
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
@@ -1634,6 +1650,16 @@ out_unlock:
CGROUP_SUPER_MAGIC, &new_sb);
if (IS_ERR(dentry) || !new_sb)
cgroup_put(&root->cgrp);
+
+ /*
+ * If @pinned_sb, we're reusing an existing root and holding an
+ * extra ref on its sb. Mount is complete. Put the extra ref.
+ */
+ if (pinned_sb) {
+ WARN_ON(new_sb);
+ deactivate_super(pinned_sb);
+ }
+
return dentry;
}