diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 318 |
1 files changed, 91 insertions, 227 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f24f724620dd..3070164e2036 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -171,8 +171,8 @@ struct css_id { * The css to which this ID points. This pointer is set to valid value * after cgroup is populated. If cgroup is removed, this will be NULL. * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_is_removed() - * css_tryget() should be used for avoiding race. + * is called after synchronize_rcu(). But for safe use, css_tryget() + * should be used for avoiding race. */ struct cgroup_subsys_state __rcu *css; /* @@ -854,30 +854,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } -/* - * Call subsys's pre_destroy handler. - * This is called before css refcnt check. - */ -static int cgroup_call_pre_destroy(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - int ret = 0; - - for_each_subsys(cgrp->root, ss) { - if (!ss->pre_destroy) - continue; - - ret = ss->pre_destroy(cgrp); - if (ret) { - /* ->pre_destroy() failure is being deprecated */ - WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); - break; - } - } - - return ret; -} - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -1015,33 +991,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) } /* - * A queue for waiters to do rmdir() cgroup. A tasks will sleep when - * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some - * reference to css->refcnt. In general, this refcnt is expected to goes down - * to zero, soon. - * - * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; - */ -static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); - -static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) -{ - if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) - wake_up_all(&cgroup_rmdir_waitq); -} - -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) -{ - css_get(css); -} - -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) -{ - cgroup_wakeup_rmdir_waiter(css->cgroup); - css_put(css); -} - -/* * Call with cgroup_mutex held. Drops reference counts on modules, including * any duplicate ones that parse_cgroupfs_options took. If this function * returns an error, no reference counts are touched. @@ -2025,12 +1974,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } synchronize_rcu(); - - /* - * wake up rmdir() waiter. the rmdir should fail since the cgroup - * is no longer empty. - */ - cgroup_wakeup_rmdir_waiter(cgrp); out: if (retval) { for_each_subsys(root, ss) { @@ -2200,7 +2143,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 5: success! and cleanup */ synchronize_rcu(); - cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; out_put_css_set_refs: if (retval) { @@ -4022,14 +3964,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, cgrp->subsys[ss->subsys_id] = css; /* - * If !clear_css_refs, css holds an extra ref to @cgrp->dentry - * which is put on the last css_put(). dput() requires process - * context, which css_put() may be called without. @css->dput_work - * will be used to invoke dput() asynchronously from css_put(). + * css holds an extra ref to @cgrp->dentry which is put on the last + * css_put(). dput() requires process context, which css_put() may + * be called without. @css->dput_work will be used to invoke + * dput() asynchronously from css_put(). */ INIT_WORK(&css->dput_work, css_dput_fn); - if (ss->__DEPRECATED_clear_css_refs) - set_bit(CSS_CLEAR_CSS_REFS, &css->flags); } /* @@ -4053,6 +3993,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (!cgrp) return -ENOMEM; + /* + * Only live parents can have children. Note that the liveliness + * check isn't strictly necessary because cgroup_mkdir() and + * cgroup_rmdir() are fully synchronized by i_mutex; however, do it + * anyway so that locking is contained inside cgroup proper and we + * don't get nasty surprises if we ever grow another caller. + */ + if (!cgroup_lock_live_group(parent)) { + err = -ENODEV; + goto err_free; + } + /* Grab a reference on the superblock so the hierarchy doesn't * get deleted on unmount if there are child cgroups. This * can be done outside cgroup_mutex, since the sb can't @@ -4060,8 +4012,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * fs */ atomic_inc(&sb->s_active); - mutex_lock(&cgroup_mutex); - init_cgroup_housekeeping(cgrp); cgrp->parent = parent; @@ -4109,10 +4059,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err < 0) goto err_remove; - /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ + /* each css holds a ref to the cgroup's dentry */ for_each_subsys(root, ss) - if (!ss->__DEPRECATED_clear_css_refs) - dget(dentry); + dget(dentry); /* The cgroup directory was pre-locked for us */ BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); @@ -4143,7 +4092,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* Release the reference count that we took on the superblock */ deactivate_super(sb); - +err_free: kfree(cgrp); return err; } @@ -4197,71 +4146,6 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) return 0; } -/* - * Atomically mark all (or else none) of the cgroup's CSS objects as - * CSS_REMOVED. Return true on success, or false if the cgroup has - * busy subsystems. Call with cgroup_mutex held - * - * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or - * not, cgroup removal behaves differently. - * - * If clear is set, css refcnt for the subsystem should be zero before - * cgroup removal can be committed. This is implemented by - * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be - * called multiple times until all css refcnts reach zero and is allowed to - * veto removal on any invocation. This behavior is deprecated and will be - * removed as soon as the existing user (memcg) is updated. - * - * If clear is not set, each css holds an extra reference to the cgroup's - * dentry and cgroup removal proceeds regardless of css refs. - * ->pre_destroy() will be called at least once and is not allowed to fail. - * On the last put of each css, whenever that may be, the extra dentry ref - * is put so that dentry destruction happens only after all css's are - * released. - */ -static int cgroup_clear_css_refs(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - unsigned long flags; - bool failed = false; - - local_irq_save(flags); - - /* - * Block new css_tryget() by deactivating refcnt. If all refcnts - * for subsystems w/ clear_css_refs set were 1 at the moment of - * deactivation, we succeeded. - */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - WARN_ON(atomic_read(&css->refcnt) < 0); - atomic_add(CSS_DEACT_BIAS, &css->refcnt); - - if (ss->__DEPRECATED_clear_css_refs) - failed |= css_refcnt(css) != 1; - } - - /* - * If succeeded, set REMOVED and put all the base refs; otherwise, - * restore refcnts to positive values. Either way, all in-progress - * css_tryget() will be released. - */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - if (!failed) { - set_bit(CSS_REMOVED, &css->flags); - css_put(css); - } else { - atomic_sub(CSS_DEACT_BIAS, &css->refcnt); - } - } - - local_irq_restore(flags); - return !failed; -} - static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cgroup *cgrp = dentry->d_fsdata; @@ -4269,70 +4153,52 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *parent; DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; - int ret; + struct cgroup_subsys *ss; /* the vfs holds both inode->i_mutex already */ -again: mutex_lock(&cgroup_mutex); - if (atomic_read(&cgrp->count) != 0) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - if (!list_empty(&cgrp->children)) { + parent = cgrp->parent; + if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } - mutex_unlock(&cgroup_mutex); /* - * In general, subsystem has no css->refcnt after pre_destroy(). But - * in racy cases, subsystem may have to get css->refcnt after - * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes - * make rmdir return -EBUSY too often. To avoid that, we use waitqueue - * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir - * and subsystem's reference count handling. Please see css_get/put - * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. + * Block new css_tryget() by deactivating refcnt and mark @cgrp + * removed. This makes future css_tryget() and child creation + * attempts fail thus maintaining the removal conditions verified + * above. */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - /* - * Call pre_destroy handlers of subsys. Notify subsystems - * that rmdir() request comes. - */ - ret = cgroup_call_pre_destroy(cgrp); - if (ret) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - return ret; + WARN_ON(atomic_read(&css->refcnt) < 0); + atomic_add(CSS_DEACT_BIAS, &css->refcnt); } + set_bit(CGRP_REMOVED, &cgrp->flags); + /* + * Tell subsystems to initate destruction. pre_destroy() should be + * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix + * potential deadlock in pre_destroy") for details. + */ + mutex_unlock(&cgroup_mutex); + for_each_subsys(cgrp->root, ss) + if (ss->pre_destroy) + ss->pre_destroy(cgrp); mutex_lock(&cgroup_mutex); - parent = cgrp->parent; - if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - if (!cgroup_clear_css_refs(cgrp)) { - mutex_unlock(&cgroup_mutex); - /* - * Because someone may call cgroup_wakeup_rmdir_waiter() before - * prepare_to_wait(), we need to check this flag. - */ - if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) - schedule(); - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - if (signal_pending(current)) - return -EINTR; - goto again; - } - /* NO css_tryget() can success after here. */ - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + + /* + * Put all the base refs. Each css holds an extra reference to the + * cgroup's dentry and cgroup removal proceeds regardless of css + * refs. On the last put of each css, whenever that may be, the + * extra dentry ref is put so that dentry destruction happens only + * after all css's are released. + */ + for_each_subsys(cgrp->root, ss) + css_put(cgrp->subsys[ss->subsys_id]); raw_spin_lock(&release_list_lock); - set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); @@ -4832,44 +4698,19 @@ void cgroup_fork(struct task_struct *child) } /** - * cgroup_fork_callbacks - run fork callbacks - * @child: the new task - * - * Called on a new task very soon before adding it to the - * tasklist. No need to take any locks since no-one can - * be operating on this task. - */ -void cgroup_fork_callbacks(struct task_struct *child) -{ - if (need_forkexit_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - - /* - * forkexit callbacks are only supported for - * builtin subsystems. - */ - if (!ss || ss->module) - continue; - - if (ss->fork) - ss->fork(child); - } - } -} - -/** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question * - * Adds the task to the list running through its css_set if necessary. - * Has to be after the task is visible on the task list in case we race - * with the first call to cgroup_iter_start() - to guarantee that the - * new task ends up on its list. + * Adds the task to the list running through its css_set if necessary and + * call the subsystem fork() callbacks. Has to be after the task is + * visible on the task list in case we race with the first call to + * cgroup_iter_start() - to guarantee that the new task ends up on its + * list. */ void cgroup_post_fork(struct task_struct *child) { + int i; + /* * use_task_css_set_links is set to 1 before we walk the tasklist * under the tasklist_lock and we read it here after we added the child @@ -4889,7 +4730,30 @@ void cgroup_post_fork(struct task_struct *child) task_unlock(child); write_unlock(&css_set_lock); } + + /* + * Call ss->fork(). This must happen after @child is linked on + * css_set; otherwise, @child might change state between ->fork() + * and addition to css_set. + */ + if (need_forkexit_callback) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + + /* + * fork/exit callbacks are supported only for + * builtin subsystems and we don't need further + * synchronization as they never go away. + */ + if (!ss || ss->module) + continue; + + if (ss->fork) + ss->fork(child); + } + } } + /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process @@ -5022,15 +4886,17 @@ static void check_for_release(struct cgroup *cgrp) /* Caller must verify that the css is not for root cgroup */ bool __css_tryget(struct cgroup_subsys_state *css) { - do { - int v = css_refcnt(css); + while (true) { + int t, v; - if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) + v = css_refcnt(css); + t = atomic_cmpxchg(&css->refcnt, v, v + 1); + if (likely(t == v)) return true; + else if (t < 0) + return false; cpu_relax(); - } while (!test_bit(CSS_REMOVED, &css->flags)); - - return false; + } } EXPORT_SYMBOL_GPL(__css_tryget); @@ -5049,11 +4915,9 @@ void __css_put(struct cgroup_subsys_state *css) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - cgroup_wakeup_rmdir_waiter(cgrp); break; case 0: - if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) - schedule_work(&css->dput_work); + schedule_work(&css->dput_work); break; } rcu_read_unlock(); |