diff options
-rw-r--r-- | block/blk-cgroup.c | 3 | ||||
-rw-r--r-- | include/linux/cgroup.h | 42 | ||||
-rw-r--r-- | include/linux/freezer.h | 57 | ||||
-rw-r--r-- | kernel/cgroup.c | 318 | ||||
-rw-r--r-- | kernel/cgroup_freezer.c | 209 | ||||
-rw-r--r-- | kernel/fork.c | 9 | ||||
-rw-r--r-- | kernel/freezer.c | 11 | ||||
-rw-r--r-- | kernel/power/process.c | 13 | ||||
-rw-r--r-- | kernel/signal.c | 20 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 11 | ||||
-rw-r--r-- | mm/memcontrol.c | 181 |
11 files changed, 314 insertions, 560 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d0b770391ad4..3dc60fc441cb 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -610,7 +610,7 @@ struct cftype blkcg_files[] = { * * This is the blkcg counterpart of ioc_release_fn(). */ -static int blkcg_pre_destroy(struct cgroup *cgroup) +static void blkcg_pre_destroy(struct cgroup *cgroup) { struct blkcg *blkcg = cgroup_to_blkcg(cgroup); @@ -632,7 +632,6 @@ static int blkcg_pre_destroy(struct cgroup *cgroup) } spin_unlock_irq(&blkcg->lock); - return 0; } static void blkcg_destroy(struct cgroup *cgroup) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f8a030ced0c7..fe876a77031a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -34,7 +34,6 @@ extern int cgroup_lock_is_held(void); extern bool cgroup_lock_live_group(struct cgroup *cgrp); extern void cgroup_unlock(void); extern void cgroup_fork(struct task_struct *p); -extern void cgroup_fork_callbacks(struct task_struct *p); extern void cgroup_post_fork(struct task_struct *p); extern void cgroup_exit(struct task_struct *p, int run_callbacks); extern int cgroupstats_build(struct cgroupstats *stats, @@ -82,8 +81,6 @@ struct cgroup_subsys_state { /* bits in struct cgroup_subsys_state flags field */ enum { CSS_ROOT, /* This CSS is the root of the subsystem */ - CSS_REMOVED, /* This CSS is dead */ - CSS_CLEAR_CSS_REFS, /* @ss->__DEPRECATED_clear_css_refs */ }; /* Caller must verify that the css is not for root cgroup */ @@ -106,11 +103,6 @@ static inline void css_get(struct cgroup_subsys_state *css) __css_get(css, 1); } -static inline bool css_is_removed(struct cgroup_subsys_state *css) -{ - return test_bit(CSS_REMOVED, &css->flags); -} - /* * Call css_tryget() to take a reference on a css if your existing * (known-valid) reference isn't already ref-counted. Returns false if @@ -149,10 +141,6 @@ enum { /* Control Group requires release notifications to userspace */ CGRP_NOTIFY_ON_RELEASE, /* - * A thread in rmdir() is wating for this cgroup. - */ - CGRP_WAIT_ON_RMDIR, - /* * Clone cgroup values when creating a new child cgroup */ CGRP_CLONE_CHILDREN, @@ -422,23 +410,6 @@ int cgroup_task_count(const struct cgroup *cgrp); int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); /* - * When the subsys has to access css and may add permanent refcnt to css, - * it should take care of racy conditions with rmdir(). Following set of - * functions, is for stop/restart rmdir if necessary. - * Because these will call css_get/put, "css" should be alive css. - * - * cgroup_exclude_rmdir(); - * ...do some jobs which may access arbitrary empty cgroup - * cgroup_release_and_wakeup_rmdir(); - * - * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, - * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. - */ - -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); - -/* * Control Group taskset, used to pass around set of tasks to cgroup_subsys * methods. */ @@ -467,7 +438,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); struct cgroup_subsys { struct cgroup_subsys_state *(*create)(struct cgroup *cgrp); - int (*pre_destroy)(struct cgroup *cgrp); + void (*pre_destroy)(struct cgroup *cgrp); void (*destroy)(struct cgroup *cgrp); int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); @@ -489,17 +460,6 @@ struct cgroup_subsys { bool use_id; /* - * If %true, cgroup removal will try to clear css refs by retrying - * ss->pre_destroy() until there's no css ref left. This behavior - * is strictly for backward compatibility and will be removed as - * soon as the current user (memcg) is updated. - * - * If %false, ss->pre_destroy() can't fail and cgroup removal won't - * wait for css refs to drop to zero before proceeding. - */ - bool __DEPRECATED_clear_css_refs; - - /* * If %false, this subsystem is properly hierarchical - * configuration, resource accounting and restriction on a parent * cgroup cover those of its children. If %true, hierarchy support diff --git a/include/linux/freezer.h b/include/linux/freezer.h index d09af4b67cf1..8039893bc3ec 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -75,35 +75,68 @@ static inline bool cgroup_freezing(struct task_struct *task) */ -/* Tell the freezer not to count the current task as freezable. */ +/** + * freezer_do_not_count - tell freezer to ignore %current + * + * Tell freezers to ignore the current task when determining whether the + * target frozen state is reached. IOW, the current task will be + * considered frozen enough by freezers. + * + * The caller shouldn't do anything which isn't allowed for a frozen task + * until freezer_cont() is called. Usually, freezer[_do_not]_count() pair + * wrap a scheduling operation and nothing much else. + */ static inline void freezer_do_not_count(void) { current->flags |= PF_FREEZER_SKIP; } -/* - * Tell the freezer to count the current task as freezable again and try to - * freeze it. +/** + * freezer_count - tell freezer to stop ignoring %current + * + * Undo freezer_do_not_count(). It tells freezers that %current should be + * considered again and tries to freeze if freezing condition is already in + * effect. */ static inline void freezer_count(void) { current->flags &= ~PF_FREEZER_SKIP; + /* + * If freezing is in progress, the following paired with smp_mb() + * in freezer_should_skip() ensures that either we see %true + * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP. + */ + smp_mb(); try_to_freeze(); } -/* - * Check if the task should be counted as freezable by the freezer +/** + * freezer_should_skip - whether to skip a task when determining frozen + * state is reached + * @p: task in quesion + * + * This function is used by freezers after establishing %true freezing() to + * test whether a task should be skipped when determining the target frozen + * state is reached. IOW, if this function returns %true, @p is considered + * frozen enough. */ -static inline int freezer_should_skip(struct task_struct *p) +static inline bool freezer_should_skip(struct task_struct *p) { - return !!(p->flags & PF_FREEZER_SKIP); + /* + * The following smp_mb() paired with the one in freezer_count() + * ensures that either freezer_count() sees %true freezing() or we + * see cleared %PF_FREEZER_SKIP and return %false. This makes it + * impossible for a task to slip frozen state testing after + * clearing %PF_FREEZER_SKIP. + */ + smp_mb(); + return p->flags & PF_FREEZER_SKIP; } /* - * These macros are intended to be used whenever you want allow a task that's - * sleeping in TASK_UNINTERRUPTIBLE or TASK_KILLABLE state to be frozen. Note - * that neither return any clear indication of whether a freeze event happened - * while in this function. + * These macros are intended to be used whenever you want allow a sleeping + * task to be frozen. Note that neither return any clear indication of + * whether a freeze event happened while in this function. */ /* Like schedule(), but should not block the freezer. */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f24f724620dd..3070164e2036 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -171,8 +171,8 @@ struct css_id { * The css to which this ID points. This pointer is set to valid value * after cgroup is populated. If cgroup is removed, this will be NULL. * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_is_removed() - * css_tryget() should be used for avoiding race. + * is called after synchronize_rcu(). But for safe use, css_tryget() + * should be used for avoiding race. */ struct cgroup_subsys_state __rcu *css; /* @@ -854,30 +854,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } -/* - * Call subsys's pre_destroy handler. - * This is called before css refcnt check. - */ -static int cgroup_call_pre_destroy(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - int ret = 0; - - for_each_subsys(cgrp->root, ss) { - if (!ss->pre_destroy) - continue; - - ret = ss->pre_destroy(cgrp); - if (ret) { - /* ->pre_destroy() failure is being deprecated */ - WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); - break; - } - } - - return ret; -} - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -1015,33 +991,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) } /* - * A queue for waiters to do rmdir() cgroup. A tasks will sleep when - * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some - * reference to css->refcnt. In general, this refcnt is expected to goes down - * to zero, soon. - * - * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; - */ -static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); - -static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) -{ - if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) - wake_up_all(&cgroup_rmdir_waitq); -} - -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) -{ - css_get(css); -} - -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) -{ - cgroup_wakeup_rmdir_waiter(css->cgroup); - css_put(css); -} - -/* * Call with cgroup_mutex held. Drops reference counts on modules, including * any duplicate ones that parse_cgroupfs_options took. If this function * returns an error, no reference counts are touched. @@ -2025,12 +1974,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } synchronize_rcu(); - - /* - * wake up rmdir() waiter. the rmdir should fail since the cgroup - * is no longer empty. - */ - cgroup_wakeup_rmdir_waiter(cgrp); out: if (retval) { for_each_subsys(root, ss) { @@ -2200,7 +2143,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 5: success! and cleanup */ synchronize_rcu(); - cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; out_put_css_set_refs: if (retval) { @@ -4022,14 +3964,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, cgrp->subsys[ss->subsys_id] = css; /* - * If !clear_css_refs, css holds an extra ref to @cgrp->dentry - * which is put on the last css_put(). dput() requires process - * context, which css_put() may be called without. @css->dput_work - * will be used to invoke dput() asynchronously from css_put(). + * css holds an extra ref to @cgrp->dentry which is put on the last + * css_put(). dput() requires process context, which css_put() may + * be called without. @css->dput_work will be used to invoke + * dput() asynchronously from css_put(). */ INIT_WORK(&css->dput_work, css_dput_fn); - if (ss->__DEPRECATED_clear_css_refs) - set_bit(CSS_CLEAR_CSS_REFS, &css->flags); } /* @@ -4053,6 +3993,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (!cgrp) return -ENOMEM; + /* + * Only live parents can have children. Note that the liveliness + * check isn't strictly necessary because cgroup_mkdir() and + * cgroup_rmdir() are fully synchronized by i_mutex; however, do it + * anyway so that locking is contained inside cgroup proper and we + * don't get nasty surprises if we ever grow another caller. + */ + if (!cgroup_lock_live_group(parent)) { + err = -ENODEV; + goto err_free; + } + /* Grab a reference on the superblock so the hierarchy doesn't * get deleted on unmount if there are child cgroups. This * can be done outside cgroup_mutex, since the sb can't @@ -4060,8 +4012,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * fs */ atomic_inc(&sb->s_active); - mutex_lock(&cgroup_mutex); - init_cgroup_housekeeping(cgrp); cgrp->parent = parent; @@ -4109,10 +4059,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err < 0) goto err_remove; - /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ + /* each css holds a ref to the cgroup's dentry */ for_each_subsys(root, ss) - if (!ss->__DEPRECATED_clear_css_refs) - dget(dentry); + dget(dentry); /* The cgroup directory was pre-locked for us */ BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); @@ -4143,7 +4092,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* Release the reference count that we took on the superblock */ deactivate_super(sb); - +err_free: kfree(cgrp); return err; } @@ -4197,71 +4146,6 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) return 0; } -/* - * Atomically mark all (or else none) of the cgroup's CSS objects as - * CSS_REMOVED. Return true on success, or false if the cgroup has - * busy subsystems. Call with cgroup_mutex held - * - * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or - * not, cgroup removal behaves differently. - * - * If clear is set, css refcnt for the subsystem should be zero before - * cgroup removal can be committed. This is implemented by - * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be - * called multiple times until all css refcnts reach zero and is allowed to - * veto removal on any invocation. This behavior is deprecated and will be - * removed as soon as the existing user (memcg) is updated. - * - * If clear is not set, each css holds an extra reference to the cgroup's - * dentry and cgroup removal proceeds regardless of css refs. - * ->pre_destroy() will be called at least once and is not allowed to fail. - * On the last put of each css, whenever that may be, the extra dentry ref - * is put so that dentry destruction happens only after all css's are - * released. - */ -static int cgroup_clear_css_refs(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - unsigned long flags; - bool failed = false; - - local_irq_save(flags); - - /* - * Block new css_tryget() by deactivating refcnt. If all refcnts - * for subsystems w/ clear_css_refs set were 1 at the moment of - * deactivation, we succeeded. - */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - WARN_ON(atomic_read(&css->refcnt) < 0); - atomic_add(CSS_DEACT_BIAS, &css->refcnt); - - if (ss->__DEPRECATED_clear_css_refs) - failed |= css_refcnt(css) != 1; - } - - /* - * If succeeded, set REMOVED and put all the base refs; otherwise, - * restore refcnts to positive values. Either way, all in-progress - * css_tryget() will be released. - */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - if (!failed) { - set_bit(CSS_REMOVED, &css->flags); - css_put(css); - } else { - atomic_sub(CSS_DEACT_BIAS, &css->refcnt); - } - } - - local_irq_restore(flags); - return !failed; -} - static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cgroup *cgrp = dentry->d_fsdata; @@ -4269,70 +4153,52 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *parent; DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; - int ret; + struct cgroup_subsys *ss; /* the vfs holds both inode->i_mutex already */ -again: mutex_lock(&cgroup_mutex); - if (atomic_read(&cgrp->count) != 0) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - if (!list_empty(&cgrp->children)) { + parent = cgrp->parent; + if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } - mutex_unlock(&cgroup_mutex); /* - * In general, subsystem has no css->refcnt after pre_destroy(). But - * in racy cases, subsystem may have to get css->refcnt after - * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes - * make rmdir return -EBUSY too often. To avoid that, we use waitqueue - * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir - * and subsystem's reference count handling. Please see css_get/put - * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. + * Block new css_tryget() by deactivating refcnt and mark @cgrp + * removed. This makes future css_tryget() and child creation + * attempts fail thus maintaining the removal conditions verified + * above. */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - /* - * Call pre_destroy handlers of subsys. Notify subsystems - * that rmdir() request comes. - */ - ret = cgroup_call_pre_destroy(cgrp); - if (ret) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - return ret; + WARN_ON(atomic_read(&css->refcnt) < 0); + atomic_add(CSS_DEACT_BIAS, &css->refcnt); } + set_bit(CGRP_REMOVED, &cgrp->flags); + /* + * Tell subsystems to initate destruction. pre_destroy() should be + * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix + * potential deadlock in pre_destroy") for details. + */ + mutex_unlock(&cgroup_mutex); + for_each_subsys(cgrp->root, ss) + if (ss->pre_destroy) + ss->pre_destroy(cgrp); mutex_lock(&cgroup_mutex); - parent = cgrp->parent; - if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - if (!cgroup_clear_css_refs(cgrp)) { - mutex_unlock(&cgroup_mutex); - /* - * Because someone may call cgroup_wakeup_rmdir_waiter() before - * prepare_to_wait(), we need to check this flag. - */ - if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) - schedule(); - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - if (signal_pending(current)) - return -EINTR; - goto again; - } - /* NO css_tryget() can success after here. */ - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + + /* + * Put all the base refs. Each css holds an extra reference to the + * cgroup's dentry and cgroup removal proceeds regardless of css + * refs. On the last put of each css, whenever that may be, the + * extra dentry ref is put so that dentry destruction happens only + * after all css's are released. + */ + for_each_subsys(cgrp->root, ss) + css_put(cgrp->subsys[ss->subsys_id]); raw_spin_lock(&release_list_lock); - set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); @@ -4832,44 +4698,19 @@ void cgroup_fork(struct task_struct *child) } /** - * cgroup_fork_callbacks - run fork callbacks - * @child: the new task - * - * Called on a new task very soon before adding it to the - * tasklist. No need to take any locks since no-one can - * be operating on this task. - */ -void cgroup_fork_callbacks(struct task_struct *child) -{ - if (need_forkexit_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - - /* - * forkexit callbacks are only supported for - * builtin subsystems. - */ - if (!ss || ss->module) - continue; - - if (ss->fork) - ss->fork(child); - } - } -} - -/** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question * - * Adds the task to the list running through its css_set if necessary. - * Has to be after the task is visible on the task list in case we race - * with the first call to cgroup_iter_start() - to guarantee that the - * new task ends up on its list. + * Adds the task to the list running through its css_set if necessary and + * call the subsystem fork() callbacks. Has to be after the task is + * visible on the task list in case we race with the first call to + * cgroup_iter_start() - to guarantee that the new task ends up on its + * list. */ void cgroup_post_fork(struct task_struct *child) { + int i; + /* * use_task_css_set_links is set to 1 before we walk the tasklist * under the tasklist_lock and we read it here after we added the child @@ -4889,7 +4730,30 @@ void cgroup_post_fork(struct task_struct *child) task_unlock(child); write_unlock(&css_set_lock); } + + /* + * Call ss->fork(). This must happen after @child is linked on + * css_set; otherwise, @child might change state between ->fork() + * and addition to css_set. + */ + if (need_forkexit_callback) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + + /* + * fork/exit callbacks are supported only for + * builtin subsystems and we don't need further + * synchronization as they never go away. + */ + if (!ss || ss->module) + continue; + + if (ss->fork) + ss->fork(child); + } + } } + /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process @@ -5022,15 +4886,17 @@ static void check_for_release(struct cgroup *cgrp) /* Caller must verify that the css is not for root cgroup */ bool __css_tryget(struct cgroup_subsys_state *css) { - do { - int v = css_refcnt(css); + while (true) { + int t, v; - if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) + v = css_refcnt(css); + t = atomic_cmpxchg(&css->refcnt, v, v + 1); + if (likely(t == v)) return true; + else if (t < 0) + return false; cpu_relax(); - } while (!test_bit(CSS_REMOVED, &css->flags)); - - return false; + } } EXPORT_SYMBOL_GPL(__css_tryget); @@ -5049,11 +4915,9 @@ void __css_put(struct cgroup_subsys_state *css) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - cgroup_wakeup_rmdir_waiter(cgrp); break; case 0: - if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) - schedule_work(&css->dput_work); + schedule_work(&css->dput_work); break; } rcu_read_unlock(); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index b1724ce98981..bedefd9a22df 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -84,50 +84,6 @@ static const char *freezer_state_strs[] = { struct cgroup_subsys freezer_subsys; -/* Locks taken and their ordering - * ------------------------------ - * cgroup_mutex (AKA cgroup_lock) - * freezer->lock - * css_set_lock - * task->alloc_lock (AKA task_lock) - * task->sighand->siglock - * - * cgroup code forces css_set_lock to be taken before task->alloc_lock - * - * freezer_create(), freezer_destroy(): - * cgroup_mutex [ by cgroup core ] - * - * freezer_can_attach(): - * cgroup_mutex (held by caller of can_attach) - * - * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): - * freezer->lock - * sighand->siglock (if the cgroup is freezing) - * - * freezer_read(): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * - * freezer_write() (freeze): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * sighand->siglock (fake signal delivery inside freeze_task()) - * - * freezer_write() (unfreeze): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) - * sighand->siglock - */ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) { struct freezer *freezer; @@ -150,96 +106,105 @@ static void freezer_destroy(struct cgroup *cgroup) kfree(freezer); } -/* task is frozen or will freeze immediately when next it gets woken */ -static bool is_task_frozen_enough(struct task_struct *task) -{ - return frozen(task) || - (task_is_stopped_or_traced(task) && freezing(task)); -} - /* - * The call to cgroup_lock() in the freezer.state write method prevents - * a write to that file racing against an attach, and hence the - * can_attach() result will remain valid until the attach completes. + * Tasks can be migrated into a different freezer anytime regardless of its + * current state. freezer_attach() is responsible for making new tasks + * conform to the current state. + * + * Freezer state changes and task migration are synchronized via + * @freezer->lock. freezer_attach() makes the new tasks conform to the + * current state and all following state changes can see the new tasks. */ -static int freezer_can_attach(struct cgroup *new_cgroup, - struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) { - struct freezer *freezer; + struct freezer *freezer = cgroup_freezer(new_cgrp); struct task_struct *task; + spin_lock_irq(&freezer->lock); + /* - * Anything frozen can't move or be moved to/from. + * Make the new tasks conform to the current state of @new_cgrp. + * For simplicity, when migrating any task to a FROZEN cgroup, we + * revert it to FREEZING and let update_if_frozen() determine the + * correct state later. + * + * Tasks in @tset are on @new_cgrp but may not conform to its + * current state before executing the following - !frozen tasks may + * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. + * This means that, to determine whether to freeze, one should test + * whether the state equals THAWED. */ - cgroup_taskset_for_each(task, new_cgroup, tset) - if (cgroup_freezing(task)) - return -EBUSY; - - freezer = cgroup_freezer(new_cgroup); - if (freezer->state != CGROUP_THAWED) - return -EBUSY; + cgroup_taskset_for_each(task, new_cgrp, tset) { + if (freezer->state == CGROUP_THAWED) { + __thaw_task(task); + } else { + freeze_task(task); + freezer->state = CGROUP_FREEZING; + } + } - return 0; + spin_unlock_irq(&freezer->lock); } static void freezer_fork(struct task_struct *task) { struct freezer *freezer; - /* - * No lock is needed, since the task isn't on tasklist yet, - * so it can't be moved to another cgroup, which means the - * freezer won't be removed and will be valid during this - * function call. Nevertheless, apply RCU read-side critical - * section to suppress RCU lockdep false positives. - */ rcu_read_lock(); freezer = task_freezer(task); - rcu_read_unlock(); /* * The root cgroup is non-freezable, so we can skip the * following check. */ if (!freezer->css.cgroup->parent) - return; + goto out; spin_lock_irq(&freezer->lock); - BUG_ON(freezer->state == CGROUP_FROZEN); - - /* Locking avoids race with FREEZING -> THAWED transitions. */ - if (freezer->state == CGROUP_FREEZING) + /* + * @task might have been just migrated into a FROZEN cgroup. Test + * equality with THAWED. Read the comment in freezer_attach(). + */ + if (freezer->state != CGROUP_THAWED) freeze_task(task); spin_unlock_irq(&freezer->lock); +out: + rcu_read_unlock(); } /* - * caller must hold freezer->lock + * We change from FREEZING to FROZEN lazily if the cgroup was only + * partially frozen when we exitted write. Caller must hold freezer->lock. + * + * Task states and freezer state might disagree while tasks are being + * migrated into or out of @cgroup, so we can't verify task states against + * @freezer state here. See freezer_attach() for details. */ -static void update_if_frozen(struct cgroup *cgroup, - struct freezer *freezer) +static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer) { struct cgroup_iter it; struct task_struct *task; - unsigned int nfrozen = 0, ntotal = 0; - enum freezer_state old_state = freezer->state; + + if (freezer->state != CGROUP_FREEZING) + return; cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - ntotal++; - if (freezing(task) && is_task_frozen_enough(task)) - nfrozen++; - } - if (old_state == CGROUP_THAWED) { - BUG_ON(nfrozen > 0); - } else if (old_state == CGROUP_FREEZING) { - if (nfrozen == ntotal) - freezer->state = CGROUP_FROZEN; - } else { /* old_state == CGROUP_FROZEN */ - BUG_ON(nfrozen != ntotal); + while ((task = cgroup_iter_next(cgroup, &it))) { + if (freezing(task)) { + /* + * freezer_should_skip() indicates that the task + * should be skipped when determining freezing + * completion. Consider it frozen in addition to + * the usual frozen condition. + */ + if (!frozen(task) && !freezer_should_skip(task)) + goto notyet; + } } + freezer->state = CGROUP_FROZEN; +notyet: cgroup_iter_end(cgroup, &it); } @@ -249,44 +214,26 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, struct freezer *freezer; enum freezer_state state; - if (!cgroup_lock_live_group(cgroup)) - return -ENODEV; - freezer = cgroup_freezer(cgroup); spin_lock_irq(&freezer->lock); + update_if_frozen(cgroup, freezer); state = freezer->state; - if (state == CGROUP_FREEZING) { - /* We change from FREEZING to FROZEN lazily if the cgroup was - * only partially frozen when we exitted write. */ - update_if_frozen(cgroup, freezer); - state = freezer->state; - } spin_unlock_irq(&freezer->lock); - cgroup_unlock(); seq_puts(m, freezer_state_strs[state]); seq_putc(m, '\n'); return 0; } -static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +static void freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) { struct cgroup_iter it; struct task_struct *task; - unsigned int num_cant_freeze_now = 0; cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - if (!freeze_task(task)) - continue; - if (is_task_frozen_enough(task)) - continue; - if (!freezing(task) && !freezer_should_skip(task)) - num_cant_freeze_now++; - } + while ((task = cgroup_iter_next(cgroup, &it))) + freeze_task(task); cgroup_iter_end(cgroup, &it); - - return num_cant_freeze_now ? -EBUSY : 0; } static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) @@ -300,18 +247,14 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) cgroup_iter_end(cgroup, &it); } -static int freezer_change_state(struct cgroup *cgroup, - enum freezer_state goal_state) +static void freezer_change_state(struct cgroup *cgroup, + enum freezer_state goal_state) { - struct freezer *freezer; - int retval = 0; - - freezer = cgroup_freezer(cgroup); + struct freezer *freezer = cgroup_freezer(cgroup); + /* also synchronizes against task migration, see freezer_attach() */ spin_lock_irq(&freezer->lock); - update_if_frozen(cgroup, freezer); - switch (goal_state) { case CGROUP_THAWED: if (freezer->state != CGROUP_THAWED) @@ -323,22 +266,19 @@ static int freezer_change_state(struct cgroup *cgroup, if (freezer->state == CGROUP_THAWED) atomic_inc(&system_freezing_cnt); freezer->state = CGROUP_FREEZING; - retval = try_to_freeze_cgroup(cgroup, freezer); + freeze_cgroup(cgroup, freezer); break; default: BUG(); } spin_unlock_irq(&freezer->lock); - - return retval; } static int freezer_write(struct cgroup *cgroup, struct cftype *cft, const char *buffer) { - int retval; enum freezer_state goal_state; if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) @@ -348,11 +288,8 @@ static int freezer_write(struct cgroup *cgroup, else return -EINVAL; - if (!cgroup_lock_live_group(cgroup)) - return -ENODEV; - retval = freezer_change_state(cgroup, goal_state); - cgroup_unlock(); - return retval; + freezer_change_state(cgroup, goal_state); + return 0; } static struct cftype files[] = { @@ -370,7 +307,7 @@ struct cgroup_subsys freezer_subsys = { .create = freezer_create, .destroy = freezer_destroy, .subsys_id = freezer_subsys_id, - .can_attach = freezer_can_attach, + .attach = freezer_attach, .fork = freezer_fork, .base_cftypes = files, diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa2..acc4cb62f32f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1135,7 +1135,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, { int retval; struct task_struct *p; - int cgroup_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1393,12 +1392,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - /* Now that the task is set up, run cgroup callbacks if - * necessary. We need to run them before the task is visible - * on the tasklist. */ - cgroup_fork_callbacks(p); - cgroup_callbacks_done = 1; - /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); @@ -1503,7 +1496,7 @@ bad_fork_cleanup_cgroup: #endif if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); - cgroup_exit(p, cgroup_callbacks_done); + cgroup_exit(p, 0); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: diff --git a/kernel/freezer.c b/kernel/freezer.c index 11f82a4d4eae..c38893b0efba 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p) return false; } - if (!(p->flags & PF_KTHREAD)) { + if (!(p->flags & PF_KTHREAD)) fake_signal_wake_up(p); - /* - * fake_signal_wake_up() goes through p's scheduler - * lock and guarantees that TASK_STOPPED/TRACED -> - * TASK_RUNNING transition can't race with task state - * testing in try_to_freeze_tasks(). - */ - } else { + else wake_up_state(p, TASK_INTERRUPTIBLE); - } spin_unlock_irqrestore(&freezer_lock, flags); return true; diff --git a/kernel/power/process.c b/kernel/power/process.c index 87da817f9e13..d5a258b60c6f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only) if (p == current || !freeze_task(p)) continue; - /* - * Now that we've done set_freeze_flag, don't - * perturb a task in TASK_STOPPED or TASK_TRACED. - * It is "frozen enough". If the task does wake - * up, it will immediately call try_to_freeze. - * - * Because freeze_task() goes through p's scheduler lock, it's - * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING - * transition can't race with task state testing here. - */ - if (!task_is_stopped_or_traced(p) && - !freezer_should_skip(p)) + if (!freezer_should_skip(p)) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); diff --git a/kernel/signal.c b/kernel/signal.c index 0af8868525d6..5ffb5626e072 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1908,7 +1908,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) preempt_disable(); read_unlock(&tasklist_lock); preempt_enable_no_resched(); - schedule(); + freezable_schedule(); } else { /* * By the time we got the lock, our tracer went away. @@ -1930,13 +1930,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) } /* - * While in TASK_TRACED, we were considered "frozen enough". - * Now that we woke up, it's crucial if we're supposed to be - * frozen that we freeze now before running anything substantial. - */ - try_to_freeze(); - - /* * We are back. Now reacquire the siglock before touching * last_siginfo, so that we are sure to have synchronized with * any signal-sending on another CPU that wants to examine it. @@ -2092,7 +2085,7 @@ static bool do_signal_stop(int signr) } /* Now we don't run again until woken by SIGCONT or SIGKILL */ - schedule(); + freezable_schedule(); return true; } else { /* @@ -2200,15 +2193,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, if (unlikely(uprobe_deny_signal())) return 0; -relock: /* - * We'll jump back here after any time we were stopped in TASK_STOPPED. - * While in TASK_STOPPED, we were considered "frozen enough". - * Now that we woke up, it's crucial if we're supposed to be - * frozen that we freeze now before running anything substantial. + * Do this once, we can't return to user-mode if freezing() == T. + * do_signal_stop() and ptrace_stop() do freezable_schedule() and + * thus do not need another check after return. */ try_to_freeze(); +relock: spin_lock_irq(&sighand->siglock); /* * Every stopped thread goes here after wakeup. Check to see if diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index a3f358fb8a0c..0d3a1a317731 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -155,18 +155,13 @@ out: * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. */ -static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) +static void hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) { struct hstate *h; struct page *page; - int ret = 0, idx = 0; + int idx = 0; do { - if (cgroup_task_count(cgroup) || - !list_empty(&cgroup->children)) { - ret = -EBUSY; - goto out; - } for_each_hstate(h) { spin_lock(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) @@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) } cond_resched(); } while (hugetlb_cgroup_have_usage(cgroup)); -out: - return ret; } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7acf43bf04a2..08adaaae6fcc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2337,7 +2337,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; - VM_BUG_ON(css_is_removed(&memcg->css)); if (mem_cgroup_is_root(memcg)) goto done; if (nr_pages == 1 && consume_stock(memcg)) @@ -2477,9 +2476,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, /* * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller must check css_is_removed() or some if - * it's concern. (dropping refcnt from swap can be called against removed - * memcg.) + * rcu_read_lock(). The caller is responsible for calling css_tryget if + * the mem_cgroup is used for charging. (dropping refcnt from swap can be + * called against removed memcg.) */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { @@ -2676,13 +2675,6 @@ static int mem_cgroup_move_account(struct page *page, /* caller should have done css_get */ pc->mem_cgroup = to; mem_cgroup_charge_statistics(to, anon, nr_pages); - /* - * We charges against "to" which may not have any tasks. Then, "to" - * can be under rmdir(). But in current implementation, caller of - * this function is just force_empty() and move charge, so it's - * guaranteed that "to" is never removed. So, we don't check rmdir - * status here. - */ move_unlock_mem_cgroup(from, &flags); ret = 0; unlock: @@ -2696,10 +2688,27 @@ out: return ret; } -/* - * move charges to its parent. +/** + * mem_cgroup_move_parent - moves page to the parent group + * @page: the page to move + * @pc: page_cgroup of the page + * @child: page's cgroup + * + * move charges to its parent or the root cgroup if the group has no + * parent (aka use_hierarchy==0). + * Although this might fail (get_page_unless_zero, isolate_lru_page or + * mem_cgroup_move_account fails) the failure is always temporary and + * it signals a race with a page removal/uncharge or migration. In the + * first case the page is on the way out and it will vanish from the LRU + * on the next attempt and the call should be retried later. + * Isolation from the LRU fails only if page has been isolated from + * the LRU since we looked at it and that usually means either global + * reclaim or migration going on. The page will either get back to the + * LRU or vanish. + * Finaly mem_cgroup_move_account fails only if the page got uncharged + * (!PageCgroupUsed) or moved to a different group. The page will + * disappear in the next attempt. */ - static int mem_cgroup_move_parent(struct page *page, struct page_cgroup *pc, struct mem_cgroup *child) @@ -2709,9 +2718,7 @@ static int mem_cgroup_move_parent(struct page *page, unsigned long uninitialized_var(flags); int ret; - /* Is ROOT ? */ - if (mem_cgroup_is_root(child)) - return -EINVAL; + VM_BUG_ON(mem_cgroup_is_root(child)); ret = -EBUSY; if (!get_page_unless_zero(page)) @@ -2728,8 +2735,10 @@ static int mem_cgroup_move_parent(struct page *page, if (!parent) parent = root_mem_cgroup; - if (nr_pages > 1) + if (nr_pages > 1) { + VM_BUG_ON(!PageTransHuge(page)); flags = compound_lock_irqsave(page); + } ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent); @@ -2871,7 +2880,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, return; if (!memcg) return; - cgroup_exclude_rmdir(&memcg->css); __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); /* @@ -2885,12 +2893,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, swp_entry_t ent = {.val = page_private(page)}; mem_cgroup_uncharge_swap(ent); } - /* - * At swapin, we may charge account against cgroup which has no tasks. - * So, rmdir()->pre_destroy() can be called while we do this charge. - * In that case, we need to call pre_destroy() again. check it here. - */ - cgroup_release_and_wakeup_rmdir(&memcg->css); } void mem_cgroup_commit_charge_swapin(struct page *page, @@ -3338,8 +3340,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, if (!memcg) return; - /* blocks rmdir() */ - cgroup_exclude_rmdir(&memcg->css); + if (!migration_ok) { used = oldpage; unused = newpage; @@ -3373,13 +3374,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, */ if (anon) mem_cgroup_uncharge_page(used); - /* - * At migration, we may charge account against cgroup which has no - * tasks. - * So, rmdir()->pre_destroy() can be called while we do this charge. - * In that case, we need to call pre_destroy() again. check it here. - */ - cgroup_release_and_wakeup_rmdir(&memcg->css); } /* @@ -3679,17 +3673,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, return nr_reclaimed; } -/* +/** + * mem_cgroup_force_empty_list - clears LRU of a group + * @memcg: group to clear + * @node: NUMA node + * @zid: zone id + * @lru: lru to to clear + * * Traverse a specified page_cgroup list and try to drop them all. This doesn't - * reclaim the pages page themselves - it just removes the page_cgroups. - * Returns true if some page_cgroups were not freed, indicating that the caller - * must retry this operation. + * reclaim the pages page themselves - pages are moved to the parent (or root) + * group. */ -static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, +static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { struct mem_cgroup_per_zone *mz; - unsigned long flags, loop; + unsigned long flags; struct list_head *list; struct page *busy; struct zone *zone; @@ -3698,11 +3697,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, mz = mem_cgroup_zoneinfo(memcg, node, zid); list = &mz->lruvec.lists[lru]; - loop = mz->lru_size[lru]; - /* give some margin against EBUSY etc...*/ - loop += 256; busy = NULL; - while (loop--) { + do { struct page_cgroup *pc; struct page *page; @@ -3728,76 +3724,72 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, cond_resched(); } else busy = NULL; - } - return !list_empty(list); + } while (!list_empty(list)); } /* - * make mem_cgroup's charge to be 0 if there is no task. + * make mem_cgroup's charge to be 0 if there is no task by moving + * all the charges and pages to the parent. * This enables deleting this mem_cgroup. + * + * Caller is responsible for holding css reference on the memcg. */ -static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) +static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) { - int ret; - int node, zid, shrink; - int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct cgroup *cgrp = memcg->css.cgroup; - - css_get(&memcg->css); + int node, zid; - shrink = 0; - /* should free all ? */ - if (free_all) - goto try_to_free; -move_account: do { - ret = -EBUSY; - if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) - goto out; /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); drain_all_stock_sync(memcg); - ret = 0; mem_cgroup_start_move(memcg); for_each_node_state(node, N_HIGH_MEMORY) { - for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { + for (zid = 0; zid < MAX_NR_ZONES; zid++) { enum lru_list lru; for_each_lru(lru) { - ret = mem_cgroup_force_empty_list(memcg, + mem_cgroup_force_empty_list(memcg, node, zid, lru); - if (ret) - break; } } - if (ret) - break; } mem_cgroup_end_move(memcg); memcg_oom_recover(memcg); cond_resched(); - /* "ret" should also be checked to ensure all lists are empty. */ - } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); -out: - css_put(&memcg->css); - return ret; -try_to_free: + /* + * This is a safety check because mem_cgroup_force_empty_list + * could have raced with mem_cgroup_replace_page_cache callers + * so the lru seemed empty but the page could have been added + * right after the check. RES_USAGE should be safe as we always + * charge before adding to the LRU. + */ + } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); +} + +/* + * Reclaims as many pages from the given memcg as possible and moves + * the rest to the parent. + * + * Caller is responsible for holding css reference for memcg. + */ +static int mem_cgroup_force_empty(struct mem_cgroup *memcg) +{ + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; + struct cgroup *cgrp = memcg->css.cgroup; + /* returns EBUSY if there is a task or if we come here twice. */ - if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { - ret = -EBUSY; - goto out; - } + if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) + return -EBUSY; + /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); /* try to free all pages in this cgroup */ - shrink = 1; while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { int progress; - if (signal_pending(current)) { - ret = -EINTR; - goto out; - } + if (signal_pending(current)) + return -EINTR; + progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, false); if (!progress) { @@ -3808,13 +3800,23 @@ try_to_free: } lru_add_drain(); - /* try move_account...there may be some *locked* pages. */ - goto move_account; + mem_cgroup_reparent_charges(memcg); + + return 0; } static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) { - return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + int ret; + + if (mem_cgroup_is_root(memcg)) + return -EINVAL; + css_get(&memcg->css); + ret = mem_cgroup_force_empty(memcg); + css_put(&memcg->css); + + return ret; } @@ -5001,11 +5003,11 @@ free_out: return ERR_PTR(error); } -static int mem_cgroup_pre_destroy(struct cgroup *cont) +static void mem_cgroup_pre_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - return mem_cgroup_force_empty(memcg, false); + mem_cgroup_reparent_charges(memcg); } static void mem_cgroup_destroy(struct cgroup *cont) @@ -5607,7 +5609,6 @@ struct cgroup_subsys mem_cgroup_subsys = { .base_cftypes = mem_cgroup_files, .early_init = 0, .use_id = 1, - .__DEPRECATED_clear_css_refs = true, }; #ifdef CONFIG_MEMCG_SWAP |