summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block/blk-cgroup.c3
-rw-r--r--include/linux/cgroup.h42
-rw-r--r--include/linux/freezer.h57
-rw-r--r--kernel/cgroup.c318
-rw-r--r--kernel/cgroup_freezer.c209
-rw-r--r--kernel/fork.c9
-rw-r--r--kernel/freezer.c11
-rw-r--r--kernel/power/process.c13
-rw-r--r--kernel/signal.c20
-rw-r--r--mm/hugetlb_cgroup.c11
-rw-r--r--mm/memcontrol.c181
11 files changed, 314 insertions, 560 deletions
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d0b770391ad4..3dc60fc441cb 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -610,7 +610,7 @@ struct cftype blkcg_files[] = {
*
* This is the blkcg counterpart of ioc_release_fn().
*/
-static int blkcg_pre_destroy(struct cgroup *cgroup)
+static void blkcg_pre_destroy(struct cgroup *cgroup)
{
struct blkcg *blkcg = cgroup_to_blkcg(cgroup);
@@ -632,7 +632,6 @@ static int blkcg_pre_destroy(struct cgroup *cgroup)
}
spin_unlock_irq(&blkcg->lock);
- return 0;
}
static void blkcg_destroy(struct cgroup *cgroup)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f8a030ced0c7..fe876a77031a 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -34,7 +34,6 @@ extern int cgroup_lock_is_held(void);
extern bool cgroup_lock_live_group(struct cgroup *cgrp);
extern void cgroup_unlock(void);
extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_fork_callbacks(struct task_struct *p);
extern void cgroup_post_fork(struct task_struct *p);
extern void cgroup_exit(struct task_struct *p, int run_callbacks);
extern int cgroupstats_build(struct cgroupstats *stats,
@@ -82,8 +81,6 @@ struct cgroup_subsys_state {
/* bits in struct cgroup_subsys_state flags field */
enum {
CSS_ROOT, /* This CSS is the root of the subsystem */
- CSS_REMOVED, /* This CSS is dead */
- CSS_CLEAR_CSS_REFS, /* @ss->__DEPRECATED_clear_css_refs */
};
/* Caller must verify that the css is not for root cgroup */
@@ -106,11 +103,6 @@ static inline void css_get(struct cgroup_subsys_state *css)
__css_get(css, 1);
}
-static inline bool css_is_removed(struct cgroup_subsys_state *css)
-{
- return test_bit(CSS_REMOVED, &css->flags);
-}
-
/*
* Call css_tryget() to take a reference on a css if your existing
* (known-valid) reference isn't already ref-counted. Returns false if
@@ -149,10 +141,6 @@ enum {
/* Control Group requires release notifications to userspace */
CGRP_NOTIFY_ON_RELEASE,
/*
- * A thread in rmdir() is wating for this cgroup.
- */
- CGRP_WAIT_ON_RMDIR,
- /*
* Clone cgroup values when creating a new child cgroup
*/
CGRP_CLONE_CHILDREN,
@@ -422,23 +410,6 @@ int cgroup_task_count(const struct cgroup *cgrp);
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
/*
- * When the subsys has to access css and may add permanent refcnt to css,
- * it should take care of racy conditions with rmdir(). Following set of
- * functions, is for stop/restart rmdir if necessary.
- * Because these will call css_get/put, "css" should be alive css.
- *
- * cgroup_exclude_rmdir();
- * ...do some jobs which may access arbitrary empty cgroup
- * cgroup_release_and_wakeup_rmdir();
- *
- * When someone removes a cgroup while cgroup_exclude_rmdir() holds it,
- * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up.
- */
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css);
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css);
-
-/*
* Control Group taskset, used to pass around set of tasks to cgroup_subsys
* methods.
*/
@@ -467,7 +438,7 @@ int cgroup_taskset_size(struct cgroup_taskset *tset);
struct cgroup_subsys {
struct cgroup_subsys_state *(*create)(struct cgroup *cgrp);
- int (*pre_destroy)(struct cgroup *cgrp);
+ void (*pre_destroy)(struct cgroup *cgrp);
void (*destroy)(struct cgroup *cgrp);
int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset);
@@ -489,17 +460,6 @@ struct cgroup_subsys {
bool use_id;
/*
- * If %true, cgroup removal will try to clear css refs by retrying
- * ss->pre_destroy() until there's no css ref left. This behavior
- * is strictly for backward compatibility and will be removed as
- * soon as the current user (memcg) is updated.
- *
- * If %false, ss->pre_destroy() can't fail and cgroup removal won't
- * wait for css refs to drop to zero before proceeding.
- */
- bool __DEPRECATED_clear_css_refs;
-
- /*
* If %false, this subsystem is properly hierarchical -
* configuration, resource accounting and restriction on a parent
* cgroup cover those of its children. If %true, hierarchy support
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index d09af4b67cf1..8039893bc3ec 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -75,35 +75,68 @@ static inline bool cgroup_freezing(struct task_struct *task)
*/
-/* Tell the freezer not to count the current task as freezable. */
+/**
+ * freezer_do_not_count - tell freezer to ignore %current
+ *
+ * Tell freezers to ignore the current task when determining whether the
+ * target frozen state is reached. IOW, the current task will be
+ * considered frozen enough by freezers.
+ *
+ * The caller shouldn't do anything which isn't allowed for a frozen task
+ * until freezer_cont() is called. Usually, freezer[_do_not]_count() pair
+ * wrap a scheduling operation and nothing much else.
+ */
static inline void freezer_do_not_count(void)
{
current->flags |= PF_FREEZER_SKIP;
}
-/*
- * Tell the freezer to count the current task as freezable again and try to
- * freeze it.
+/**
+ * freezer_count - tell freezer to stop ignoring %current
+ *
+ * Undo freezer_do_not_count(). It tells freezers that %current should be
+ * considered again and tries to freeze if freezing condition is already in
+ * effect.
*/
static inline void freezer_count(void)
{
current->flags &= ~PF_FREEZER_SKIP;
+ /*
+ * If freezing is in progress, the following paired with smp_mb()
+ * in freezer_should_skip() ensures that either we see %true
+ * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP.
+ */
+ smp_mb();
try_to_freeze();
}
-/*
- * Check if the task should be counted as freezable by the freezer
+/**
+ * freezer_should_skip - whether to skip a task when determining frozen
+ * state is reached
+ * @p: task in quesion
+ *
+ * This function is used by freezers after establishing %true freezing() to
+ * test whether a task should be skipped when determining the target frozen
+ * state is reached. IOW, if this function returns %true, @p is considered
+ * frozen enough.
*/
-static inline int freezer_should_skip(struct task_struct *p)
+static inline bool freezer_should_skip(struct task_struct *p)
{
- return !!(p->flags & PF_FREEZER_SKIP);
+ /*
+ * The following smp_mb() paired with the one in freezer_count()
+ * ensures that either freezer_count() sees %true freezing() or we
+ * see cleared %PF_FREEZER_SKIP and return %false. This makes it
+ * impossible for a task to slip frozen state testing after
+ * clearing %PF_FREEZER_SKIP.
+ */
+ smp_mb();
+ return p->flags & PF_FREEZER_SKIP;
}
/*
- * These macros are intended to be used whenever you want allow a task that's
- * sleeping in TASK_UNINTERRUPTIBLE or TASK_KILLABLE state to be frozen. Note
- * that neither return any clear indication of whether a freeze event happened
- * while in this function.
+ * These macros are intended to be used whenever you want allow a sleeping
+ * task to be frozen. Note that neither return any clear indication of
+ * whether a freeze event happened while in this function.
*/
/* Like schedule(), but should not block the freezer. */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24f724620dd..3070164e2036 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -171,8 +171,8 @@ struct css_id {
* The css to which this ID points. This pointer is set to valid value
* after cgroup is populated. If cgroup is removed, this will be NULL.
* This pointer is expected to be RCU-safe because destroy()
- * is called after synchronize_rcu(). But for safe use, css_is_removed()
- * css_tryget() should be used for avoiding race.
+ * is called after synchronize_rcu(). But for safe use, css_tryget()
+ * should be used for avoiding race.
*/
struct cgroup_subsys_state __rcu *css;
/*
@@ -854,30 +854,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
return inode;
}
-/*
- * Call subsys's pre_destroy handler.
- * This is called before css refcnt check.
- */
-static int cgroup_call_pre_destroy(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- int ret = 0;
-
- for_each_subsys(cgrp->root, ss) {
- if (!ss->pre_destroy)
- continue;
-
- ret = ss->pre_destroy(cgrp);
- if (ret) {
- /* ->pre_destroy() failure is being deprecated */
- WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
- break;
- }
- }
-
- return ret;
-}
-
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -1015,33 +991,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
}
/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
-
-/*
* Call with cgroup_mutex held. Drops reference counts on modules, including
* any duplicate ones that parse_cgroupfs_options took. If this function
* returns an error, no reference counts are touched.
@@ -2025,12 +1974,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
}
synchronize_rcu();
-
- /*
- * wake up rmdir() waiter. the rmdir should fail since the cgroup
- * is no longer empty.
- */
- cgroup_wakeup_rmdir_waiter(cgrp);
out:
if (retval) {
for_each_subsys(root, ss) {
@@ -2200,7 +2143,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
* step 5: success! and cleanup
*/
synchronize_rcu();
- cgroup_wakeup_rmdir_waiter(cgrp);
retval = 0;
out_put_css_set_refs:
if (retval) {
@@ -4022,14 +3964,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
cgrp->subsys[ss->subsys_id] = css;
/*
- * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
- * which is put on the last css_put(). dput() requires process
- * context, which css_put() may be called without. @css->dput_work
- * will be used to invoke dput() asynchronously from css_put().
+ * css holds an extra ref to @cgrp->dentry which is put on the last
+ * css_put(). dput() requires process context, which css_put() may
+ * be called without. @css->dput_work will be used to invoke
+ * dput() asynchronously from css_put().
*/
INIT_WORK(&css->dput_work, css_dput_fn);
- if (ss->__DEPRECATED_clear_css_refs)
- set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
}
/*
@@ -4053,6 +3993,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (!cgrp)
return -ENOMEM;
+ /*
+ * Only live parents can have children. Note that the liveliness
+ * check isn't strictly necessary because cgroup_mkdir() and
+ * cgroup_rmdir() are fully synchronized by i_mutex; however, do it
+ * anyway so that locking is contained inside cgroup proper and we
+ * don't get nasty surprises if we ever grow another caller.
+ */
+ if (!cgroup_lock_live_group(parent)) {
+ err = -ENODEV;
+ goto err_free;
+ }
+
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
@@ -4060,8 +4012,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
* fs */
atomic_inc(&sb->s_active);
- mutex_lock(&cgroup_mutex);
-
init_cgroup_housekeeping(cgrp);
cgrp->parent = parent;
@@ -4109,10 +4059,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (err < 0)
goto err_remove;
- /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
+ /* each css holds a ref to the cgroup's dentry */
for_each_subsys(root, ss)
- if (!ss->__DEPRECATED_clear_css_refs)
- dget(dentry);
+ dget(dentry);
/* The cgroup directory was pre-locked for us */
BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
@@ -4143,7 +4092,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
-
+err_free:
kfree(cgrp);
return err;
}
@@ -4197,71 +4146,6 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
return 0;
}
-/*
- * Atomically mark all (or else none) of the cgroup's CSS objects as
- * CSS_REMOVED. Return true on success, or false if the cgroup has
- * busy subsystems. Call with cgroup_mutex held
- *
- * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
- * not, cgroup removal behaves differently.
- *
- * If clear is set, css refcnt for the subsystem should be zero before
- * cgroup removal can be committed. This is implemented by
- * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
- * called multiple times until all css refcnts reach zero and is allowed to
- * veto removal on any invocation. This behavior is deprecated and will be
- * removed as soon as the existing user (memcg) is updated.
- *
- * If clear is not set, each css holds an extra reference to the cgroup's
- * dentry and cgroup removal proceeds regardless of css refs.
- * ->pre_destroy() will be called at least once and is not allowed to fail.
- * On the last put of each css, whenever that may be, the extra dentry ref
- * is put so that dentry destruction happens only after all css's are
- * released.
- */
-static int cgroup_clear_css_refs(struct cgroup *cgrp)
-{
- struct cgroup_subsys *ss;
- unsigned long flags;
- bool failed = false;
-
- local_irq_save(flags);
-
- /*
- * Block new css_tryget() by deactivating refcnt. If all refcnts
- * for subsystems w/ clear_css_refs set were 1 at the moment of
- * deactivation, we succeeded.
- */
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
- WARN_ON(atomic_read(&css->refcnt) < 0);
- atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-
- if (ss->__DEPRECATED_clear_css_refs)
- failed |= css_refcnt(css) != 1;
- }
-
- /*
- * If succeeded, set REMOVED and put all the base refs; otherwise,
- * restore refcnts to positive values. Either way, all in-progress
- * css_tryget() will be released.
- */
- for_each_subsys(cgrp->root, ss) {
- struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-
- if (!failed) {
- set_bit(CSS_REMOVED, &css->flags);
- css_put(css);
- } else {
- atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
- }
- }
-
- local_irq_restore(flags);
- return !failed;
-}
-
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cgroup *cgrp = dentry->d_fsdata;
@@ -4269,70 +4153,52 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
struct cgroup *parent;
DEFINE_WAIT(wait);
struct cgroup_event *event, *tmp;
- int ret;
+ struct cgroup_subsys *ss;
/* the vfs holds both inode->i_mutex already */
-again:
mutex_lock(&cgroup_mutex);
- if (atomic_read(&cgrp->count) != 0) {
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- if (!list_empty(&cgrp->children)) {
+ parent = cgrp->parent;
+ if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
- mutex_unlock(&cgroup_mutex);
/*
- * In general, subsystem has no css->refcnt after pre_destroy(). But
- * in racy cases, subsystem may have to get css->refcnt after
- * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
- * make rmdir return -EBUSY too often. To avoid that, we use waitqueue
- * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
- * and subsystem's reference count handling. Please see css_get/put
- * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
+ * Block new css_tryget() by deactivating refcnt and mark @cgrp
+ * removed. This makes future css_tryget() and child creation
+ * attempts fail thus maintaining the removal conditions verified
+ * above.
*/
- set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+ for_each_subsys(cgrp->root, ss) {
+ struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
- /*
- * Call pre_destroy handlers of subsys. Notify subsystems
- * that rmdir() request comes.
- */
- ret = cgroup_call_pre_destroy(cgrp);
- if (ret) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- return ret;
+ WARN_ON(atomic_read(&css->refcnt) < 0);
+ atomic_add(CSS_DEACT_BIAS, &css->refcnt);
}
+ set_bit(CGRP_REMOVED, &cgrp->flags);
+ /*
+ * Tell subsystems to initate destruction. pre_destroy() should be
+ * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix
+ * potential deadlock in pre_destroy") for details.
+ */
+ mutex_unlock(&cgroup_mutex);
+ for_each_subsys(cgrp->root, ss)
+ if (ss->pre_destroy)
+ ss->pre_destroy(cgrp);
mutex_lock(&cgroup_mutex);
- parent = cgrp->parent;
- if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- mutex_unlock(&cgroup_mutex);
- return -EBUSY;
- }
- prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
- if (!cgroup_clear_css_refs(cgrp)) {
- mutex_unlock(&cgroup_mutex);
- /*
- * Because someone may call cgroup_wakeup_rmdir_waiter() before
- * prepare_to_wait(), we need to check this flag.
- */
- if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
- schedule();
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- if (signal_pending(current))
- return -EINTR;
- goto again;
- }
- /* NO css_tryget() can success after here. */
- finish_wait(&cgroup_rmdir_waitq, &wait);
- clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+
+ /*
+ * Put all the base refs. Each css holds an extra reference to the
+ * cgroup's dentry and cgroup removal proceeds regardless of css
+ * refs. On the last put of each css, whenever that may be, the
+ * extra dentry ref is put so that dentry destruction happens only
+ * after all css's are released.
+ */
+ for_each_subsys(cgrp->root, ss)
+ css_put(cgrp->subsys[ss->subsys_id]);
raw_spin_lock(&release_list_lock);
- set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
@@ -4832,44 +4698,19 @@ void cgroup_fork(struct task_struct *child)
}
/**
- * cgroup_fork_callbacks - run fork callbacks
- * @child: the new task
- *
- * Called on a new task very soon before adding it to the
- * tasklist. No need to take any locks since no-one can
- * be operating on this task.
- */
-void cgroup_fork_callbacks(struct task_struct *child)
-{
- if (need_forkexit_callback) {
- int i;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
-
- /*
- * forkexit callbacks are only supported for
- * builtin subsystems.
- */
- if (!ss || ss->module)
- continue;
-
- if (ss->fork)
- ss->fork(child);
- }
- }
-}
-
-/**
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
- * Adds the task to the list running through its css_set if necessary.
- * Has to be after the task is visible on the task list in case we race
- * with the first call to cgroup_iter_start() - to guarantee that the
- * new task ends up on its list.
+ * Adds the task to the list running through its css_set if necessary and
+ * call the subsystem fork() callbacks. Has to be after the task is
+ * visible on the task list in case we race with the first call to
+ * cgroup_iter_start() - to guarantee that the new task ends up on its
+ * list.
*/
void cgroup_post_fork(struct task_struct *child)
{
+ int i;
+
/*
* use_task_css_set_links is set to 1 before we walk the tasklist
* under the tasklist_lock and we read it here after we added the child
@@ -4889,7 +4730,30 @@ void cgroup_post_fork(struct task_struct *child)
task_unlock(child);
write_unlock(&css_set_lock);
}
+
+ /*
+ * Call ss->fork(). This must happen after @child is linked on
+ * css_set; otherwise, @child might change state between ->fork()
+ * and addition to css_set.
+ */
+ if (need_forkexit_callback) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+
+ /*
+ * fork/exit callbacks are supported only for
+ * builtin subsystems and we don't need further
+ * synchronization as they never go away.
+ */
+ if (!ss || ss->module)
+ continue;
+
+ if (ss->fork)
+ ss->fork(child);
+ }
+ }
}
+
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
@@ -5022,15 +4886,17 @@ static void check_for_release(struct cgroup *cgrp)
/* Caller must verify that the css is not for root cgroup */
bool __css_tryget(struct cgroup_subsys_state *css)
{
- do {
- int v = css_refcnt(css);
+ while (true) {
+ int t, v;
- if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+ v = css_refcnt(css);
+ t = atomic_cmpxchg(&css->refcnt, v, v + 1);
+ if (likely(t == v))
return true;
+ else if (t < 0)
+ return false;
cpu_relax();
- } while (!test_bit(CSS_REMOVED, &css->flags));
-
- return false;
+ }
}
EXPORT_SYMBOL_GPL(__css_tryget);
@@ -5049,11 +4915,9 @@ void __css_put(struct cgroup_subsys_state *css)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
- cgroup_wakeup_rmdir_waiter(cgrp);
break;
case 0:
- if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
- schedule_work(&css->dput_work);
+ schedule_work(&css->dput_work);
break;
}
rcu_read_unlock();
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index b1724ce98981..bedefd9a22df 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -84,50 +84,6 @@ static const char *freezer_state_strs[] = {
struct cgroup_subsys freezer_subsys;
-/* Locks taken and their ordering
- * ------------------------------
- * cgroup_mutex (AKA cgroup_lock)
- * freezer->lock
- * css_set_lock
- * task->alloc_lock (AKA task_lock)
- * task->sighand->siglock
- *
- * cgroup code forces css_set_lock to be taken before task->alloc_lock
- *
- * freezer_create(), freezer_destroy():
- * cgroup_mutex [ by cgroup core ]
- *
- * freezer_can_attach():
- * cgroup_mutex (held by caller of can_attach)
- *
- * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
- * freezer->lock
- * sighand->siglock (if the cgroup is freezing)
- *
- * freezer_read():
- * cgroup_mutex
- * freezer->lock
- * write_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock
- * read_lock css_set_lock (cgroup iterator start)
- *
- * freezer_write() (freeze):
- * cgroup_mutex
- * freezer->lock
- * write_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock
- * read_lock css_set_lock (cgroup iterator start)
- * sighand->siglock (fake signal delivery inside freeze_task())
- *
- * freezer_write() (unfreeze):
- * cgroup_mutex
- * freezer->lock
- * write_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock
- * read_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
- * sighand->siglock
- */
static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup)
{
struct freezer *freezer;
@@ -150,96 +106,105 @@ static void freezer_destroy(struct cgroup *cgroup)
kfree(freezer);
}
-/* task is frozen or will freeze immediately when next it gets woken */
-static bool is_task_frozen_enough(struct task_struct *task)
-{
- return frozen(task) ||
- (task_is_stopped_or_traced(task) && freezing(task));
-}
-
/*
- * The call to cgroup_lock() in the freezer.state write method prevents
- * a write to that file racing against an attach, and hence the
- * can_attach() result will remain valid until the attach completes.
+ * Tasks can be migrated into a different freezer anytime regardless of its
+ * current state. freezer_attach() is responsible for making new tasks
+ * conform to the current state.
+ *
+ * Freezer state changes and task migration are synchronized via
+ * @freezer->lock. freezer_attach() makes the new tasks conform to the
+ * current state and all following state changes can see the new tasks.
*/
-static int freezer_can_attach(struct cgroup *new_cgroup,
- struct cgroup_taskset *tset)
+static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset)
{
- struct freezer *freezer;
+ struct freezer *freezer = cgroup_freezer(new_cgrp);
struct task_struct *task;
+ spin_lock_irq(&freezer->lock);
+
/*
- * Anything frozen can't move or be moved to/from.
+ * Make the new tasks conform to the current state of @new_cgrp.
+ * For simplicity, when migrating any task to a FROZEN cgroup, we
+ * revert it to FREEZING and let update_if_frozen() determine the
+ * correct state later.
+ *
+ * Tasks in @tset are on @new_cgrp but may not conform to its
+ * current state before executing the following - !frozen tasks may
+ * be visible in a FROZEN cgroup and frozen tasks in a THAWED one.
+ * This means that, to determine whether to freeze, one should test
+ * whether the state equals THAWED.
*/
- cgroup_taskset_for_each(task, new_cgroup, tset)
- if (cgroup_freezing(task))
- return -EBUSY;
-
- freezer = cgroup_freezer(new_cgroup);
- if (freezer->state != CGROUP_THAWED)
- return -EBUSY;
+ cgroup_taskset_for_each(task, new_cgrp, tset) {
+ if (freezer->state == CGROUP_THAWED) {
+ __thaw_task(task);
+ } else {
+ freeze_task(task);
+ freezer->state = CGROUP_FREEZING;
+ }
+ }
- return 0;
+ spin_unlock_irq(&freezer->lock);
}
static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
- /*
- * No lock is needed, since the task isn't on tasklist yet,
- * so it can't be moved to another cgroup, which means the
- * freezer won't be removed and will be valid during this
- * function call. Nevertheless, apply RCU read-side critical
- * section to suppress RCU lockdep false positives.
- */
rcu_read_lock();
freezer = task_freezer(task);
- rcu_read_unlock();
/*
* The root cgroup is non-freezable, so we can skip the
* following check.
*/
if (!freezer->css.cgroup->parent)
- return;
+ goto out;
spin_lock_irq(&freezer->lock);
- BUG_ON(freezer->state == CGROUP_FROZEN);
-
- /* Locking avoids race with FREEZING -> THAWED transitions. */
- if (freezer->state == CGROUP_FREEZING)
+ /*
+ * @task might have been just migrated into a FROZEN cgroup. Test
+ * equality with THAWED. Read the comment in freezer_attach().
+ */
+ if (freezer->state != CGROUP_THAWED)
freeze_task(task);
spin_unlock_irq(&freezer->lock);
+out:
+ rcu_read_unlock();
}
/*
- * caller must hold freezer->lock
+ * We change from FREEZING to FROZEN lazily if the cgroup was only
+ * partially frozen when we exitted write. Caller must hold freezer->lock.
+ *
+ * Task states and freezer state might disagree while tasks are being
+ * migrated into or out of @cgroup, so we can't verify task states against
+ * @freezer state here. See freezer_attach() for details.
*/
-static void update_if_frozen(struct cgroup *cgroup,
- struct freezer *freezer)
+static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer)
{
struct cgroup_iter it;
struct task_struct *task;
- unsigned int nfrozen = 0, ntotal = 0;
- enum freezer_state old_state = freezer->state;
+
+ if (freezer->state != CGROUP_FREEZING)
+ return;
cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- ntotal++;
- if (freezing(task) && is_task_frozen_enough(task))
- nfrozen++;
- }
- if (old_state == CGROUP_THAWED) {
- BUG_ON(nfrozen > 0);
- } else if (old_state == CGROUP_FREEZING) {
- if (nfrozen == ntotal)
- freezer->state = CGROUP_FROZEN;
- } else { /* old_state == CGROUP_FROZEN */
- BUG_ON(nfrozen != ntotal);
+ while ((task = cgroup_iter_next(cgroup, &it))) {
+ if (freezing(task)) {
+ /*
+ * freezer_should_skip() indicates that the task
+ * should be skipped when determining freezing
+ * completion. Consider it frozen in addition to
+ * the usual frozen condition.
+ */
+ if (!frozen(task) && !freezer_should_skip(task))
+ goto notyet;
+ }
}
+ freezer->state = CGROUP_FROZEN;
+notyet:
cgroup_iter_end(cgroup, &it);
}
@@ -249,44 +214,26 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
struct freezer *freezer;
enum freezer_state state;
- if (!cgroup_lock_live_group(cgroup))
- return -ENODEV;
-
freezer = cgroup_freezer(cgroup);
spin_lock_irq(&freezer->lock);
+ update_if_frozen(cgroup, freezer);
state = freezer->state;
- if (state == CGROUP_FREEZING) {
- /* We change from FREEZING to FROZEN lazily if the cgroup was
- * only partially frozen when we exitted write. */
- update_if_frozen(cgroup, freezer);
- state = freezer->state;
- }
spin_unlock_irq(&freezer->lock);
- cgroup_unlock();
seq_puts(m, freezer_state_strs[state]);
seq_putc(m, '\n');
return 0;
}
-static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
+static void freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
{
struct cgroup_iter it;
struct task_struct *task;
- unsigned int num_cant_freeze_now = 0;
cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- if (!freeze_task(task))
- continue;
- if (is_task_frozen_enough(task))
- continue;
- if (!freezing(task) && !freezer_should_skip(task))
- num_cant_freeze_now++;
- }
+ while ((task = cgroup_iter_next(cgroup, &it)))
+ freeze_task(task);
cgroup_iter_end(cgroup, &it);
-
- return num_cant_freeze_now ? -EBUSY : 0;
}
static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
@@ -300,18 +247,14 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
cgroup_iter_end(cgroup, &it);
}
-static int freezer_change_state(struct cgroup *cgroup,
- enum freezer_state goal_state)
+static void freezer_change_state(struct cgroup *cgroup,
+ enum freezer_state goal_state)
{
- struct freezer *freezer;
- int retval = 0;
-
- freezer = cgroup_freezer(cgroup);
+ struct freezer *freezer = cgroup_freezer(cgroup);
+ /* also synchronizes against task migration, see freezer_attach() */
spin_lock_irq(&freezer->lock);
- update_if_frozen(cgroup, freezer);
-
switch (goal_state) {
case CGROUP_THAWED:
if (freezer->state != CGROUP_THAWED)
@@ -323,22 +266,19 @@ static int freezer_change_state(struct cgroup *cgroup,
if (freezer->state == CGROUP_THAWED)
atomic_inc(&system_freezing_cnt);
freezer->state = CGROUP_FREEZING;
- retval = try_to_freeze_cgroup(cgroup, freezer);
+ freeze_cgroup(cgroup, freezer);
break;
default:
BUG();
}
spin_unlock_irq(&freezer->lock);
-
- return retval;
}
static int freezer_write(struct cgroup *cgroup,
struct cftype *cft,
const char *buffer)
{
- int retval;
enum freezer_state goal_state;
if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
@@ -348,11 +288,8 @@ static int freezer_write(struct cgroup *cgroup,
else
return -EINVAL;
- if (!cgroup_lock_live_group(cgroup))
- return -ENODEV;
- retval = freezer_change_state(cgroup, goal_state);
- cgroup_unlock();
- return retval;
+ freezer_change_state(cgroup, goal_state);
+ return 0;
}
static struct cftype files[] = {
@@ -370,7 +307,7 @@ struct cgroup_subsys freezer_subsys = {
.create = freezer_create,
.destroy = freezer_destroy,
.subsys_id = freezer_subsys_id,
- .can_attach = freezer_can_attach,
+ .attach = freezer_attach,
.fork = freezer_fork,
.base_cftypes = files,
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b20ab7d3aa2..acc4cb62f32f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1135,7 +1135,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
{
int retval;
struct task_struct *p;
- int cgroup_callbacks_done = 0;
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1393,12 +1392,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
- /* Now that the task is set up, run cgroup callbacks if
- * necessary. We need to run them before the task is visible
- * on the tasklist. */
- cgroup_fork_callbacks(p);
- cgroup_callbacks_done = 1;
-
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
@@ -1503,7 +1496,7 @@ bad_fork_cleanup_cgroup:
#endif
if (clone_flags & CLONE_THREAD)
threadgroup_change_end(current);
- cgroup_exit(p, cgroup_callbacks_done);
+ cgroup_exit(p, 0);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 11f82a4d4eae..c38893b0efba 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p)
return false;
}
- if (!(p->flags & PF_KTHREAD)) {
+ if (!(p->flags & PF_KTHREAD))
fake_signal_wake_up(p);
- /*
- * fake_signal_wake_up() goes through p's scheduler
- * lock and guarantees that TASK_STOPPED/TRACED ->
- * TASK_RUNNING transition can't race with task state
- * testing in try_to_freeze_tasks().
- */
- } else {
+ else
wake_up_state(p, TASK_INTERRUPTIBLE);
- }
spin_unlock_irqrestore(&freezer_lock, flags);
return true;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 87da817f9e13..d5a258b60c6f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only)
if (p == current || !freeze_task(p))
continue;
- /*
- * Now that we've done set_freeze_flag, don't
- * perturb a task in TASK_STOPPED or TASK_TRACED.
- * It is "frozen enough". If the task does wake
- * up, it will immediately call try_to_freeze.
- *
- * Because freeze_task() goes through p's scheduler lock, it's
- * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING
- * transition can't race with task state testing here.
- */
- if (!task_is_stopped_or_traced(p) &&
- !freezer_should_skip(p))
+ if (!freezer_should_skip(p))
todo++;
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
diff --git a/kernel/signal.c b/kernel/signal.c
index 0af8868525d6..5ffb5626e072 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1908,7 +1908,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
preempt_disable();
read_unlock(&tasklist_lock);
preempt_enable_no_resched();
- schedule();
+ freezable_schedule();
} else {
/*
* By the time we got the lock, our tracer went away.
@@ -1930,13 +1930,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
}
/*
- * While in TASK_TRACED, we were considered "frozen enough".
- * Now that we woke up, it's crucial if we're supposed to be
- * frozen that we freeze now before running anything substantial.
- */
- try_to_freeze();
-
- /*
* We are back. Now reacquire the siglock before touching
* last_siginfo, so that we are sure to have synchronized with
* any signal-sending on another CPU that wants to examine it.
@@ -2092,7 +2085,7 @@ static bool do_signal_stop(int signr)
}
/* Now we don't run again until woken by SIGCONT or SIGKILL */
- schedule();
+ freezable_schedule();
return true;
} else {
/*
@@ -2200,15 +2193,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
if (unlikely(uprobe_deny_signal()))
return 0;
-relock:
/*
- * We'll jump back here after any time we were stopped in TASK_STOPPED.
- * While in TASK_STOPPED, we were considered "frozen enough".
- * Now that we woke up, it's crucial if we're supposed to be
- * frozen that we freeze now before running anything substantial.
+ * Do this once, we can't return to user-mode if freezing() == T.
+ * do_signal_stop() and ptrace_stop() do freezable_schedule() and
+ * thus do not need another check after return.
*/
try_to_freeze();
+relock:
spin_lock_irq(&sighand->siglock);
/*
* Every stopped thread goes here after wakeup. Check to see if
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index a3f358fb8a0c..0d3a1a317731 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -155,18 +155,13 @@ out:
* Force the hugetlb cgroup to empty the hugetlb resources by moving them to
* the parent cgroup.
*/
-static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
+static void hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
{
struct hstate *h;
struct page *page;
- int ret = 0, idx = 0;
+ int idx = 0;
do {
- if (cgroup_task_count(cgroup) ||
- !list_empty(&cgroup->children)) {
- ret = -EBUSY;
- goto out;
- }
for_each_hstate(h) {
spin_lock(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
@@ -177,8 +172,6 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
}
cond_resched();
} while (hugetlb_cgroup_have_usage(cgroup));
-out:
- return ret;
}
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7acf43bf04a2..08adaaae6fcc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2337,7 +2337,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
again:
if (*ptr) { /* css should be a valid one */
memcg = *ptr;
- VM_BUG_ON(css_is_removed(&memcg->css));
if (mem_cgroup_is_root(memcg))
goto done;
if (nr_pages == 1 && consume_stock(memcg))
@@ -2477,9 +2476,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
/*
* A helper function to get mem_cgroup from ID. must be called under
- * rcu_read_lock(). The caller must check css_is_removed() or some if
- * it's concern. (dropping refcnt from swap can be called against removed
- * memcg.)
+ * rcu_read_lock(). The caller is responsible for calling css_tryget if
+ * the mem_cgroup is used for charging. (dropping refcnt from swap can be
+ * called against removed memcg.)
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
@@ -2676,13 +2675,6 @@ static int mem_cgroup_move_account(struct page *page,
/* caller should have done css_get */
pc->mem_cgroup = to;
mem_cgroup_charge_statistics(to, anon, nr_pages);
- /*
- * We charges against "to" which may not have any tasks. Then, "to"
- * can be under rmdir(). But in current implementation, caller of
- * this function is just force_empty() and move charge, so it's
- * guaranteed that "to" is never removed. So, we don't check rmdir
- * status here.
- */
move_unlock_mem_cgroup(from, &flags);
ret = 0;
unlock:
@@ -2696,10 +2688,27 @@ out:
return ret;
}
-/*
- * move charges to its parent.
+/**
+ * mem_cgroup_move_parent - moves page to the parent group
+ * @page: the page to move
+ * @pc: page_cgroup of the page
+ * @child: page's cgroup
+ *
+ * move charges to its parent or the root cgroup if the group has no
+ * parent (aka use_hierarchy==0).
+ * Although this might fail (get_page_unless_zero, isolate_lru_page or
+ * mem_cgroup_move_account fails) the failure is always temporary and
+ * it signals a race with a page removal/uncharge or migration. In the
+ * first case the page is on the way out and it will vanish from the LRU
+ * on the next attempt and the call should be retried later.
+ * Isolation from the LRU fails only if page has been isolated from
+ * the LRU since we looked at it and that usually means either global
+ * reclaim or migration going on. The page will either get back to the
+ * LRU or vanish.
+ * Finaly mem_cgroup_move_account fails only if the page got uncharged
+ * (!PageCgroupUsed) or moved to a different group. The page will
+ * disappear in the next attempt.
*/
-
static int mem_cgroup_move_parent(struct page *page,
struct page_cgroup *pc,
struct mem_cgroup *child)
@@ -2709,9 +2718,7 @@ static int mem_cgroup_move_parent(struct page *page,
unsigned long uninitialized_var(flags);
int ret;
- /* Is ROOT ? */
- if (mem_cgroup_is_root(child))
- return -EINVAL;
+ VM_BUG_ON(mem_cgroup_is_root(child));
ret = -EBUSY;
if (!get_page_unless_zero(page))
@@ -2728,8 +2735,10 @@ static int mem_cgroup_move_parent(struct page *page,
if (!parent)
parent = root_mem_cgroup;
- if (nr_pages > 1)
+ if (nr_pages > 1) {
+ VM_BUG_ON(!PageTransHuge(page));
flags = compound_lock_irqsave(page);
+ }
ret = mem_cgroup_move_account(page, nr_pages,
pc, child, parent);
@@ -2871,7 +2880,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
return;
if (!memcg)
return;
- cgroup_exclude_rmdir(&memcg->css);
__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
/*
@@ -2885,12 +2893,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
swp_entry_t ent = {.val = page_private(page)};
mem_cgroup_uncharge_swap(ent);
}
- /*
- * At swapin, we may charge account against cgroup which has no tasks.
- * So, rmdir()->pre_destroy() can be called while we do this charge.
- * In that case, we need to call pre_destroy() again. check it here.
- */
- cgroup_release_and_wakeup_rmdir(&memcg->css);
}
void mem_cgroup_commit_charge_swapin(struct page *page,
@@ -3338,8 +3340,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
if (!memcg)
return;
- /* blocks rmdir() */
- cgroup_exclude_rmdir(&memcg->css);
+
if (!migration_ok) {
used = oldpage;
unused = newpage;
@@ -3373,13 +3374,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
*/
if (anon)
mem_cgroup_uncharge_page(used);
- /*
- * At migration, we may charge account against cgroup which has no
- * tasks.
- * So, rmdir()->pre_destroy() can be called while we do this charge.
- * In that case, we need to call pre_destroy() again. check it here.
- */
- cgroup_release_and_wakeup_rmdir(&memcg->css);
}
/*
@@ -3679,17 +3673,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
return nr_reclaimed;
}
-/*
+/**
+ * mem_cgroup_force_empty_list - clears LRU of a group
+ * @memcg: group to clear
+ * @node: NUMA node
+ * @zid: zone id
+ * @lru: lru to to clear
+ *
* Traverse a specified page_cgroup list and try to drop them all. This doesn't
- * reclaim the pages page themselves - it just removes the page_cgroups.
- * Returns true if some page_cgroups were not freed, indicating that the caller
- * must retry this operation.
+ * reclaim the pages page themselves - pages are moved to the parent (or root)
+ * group.
*/
-static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
+static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
int node, int zid, enum lru_list lru)
{
struct mem_cgroup_per_zone *mz;
- unsigned long flags, loop;
+ unsigned long flags;
struct list_head *list;
struct page *busy;
struct zone *zone;
@@ -3698,11 +3697,8 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
mz = mem_cgroup_zoneinfo(memcg, node, zid);
list = &mz->lruvec.lists[lru];
- loop = mz->lru_size[lru];
- /* give some margin against EBUSY etc...*/
- loop += 256;
busy = NULL;
- while (loop--) {
+ do {
struct page_cgroup *pc;
struct page *page;
@@ -3728,76 +3724,72 @@ static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
cond_resched();
} else
busy = NULL;
- }
- return !list_empty(list);
+ } while (!list_empty(list));
}
/*
- * make mem_cgroup's charge to be 0 if there is no task.
+ * make mem_cgroup's charge to be 0 if there is no task by moving
+ * all the charges and pages to the parent.
* This enables deleting this mem_cgroup.
+ *
+ * Caller is responsible for holding css reference on the memcg.
*/
-static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
+static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
{
- int ret;
- int node, zid, shrink;
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct cgroup *cgrp = memcg->css.cgroup;
-
- css_get(&memcg->css);
+ int node, zid;
- shrink = 0;
- /* should free all ? */
- if (free_all)
- goto try_to_free;
-move_account:
do {
- ret = -EBUSY;
- if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
- goto out;
/* This is for making all *used* pages to be on LRU. */
lru_add_drain_all();
drain_all_stock_sync(memcg);
- ret = 0;
mem_cgroup_start_move(memcg);
for_each_node_state(node, N_HIGH_MEMORY) {
- for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
enum lru_list lru;
for_each_lru(lru) {
- ret = mem_cgroup_force_empty_list(memcg,
+ mem_cgroup_force_empty_list(memcg,
node, zid, lru);
- if (ret)
- break;
}
}
- if (ret)
- break;
}
mem_cgroup_end_move(memcg);
memcg_oom_recover(memcg);
cond_resched();
- /* "ret" should also be checked to ensure all lists are empty. */
- } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
-out:
- css_put(&memcg->css);
- return ret;
-try_to_free:
+ /*
+ * This is a safety check because mem_cgroup_force_empty_list
+ * could have raced with mem_cgroup_replace_page_cache callers
+ * so the lru seemed empty but the page could have been added
+ * right after the check. RES_USAGE should be safe as we always
+ * charge before adding to the LRU.
+ */
+ } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0);
+}
+
+/*
+ * Reclaims as many pages from the given memcg as possible and moves
+ * the rest to the parent.
+ *
+ * Caller is responsible for holding css reference for memcg.
+ */
+static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
+{
+ int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct cgroup *cgrp = memcg->css.cgroup;
+
/* returns EBUSY if there is a task or if we come here twice. */
- if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
- ret = -EBUSY;
- goto out;
- }
+ if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
+ return -EBUSY;
+
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
/* try to free all pages in this cgroup */
- shrink = 1;
while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
int progress;
- if (signal_pending(current)) {
- ret = -EINTR;
- goto out;
- }
+ if (signal_pending(current))
+ return -EINTR;
+
progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
false);
if (!progress) {
@@ -3808,13 +3800,23 @@ try_to_free:
}
lru_add_drain();
- /* try move_account...there may be some *locked* pages. */
- goto move_account;
+ mem_cgroup_reparent_charges(memcg);
+
+ return 0;
}
static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
{
- return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ int ret;
+
+ if (mem_cgroup_is_root(memcg))
+ return -EINVAL;
+ css_get(&memcg->css);
+ ret = mem_cgroup_force_empty(memcg);
+ css_put(&memcg->css);
+
+ return ret;
}
@@ -5001,11 +5003,11 @@ free_out:
return ERR_PTR(error);
}
-static int mem_cgroup_pre_destroy(struct cgroup *cont)
+static void mem_cgroup_pre_destroy(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- return mem_cgroup_force_empty(memcg, false);
+ mem_cgroup_reparent_charges(memcg);
}
static void mem_cgroup_destroy(struct cgroup *cont)
@@ -5607,7 +5609,6 @@ struct cgroup_subsys mem_cgroup_subsys = {
.base_cftypes = mem_cgroup_files,
.early_init = 0,
.use_id = 1,
- .__DEPRECATED_clear_css_refs = true,
};
#ifdef CONFIG_MEMCG_SWAP