summaryrefslogtreecommitdiff
path: root/include/linux/cgroup.h
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2013-06-13 19:39:16 -0700
committerTejun Heo <tj@kernel.org>2013-06-13 19:43:12 -0700
commitd3daf28da16a30af95bfb303189a634a87606725 (patch)
treed73530e2b759b2f180daec0814124e71e8bb149b /include/linux/cgroup.h
parent2b0e53a7c8a6972755c0f0152d7fad2289fdc5eb (diff)
downloadlwn-d3daf28da16a30af95bfb303189a634a87606725.tar.gz
lwn-d3daf28da16a30af95bfb303189a634a87606725.zip
cgroup: use percpu refcnt for cgroup_subsys_states
A css (cgroup_subsys_state) is how each cgroup is represented to a controller. As such, it can be used in hot paths across the various subsystems different controllers are associated with. One of the common operations is reference counting, which up until now has been implemented using a global atomic counter and can have significant adverse impact on scalability. For example, css refcnt can be gotten and put multiple times by blkcg for each IO request. For highops configurations which try to do as much per-cpu as possible, the global frequent refcnting can be very expensive. In general, given the various and hugely diverse paths css's end up being used from, we need to make it cheap and highly scalable. In its usage, css refcnting isn't very different from module refcnting. This patch converts css refcnting to use the recently added percpu_ref. css_get/tryget/put() directly maps to the matching percpu_ref operations and the deactivation logic is no longer necessary as percpu_ref already has refcnt killing. The only complication is that as the refcnt is per-cpu, percpu_ref_kill() in itself doesn't ensure that further tryget operations will fail, which we need to guarantee before invoking ->css_offline()'s. This is resolved collecting kill confirmation using percpu_ref_kill_and_confirm() and initiating the offline phase of destruction after all css refcnt's are confirmed to be seen as killed on all CPUs. The previous patches already splitted destruction into two phases, so percpu_ref_kill_and_confirm() can be hooked up easily. This patch removes css_refcnt() which is used for rcu dereference sanity check in css_id(). While we can add a percpu refcnt API to ask the same question, css_id() itself is scheduled to be removed fairly soon, so let's not bother with it. Just drop the sanity check and use rcu_dereference_raw() instead. v2: - init_cgroup_css() was calling percpu_ref_init() without checking the return value. This causes two problems - the obvious lack of error handling and percpu_ref_init() being called from cgroup_init_subsys() before the allocators are up, which triggers warnings but doesn't cause actual problems as the refcnt isn't used for roots anyway. Fix both by moving percpu_ref_init() to cgroup_create(). - The base references were put too early by percpu_ref_kill_and_confirm() and cgroup_offline_fn() put the refs one extra time. This wasn't noticeable because css's go through another RCU grace period before being freed. Update cgroup_destroy_locked() to grab an extra reference before killing the refcnts. This problem was noticed by Kent. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Kent Overstreet <koverstreet@google.com> Acked-by: Li Zefan <lizefan@huawei.com> Cc: Michal Hocko <mhocko@suse.cz> Cc: Mike Snitzer <snitzer@redhat.com> Cc: Vivek Goyal <vgoyal@redhat.com> Cc: "Alasdair G. Kergon" <agk@redhat.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Mikulas Patocka <mpatocka@redhat.com> Cc: Glauber Costa <glommer@gmail.com>
Diffstat (limited to 'include/linux/cgroup.h')
-rw-r--r--include/linux/cgroup.h23
1 files changed, 8 insertions, 15 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e345d8b90046..b7bd4beae294 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -20,6 +20,7 @@
#include <linux/workqueue.h>
#include <linux/xattr.h>
#include <linux/fs.h>
+#include <linux/percpu-refcount.h>
#ifdef CONFIG_CGROUPS
@@ -72,13 +73,8 @@ struct cgroup_subsys_state {
*/
struct cgroup *cgroup;
- /*
- * State maintained by the cgroup system to allow subsystems
- * to be "busy". Should be accessed via css_get(),
- * css_tryget() and css_put().
- */
-
- atomic_t refcnt;
+ /* reference count - access via css_[try]get() and css_put() */
+ struct percpu_ref refcnt;
unsigned long flags;
/* ID for this css, if possible */
@@ -104,11 +100,9 @@ static inline void css_get(struct cgroup_subsys_state *css)
{
/* We don't need to reference count the root state */
if (!(css->flags & CSS_ROOT))
- atomic_inc(&css->refcnt);
+ percpu_ref_get(&css->refcnt);
}
-extern bool __css_tryget(struct cgroup_subsys_state *css);
-
/**
* css_tryget - try to obtain a reference on the specified css
* @css: target css
@@ -123,11 +117,9 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
{
if (css->flags & CSS_ROOT)
return true;
- return __css_tryget(css);
+ return percpu_ref_tryget(&css->refcnt);
}
-extern void __css_put(struct cgroup_subsys_state *css);
-
/**
* css_put - put a css reference
* @css: target css
@@ -137,7 +129,7 @@ extern void __css_put(struct cgroup_subsys_state *css);
static inline void css_put(struct cgroup_subsys_state *css)
{
if (!(css->flags & CSS_ROOT))
- __css_put(css);
+ percpu_ref_put(&css->refcnt);
}
/* bits in struct cgroup flags field */
@@ -231,9 +223,10 @@ struct cgroup {
struct list_head pidlists;
struct mutex pidlist_mutex;
- /* For RCU-protected deletion */
+ /* For css percpu_ref killing and RCU-protected deletion */
struct rcu_head rcu_head;
struct work_struct destroy_work;
+ atomic_t css_kill_cnt;
/* List of events which userspace want to receive */
struct list_head event_list;