summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Tejun Heo <tj@kernel.org> 2013-11-29 10:42:59 -0500
committer Tejun Heo <tj@kernel.org> 2013-11-29 10:42:59 -0500
commit afb2bc14e1c989cf0635bd04edb5ff55b8c1c7bd (patch)
tree f7b3984bb26c3230d96a726f1af68bf882175c6a
parent 045023658ca1e30dc0bb1f148b42c95b740d3e02 (diff)
download lwn-afb2bc14e1c989cf0635bd04edb5ff55b8c1c7bd.tar.gz
download lwn-afb2bc14e1c989cf0635bd04edb5ff55b8c1c7bd.zip
cgroup: don't guarantee cgroup.procs is sorted if sane_behavior
For some reason, tasks and cgroup.procs guarantee that the result is sorted. This is the only reason this whole pidlist logic is necessary instead of just iterating through sorted member tasks. We can't do anything about the existing interface but at least ensure that such expectation doesn't exist for the new interface so that pidlist logic may be removed in the distant future.

This patch scrambles the sort order if sane_behavior so that the output is usually not sorted in the new interface.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
-rw-r--r-- include/linux/cgroup.h | 3
-rw-r--r-- kernel/cgroup.c | 51
2 files changed, 49 insertions, 5 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5207c28c2402..50d8cc37498b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -275,6 +275,9 @@ enum {
* - "tasks" is removed. Everything should be at process
* granularity. Use "cgroup.procs" instead.
*
+ * - "cgroup.procs" is not sorted. pids will be unique unless they
+ * got recycled in between reads.
+ *
* - "release_agent" and "notify_on_release" are removed.
* Replacement notification mechanism will be implemented.
*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a2458031d851..f9f5fe3526ac 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3564,11 +3564,49 @@ after:
return dest;
}
+/*
+ * The two pid files - tasks and cgroup.procs - guaranteed that the result
+ * is sorted, which forced this whole pidlist fiasco. As pid order is
+ * different per namespace, each namespace needs a differently sorted list,
+ * making it impossible to use, for example, a single rbtree of member tasks
+ * sorted by task pointer. As pidlists can be fairly large, allocating one
+ * per open file is dangerous, so cgroup had to implement a shared pool of
+ * pidlists keyed by cgroup and namespace.
+ *
+ * All this extra complexity was caused by the original implementation
+ * committing to an entirely unnecessary property. In the long term, we
+ * want to do away with it. Explicitly scramble sort order if
+ * sane_behavior so that no such expectation exists in the new interface.
+ *
+ * Scrambling is done by swapping every two consecutive bits, which is a
+ * non-identity one-to-one mapping that disturbs sort order sufficiently.
+ */
+static pid_t pid_fry(pid_t pid)
+{
+ unsigned a = pid & 0x55555555;
+ unsigned b = pid & 0xAAAAAAAA;
+
+ return (a << 1) | (b >> 1);
+}
+
+static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
+{
+ if (cgroup_sane_behavior(cgrp))
+ return pid_fry(pid);
+ else
+ return pid;
+}
+
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
+static int fried_cmppid(const void *a, const void *b)
+{
+ return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
+}
+
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
enum cgroup_filetype type)
{
@@ -3656,7 +3694,10 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
- sort(array, length, sizeof(pid_t), cmppid, NULL);
+ if (cgroup_sane_behavior(cgrp))
+ sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
+ else
+ sort(array, length, sizeof(pid_t), cmppid, NULL);
if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(array, length);
@@ -3777,10 +3818,10 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
while (index < end) {
int mid = (index + end) / 2;
- if (l->list[mid] == pid) {
+ if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
index = mid;
break;
- } else if (l->list[mid] <= pid)
+ } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
index = mid + 1;
else
end = mid;
@@ -3791,7 +3832,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
return NULL;
/* Update the abstract position to be the actual pid that we found */
iter = l->list + index;
- *pos = *iter;
+ *pos = cgroup_pid_fry(cgrp, *iter);
return iter;
}
@@ -3820,7 +3861,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
if (p >= end) {
return NULL;
} else {
- *pos = *p;
+ *pos = cgroup_pid_fry(of->cgrp, *p);
return p;
}
}