sched: Add 'autogroup' scheduling feature: automated per session task groups

A recurring complaint from CFS users is that parallel kbuild has a negative impact on desktop interactivity. This patch implements an idea from Linus, to automatically create task groups. Currently, only per session autogroups are implemented, but the patch leaves the way open for enhancement. Implementation: each task's signal struct contains an inherited pointer to a refcounted autogroup struct containing a task group pointer, the default for all tasks pointing to the init_task_group. When a task calls setsid(), a new task group is created, the process is moved into the new task group, and a reference to the preveious task group is dropped. Child processes inherit this task group thereafter, and increase it's refcount. When the last thread of a process exits, the process's reference is dropped, such that when the last process referencing an autogroup exits, the autogroup is destroyed. At runqueue selection time, IFF a task has no cgroup assignment, its current autogroup is used. Autogroup bandwidth is controllable via setting it's nice level through the proc filesystem: cat /proc/<pid>/autogroup Displays the task's group and the group's nice level. echo <nice level> > /proc/<pid>/autogroup Sets the task group's shares to the weight of nice <level> task. Setting nice level is rate limited for !admin users due to the abuse risk of task group locking. The feature is enabled from boot by default if CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via the boot option noautogroup, and can also be turned on/off on the fly via: echo [01] > /proc/sys/kernel/sched_autogroup_enabled ... which will automatically move tasks to/from the root task group. Signed-off-by: Mike Galbraith <efault@gmx.de> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Paul Turner <pjt@google.com> Cc: Oleg Nesterov <oleg@redhat.com> [ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ] Signed-off-by: Ingo Molnar <mingo@elte.hu> LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Mike Galbraith <efault@gmx.de> 2010-11-30 14:18:03 +0100
committer: Ingo Molnar <mingo@elte.hu> 2010-11-30 16:03:35 +0100
commit: 5091faa449ee0b7d73bc296a93bca9540fc51d0a (patch)
tree: 55f5e96e189af65c85c769fce48627b8a5abb86b /kernel
parent: 822bc180a7f7a7bc5fcaaea195f41b487cc8cae8 (diff)
download: lwn-5091faa449ee0b7d73bc296a93bca9540fc51d0a.tar.gz
lwn-5091faa449ee0b7d73bc296a93bca9540fc51d0a.zip
7 files changed, 292 insertions, 49 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b159c5991b7..b6f2475f1e83 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
 
 static inline void put_signal_struct(struct signal_struct *sig)
 {
-	if (atomic_dec_and_test(&sig->sigcnt))
+	if (atomic_dec_and_test(&sig->sigcnt)) {
+		sched_autogroup_exit(sig);
 		free_signal_struct(sig);
+	}
 }
 
 void __put_task_struct(struct task_struct *tsk)
@@ -904,6 +906,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	posix_cpu_timers_init_group(sig);
 
 	tty_audit_fork(sig);
+	sched_autogroup_fork(sig);
 
 	sig->oom_adj = current->signal->oom_adj;
 	sig->oom_score_adj = current->signal->oom_score_adj;
diff --git a/kernel/sched.c b/kernel/sched.c
index 66ef5790d932..b646dad4a40e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -79,6 +79,7 @@
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
+#include "sched_autogroup.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -271,6 +272,10 @@ struct task_group {
 	struct task_group *parent;
 	struct list_head siblings;
 	struct list_head children;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+	struct autogroup *autogroup;
+#endif
 };
 
 #define root_task_group init_task_group
@@ -603,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
+	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
-	return container_of(css, struct task_group, css);
+	tg = container_of(css, struct task_group, css);
+
+	return autogroup_task_group(p, tg);
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -1869,6 +1877,7 @@ static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_autogroup.c"
 #include "sched_stoptask.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
@@ -7750,7 +7759,7 @@ void __init sched_init(void)
 #ifdef CONFIG_CGROUP_SCHED
 	list_add(&init_task_group.list, &task_groups);
 	INIT_LIST_HEAD(&init_task_group.children);
-
+	autogroup_init(&init_task);
 #endif /* CONFIG_CGROUP_SCHED */
 
 	for_each_possible_cpu(i) {
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..57a7ac286a02
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,229 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kallsyms.h>
+#include <linux/utsname.h>
+
+unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
+static struct autogroup autogroup_default;
+static atomic_t autogroup_seq_nr;
+
+static void autogroup_init(struct task_struct *init_task)
+{
+	autogroup_default.tg = &init_task_group;
+	init_task_group.autogroup = &autogroup_default;
+	kref_init(&autogroup_default.kref);
+	init_rwsem(&autogroup_default.lock);
+	init_task->signal->autogroup = &autogroup_default;
+}
+
+static inline void autogroup_free(struct task_group *tg)
+{
+	kfree(tg->autogroup);
+}
+
+static inline void autogroup_destroy(struct kref *kref)
+{
+	struct autogroup *ag = container_of(kref, struct autogroup, kref);
+
+	sched_destroy_group(ag->tg);
+}
+
+static inline void autogroup_kref_put(struct autogroup *ag)
+{
+	kref_put(&ag->kref, autogroup_destroy);
+}
+
+static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
+{
+	kref_get(&ag->kref);
+	return ag;
+}
+
+static inline struct autogroup *autogroup_create(void)
+{
+	struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
+	struct task_group *tg;
+
+	if (!ag)
+		goto out_fail;
+
+	tg = sched_create_group(&init_task_group);
+
+	if (IS_ERR(tg))
+		goto out_free;
+
+	kref_init(&ag->kref);
+	init_rwsem(&ag->lock);
+	ag->id = atomic_inc_return(&autogroup_seq_nr);
+	ag->tg = tg;
+	tg->autogroup = ag;
+
+	return ag;
+
+out_free:
+	kfree(ag);
+out_fail:
+	if (printk_ratelimit()) {
+		printk(KERN_WARNING "autogroup_create: %s failure.\n",
+			ag ? "sched_create_group()" : "kmalloc()");
+	}
+
+	return autogroup_kref_get(&autogroup_default);
+}
+
+static inline bool
+task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+{
+	if (tg != &root_task_group)
+		return false;
+
+	if (p->sched_class != &fair_sched_class)
+		return false;
+
+	/*
+	 * We can only assume the task group can't go away on us if
+	 * autogroup_move_group() can see us on ->thread_group list.
+	 */
+	if (p->flags & PF_EXITING)
+		return false;
+
+	return true;
+}
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+	int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+	if (enabled && task_wants_autogroup(p, tg))
+		return p->signal->autogroup->tg;
+
+	return tg;
+}
+
+static void
+autogroup_move_group(struct task_struct *p, struct autogroup *ag)
+{
+	struct autogroup *prev;
+	struct task_struct *t;
+	unsigned long flags;
+
+	BUG_ON(!lock_task_sighand(p, &flags));
+
+	prev = p->signal->autogroup;
+	if (prev == ag) {
+		unlock_task_sighand(p, &flags);
+		return;
+	}
+
+	p->signal->autogroup = autogroup_kref_get(ag);
+
+	t = p;
+	do {
+		sched_move_task(t);
+	} while_each_thread(p, t);
+
+	unlock_task_sighand(p, &flags);
+	autogroup_kref_put(prev);
+}
+
+/* Allocates GFP_KERNEL, cannot be called under any spinlock */
+void sched_autogroup_create_attach(struct task_struct *p)
+{
+	struct autogroup *ag = autogroup_create();
+
+	autogroup_move_group(p, ag);
+	/* drop extra refrence added by autogroup_create() */
+	autogroup_kref_put(ag);
+}
+EXPORT_SYMBOL(sched_autogroup_create_attach);
+
+/* Cannot be called under siglock.  Currently has no users */
+void sched_autogroup_detach(struct task_struct *p)
+{
+	autogroup_move_group(p, &autogroup_default);
+}
+EXPORT_SYMBOL(sched_autogroup_detach);
+
+void sched_autogroup_fork(struct signal_struct *sig)
+{
+	struct task_struct *p = current;
+
+	spin_lock_irq(&p->sighand->siglock);
+	sig->autogroup = autogroup_kref_get(p->signal->autogroup);
+	spin_unlock_irq(&p->sighand->siglock);
+}
+
+void sched_autogroup_exit(struct signal_struct *sig)
+{
+	autogroup_kref_put(sig->autogroup);
+}
+
+static int __init setup_autogroup(char *str)
+{
+	sysctl_sched_autogroup_enabled = 0;
+
+	return 1;
+}
+
+__setup("noautogroup", setup_autogroup);
+
+#ifdef CONFIG_PROC_FS
+
+/* Called with siglock held. */
+int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
+{
+	static unsigned long next = INITIAL_JIFFIES;
+	struct autogroup *ag;
+	int err;
+
+	if (*nice < -20 || *nice > 19)
+		return -EINVAL;
+
+	err = security_task_setnice(current, *nice);
+	if (err)
+		return err;
+
+	if (*nice < 0 && !can_nice(current, *nice))
+		return -EPERM;
+
+	/* this is a heavy operation taking global locks.. */
+	if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
+		return -EAGAIN;
+
+	next = HZ / 10 + jiffies;
+	ag = autogroup_kref_get(p->signal->autogroup);
+
+	down_write(&ag->lock);
+	err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
+	if (!err)
+		ag->nice = *nice;
+	up_write(&ag->lock);
+
+	autogroup_kref_put(ag);
+
+	return err;
+}
+
+void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
+{
+	struct autogroup *ag = autogroup_kref_get(p->signal->autogroup);
+
+	down_read(&ag->lock);
+	seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
+	up_read(&ag->lock);
+
+	autogroup_kref_put(ag);
+}
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+	return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..5358e241cb20
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,32 @@
+#ifdef CONFIG_SCHED_AUTOGROUP
+
+struct autogroup {
+	struct kref		kref;
+	struct task_group	*tg;
+	struct rw_semaphore	lock;
+	unsigned long		id;
+	int			nice;
+};
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg);
+
+#else /* !CONFIG_SCHED_AUTOGROUP */
+
+static inline void autogroup_init(struct task_struct *init_task) {  }
+static inline void autogroup_free(struct task_group *tg) { }
+
+static inline struct task_group *
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+	return tg;
+}
+
+#ifdef CONFIG_SCHED_DEBUG
+static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+{
+	return 0;
+}
+#endif
+
+#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index e95b77414a99..1dfae3d014b5 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void print_cfs_group_stats(struct seq_file *m, int cpu,
-		struct task_group *tg)
+static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
 {
 	struct sched_entity *se = tg->se[cpu];
 	if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 		0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
 #endif
 
-#ifdef CONFIG_CGROUP_SCHED
-	{
-		char path[64];
-
-		rcu_read_lock();
-		cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
-		rcu_read_unlock();
-		SEQ_printf(m, " %s", path);
-	}
-#endif
 	SEQ_printf(m, "\n");
 }
 
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
 	read_unlock_irqrestore(&tasklist_lock, flags);
 }
 
-#if defined(CONFIG_CGROUP_SCHED) && \
-	(defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
-static void task_group_path(struct task_group *tg, char *buf, int buflen)
-{
-	/* may be NULL if the underlying cgroup isn't fully-created yet */
-	if (!tg->css.cgroup) {
-		buf[0] = '\0';
-		return;
-	}
-	cgroup_path(tg->css.cgroup, buf, buflen);
-}
-#endif
-
 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
 	s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	struct sched_entity *last;
 	unsigned long flags;
 
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
-	char path[128];
-	struct task_group *tg = cfs_rq->tg;
-
-	task_group_path(tg, path, sizeof(path));
-
-	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
-#else
 	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
-#endif
 	SEQ_printf(m, "  .%-30s: %Ld.%06ld\n", "exec_clock",
 			SPLIT_NS(cfs_rq->exec_clock));
 
@@ -215,7 +182,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 	SEQ_printf(m, "  .%-30s: %ld\n", "load_contrib",
 			cfs_rq->load_contribution);
 	SEQ_printf(m, "  .%-30s: %d\n", "load_tg",
-			atomic_read(&tg->load_weight));
+			atomic_read(&cfs_rq->tg->load_weight));
 #endif
 
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -224,17 +191,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
-#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
-	char path[128];
-	struct task_group *tg = rt_rq->tg;
-
-	task_group_path(tg, path, sizeof(path));
-
-	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
-#else
 	SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
-#endif
-
 
 #define P(x) \
 	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..2745dcdb6c6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
 	err = session;
 out:
 	write_unlock_irq(&tasklist_lock);
-	if (err > 0)
+	if (err > 0) {
 		proc_sid_connector(group_leader);
+		sched_autogroup_create_attach(group_leader);
+	}
 	return err;
 }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a00fdefd24ce..121e4fff03d1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -370,6 +370,17 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_SCHED_AUTOGROUP
+	{
+		.procname	= "sched_autogroup_enabled",
+		.data		= &sysctl_sched_autogroup_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
author	Mike Galbraith <efault@gmx.de>	2010-11-30 14:18:03 +0100
committer	Ingo Molnar <mingo@elte.hu>	2010-11-30 16:03:35 +0100
commit	5091faa449ee0b7d73bc296a93bca9540fc51d0a (patch)
tree	55f5e96e189af65c85c769fce48627b8a5abb86b /kernel
parent	822bc180a7f7a7bc5fcaaea195f41b487cc8cae8 (diff)
download	lwn-5091faa449ee0b7d73bc296a93bca9540fc51d0a.tar.gz lwn-5091faa449ee0b7d73bc296a93bca9540fc51d0a.zip