From 370eaf38450c77ec9b5ce6bc74bc575b2e2ce448 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Oct 2009 17:21:18 +0200 Subject: futex: Revert "futex: Wake up waiter outside the hb->lock section" This reverts commit 928686b77ab275fd7f828ff24bd510baca995425. The patch was an optimization of the old futex wake code where we woke the waiter and then set q->lock_ptr to NULL. When the waiter preempted the waker then we run into lock contention on q->lock_ptr aka. hb->lock. commit f1a11e (futex: remove the wait queue) changes the wakeup logic by setting q->lock_ptr to NULL _before_ waking the task. It keeps a reference on the task struct of the to be woken task to avoid an exit race. The combination of both patches resulted in different race on -RT: A is blocked on futex B calls futex_wake B sets q(A)->lock_ptr to NULL and puts A on the wake list B is preempted ... A wakes up (e.g. timer, signal) A detects q->lock_ptr = NULL and returns A waits on a different futex B is scheduled back in B wakes A A sees a spurious wake up Reported-by: Blaise Gassend Debugged-by: Darren Hart Signed-off-by: Thomas Gleixner enter the commit message for your changes. Lines starting --- kernel/fork.c | 1 - kernel/futex.c | 66 +++++++--------------------------------------------------- 2 files changed, 7 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 604f84972eb4..a9652601a6d0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1190,7 +1190,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; - p->futex_wakeup = NULL; #endif /* * sigaltstack should be cleared when sharing the same VM diff --git a/kernel/futex.c b/kernel/futex.c index 60b759026433..b87480c794ee 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -713,7 +713,7 @@ retry: * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. */ -static void wake_futex(struct task_struct **wake_list, struct futex_q *q) +static void wake_futex(struct futex_q *q) { struct task_struct *p = q->task; @@ -736,51 +736,8 @@ static void wake_futex(struct task_struct **wake_list, struct futex_q *q) smp_wmb(); q->lock_ptr = NULL; - /* - * Atomically grab the task, if ->futex_wakeup is !0 already it means - * its already queued (either by us or someone else) and will get the - * wakeup due to that. - * - * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_futex_list(). - */ - if (cmpxchg(&p->futex_wakeup, 0, p) != 0) { - /* - * It was already queued, drop the extra ref and we're done. - */ - put_task_struct(p); - return; - } - - /* - * Put the task on our wakeup list by atomically switching it with - * the list head. (XXX its a local list, no possible concurrency, - * this could be written without cmpxchg). - */ - do { - p->futex_wakeup = *wake_list; - } while (cmpxchg(wake_list, p->futex_wakeup, p) != p->futex_wakeup); -} - -/* - * For each task on the list, deliver the pending wakeup and release the - * task reference obtained in wake_futex(). - */ -static void wake_futex_list(struct task_struct *head) -{ - while (head != &init_task) { - struct task_struct *next = head->futex_wakeup; - - head->futex_wakeup = NULL; - /* - * wake_up_state() implies a wmb() to pair with the queueing - * in wake_futex() so as to not miss wakeups. - */ - wake_up_state(head, TASK_NORMAL); - put_task_struct(head); - - head = next; - } + wake_up_state(p, TASK_NORMAL); + put_task_struct(p); } static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) @@ -894,7 +851,6 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) struct futex_q *this, *next; struct plist_head *head; union futex_key key = FUTEX_KEY_INIT; - struct task_struct *wake_list = &init_task; int ret; if (!bitset) @@ -919,7 +875,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) if (!(this->bitset & bitset)) continue; - wake_futex(&wake_list, this); + wake_futex(this); if (++ret >= nr_wake) break; } @@ -927,8 +883,6 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) spin_unlock(&hb->lock); put_futex_key(fshared, &key); - - wake_futex_list(wake_list); out: return ret; } @@ -945,7 +899,6 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, struct futex_hash_bucket *hb1, *hb2; struct plist_head *head; struct futex_q *this, *next; - struct task_struct *wake_list = &init_task; int ret, op_ret; retry: @@ -996,7 +949,7 @@ retry_private: plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key1)) { - wake_futex(&wake_list, this); + wake_futex(this); if (++ret >= nr_wake) break; } @@ -1008,7 +961,7 @@ retry_private: op_ret = 0; plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key2)) { - wake_futex(&wake_list, this); + wake_futex(this); if (++op_ret >= nr_wake2) break; } @@ -1021,8 +974,6 @@ out_put_keys: put_futex_key(fshared, &key2); out_put_key1: put_futex_key(fshared, &key1); - - wake_futex_list(wake_list); out: return ret; } @@ -1177,7 +1128,6 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; - struct task_struct *wake_list = &init_task; u32 curval2; if (requeue_pi) { @@ -1322,7 +1272,7 @@ retry_private: * woken by futex_unlock_pi(). */ if (++task_count <= nr_wake && !requeue_pi) { - wake_futex(&wake_list, this); + wake_futex(this); continue; } @@ -1368,8 +1318,6 @@ out_put_keys: put_futex_key(fshared, &key2); out_put_key1: put_futex_key(fshared, &key1); - - wake_futex_list(wake_list); out: if (pi_state != NULL) free_pi_state(pi_state); -- cgit v1.2.3 From 6b3d1c1f46275c0c59f40e8d2955778d29fae4e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Oct 2009 18:51:01 +0200 Subject: softirq: Revert "softirq: Do not create hrtimer softirq thread..." This reverts commit d69c5d3773d7e0122358b9c87cc756837021b5f6. The softirq is necessary even in the CONFIG_HIGH_RES_TIMERS=n case. Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 352697636f0a..aae8d459f728 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -1161,8 +1161,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; } for (i = 0; i < NR_SOFTIRQS; i++) { - if (!softirq_names[i]) - continue; p = kthread_create(ksoftirqd, &per_cpu(ksoftirqd, hotcpu)[i], "sirq-%s/%d", softirq_names[i], @@ -1179,11 +1177,8 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - for (i = 0; i < NR_SOFTIRQS; i++) { - p = per_cpu(ksoftirqd, hotcpu)[i].tsk; - if (p) - wake_up_process(p); - } + for (i = 0; i < NR_SOFTIRQS; i++) + wake_up_process(per_cpu(ksoftirqd, hotcpu)[i].tsk); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: @@ -1197,11 +1192,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, for (i = 0; i < NR_SOFTIRQS; i++) { param.sched_priority = MAX_RT_PRIO-1; p = per_cpu(ksoftirqd, hotcpu)[i].tsk; - if (p) { - sched_setscheduler(p, SCHED_FIFO, ¶m); - per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; - kthread_stop(p); - } + sched_setscheduler(p, SCHED_FIFO, ¶m); + per_cpu(ksoftirqd, hotcpu)[i].tsk = NULL; + kthread_stop(p); } takeover_tasklets(hotcpu); break; -- cgit v1.2.3 From d2638539b55c202e07deb4730cd8620174c757e8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Oct 2009 18:52:52 +0200 Subject: softirq: Name hrtimer softirq also when CONFIG_HIGH_RES_TIMERS=n Remy Bohmer pointed out that we create the hrtimer softirq thread even when CONFIG_HIGH_RES_TIMERS is off. That results in a softirq-NULL name for the thread. The thread is needed on -rt Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index aae8d459f728..590049c17dc8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -1139,9 +1139,7 @@ static const char *softirq_names [] = [NET_RX_SOFTIRQ] = "net-rx", [BLOCK_SOFTIRQ] = "block", [TASKLET_SOFTIRQ] = "tasklet", -#ifdef CONFIG_HIGH_RES_TIMERS [HRTIMER_SOFTIRQ] = "hrtimer", -#endif [RCU_SOFTIRQ] = "rcu", }; -- cgit v1.2.3 From 41890f2456998c170f416fc29715fadfd57e6626 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Oct 2009 21:03:43 +0200 Subject: futex: Handle spurious wake up The futex code does not handle spurious wake up in futex_wait and futex_wait_requeue_pi. The code assumes that any wake up which was not caused by futex_wake / requeue or by a timeout was caused by a signal wake up and returns one of the syscall restart error codes. In case of a spurious wake up the signal delivery code which deals with the restart error codes is not invoked and we return that error code to user space. That causes applications which actually check the return codes to fail. Blaise reported that on preempt-rt a python test program run into a exception trap. -rt exposed that due to a built in spurious wake up accelerator :) Solve this by checking signal_pending(current) in the wake up path and handle the spurious wake up case w/o returning to user space. Reported-by: Blaise Gassend Debugged-by: Darren Hart Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: stable@kernel.org LKML-Reference: Conflicts: kernel/futex.c Signed-off-by: Thomas Gleixner --- kernel/futex.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index b87480c794ee..e2e83a7fb397 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1753,6 +1753,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, current->timer_slack_ns); } +retry: /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) @@ -1770,9 +1771,14 @@ static int futex_wait(u32 __user *uaddr, int fshared, goto out_put_key; /* - * We expect signal_pending(current), but another thread may - * have handled it for us already. + * We expect signal_pending(current), but we might be the + * victim of a spurious wakeup as well. */ + if (!signal_pending(current)) { + put_futex_key(fshared, &q.key); + goto retry; + } + ret = -ERESTARTSYS; if (!abs_time) goto out_put_key; @@ -2079,9 +2085,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, */ plist_del(&q->list, &q->list.plist); + /* Handle spurious wakeups gracefully */ + ret = -EAGAIN; if (timeout && !timeout->task) ret = -ETIMEDOUT; - else + else if (signal_pending(current)) ret = -ERESTARTNOINTR; } return ret; @@ -2163,6 +2171,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, q.bitset = bitset; q.rt_waiter = &rt_waiter; +retry: key2 = FUTEX_KEY_INIT; ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) @@ -2255,6 +2264,9 @@ out_put_keys: out_key2: put_futex_key(fshared, &key2); + /* Spurious wakeup ? */ + if (ret == -EAGAIN) + goto retry; out: if (to) { hrtimer_cancel(&to->timer); -- cgit v1.2.3