diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-06-27 13:32:54 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-06-27 13:32:54 -0700 |
commit | b4b27b9eed8ebdbf9f3046197d29d733c8c944f3 (patch) | |
tree | 836d7997effa9b977233dedc5f5f7c05685c111b | |
parent | 625acffd7ae2c52898d249e6c5c39f348db0d8df (diff) | |
download | lwn-b4b27b9eed8ebdbf9f3046197d29d733c8c944f3.tar.gz lwn-b4b27b9eed8ebdbf9f3046197d29d733c8c944f3.zip |
Revert "signal: Allow tasks to cache one sigqueue struct"
This reverts commits 4bad58ebc8bc4f20d89cff95417c9b4674769709 (and
399f8dd9a866e107639eabd3c1979cd526ca3a98, which tried to fix it).
I do not believe these are correct, and I'm about to release 5.13, so am
reverting them out of an abundance of caution.
The locking is odd, and appears broken.
On the allocation side (in __sigqueue_alloc()), the locking is somewhat
straightforward: it depends on sighand->siglock. Since one caller
doesn't hold that lock, it further then tests 'sigqueue_flags' to avoid
the case with no locks held.
On the freeing side (in sigqueue_cache_or_free()), there is no locking
at all, and the logic instead depends on 'current' being a single
thread, and not able to race with itself.
To make things more exciting, there's also the data race between freeing
a signal and allocating one, which is handled by using WRITE_ONCE() and
READ_ONCE(), and being mutually exclusive wrt the initial state (ie
freeing will only free if the old state was NULL, while allocating will
obviously only use the value if it was non-NULL, so only one or the
other will actually act on the value).
However, while the free->alloc paths do seem mutually exclusive thanks
to just the data value dependency, it's not clear what the memory
ordering constraints are on it. Could writes from the previous
allocation possibly be delayed and seen by the new allocation later,
causing logical inconsistencies?
So it's all very exciting and unusual.
And in particular, it seems that the freeing side is incorrect in
depending on "current" being single-threaded. Yes, 'current' is a
single thread, but in the presense of asynchronous events even a single
thread can have data races.
And such asynchronous events can and do happen, with interrupts causing
signals to be flushed and thus free'd (for example - sending a
SIGCONT/SIGSTOP can happen from interrupt context, and can flush
previously queued process control signals).
So regardless of all the other questions about the memory ordering and
locking for this new cached allocation, the sigqueue_cache_or_free()
assumptions seem to be fundamentally incorrect.
It may be that people will show me the errors of my ways, and tell me
why this is all safe after all. We can reinstate it if so. But my
current belief is that the WRITE_ONCE() that sets the cached entry needs
to be a smp_store_release(), and the READ_ONCE() that finds a cached
entry needs to be a smp_load_acquire() to handle memory ordering
correctly.
And the sequence in sigqueue_cache_or_free() would need to either use a
lock or at least be interrupt-safe some way (perhaps by using something
like the percpu 'cmpxchg': it doesn't need to be SMP-safe, but like the
percpu operations it needs to be interrupt-safe).
Fixes: 399f8dd9a866 ("signal: Prevent sigqueue caching after task got released")
Fixes: 4bad58ebc8bc ("signal: Allow tasks to cache one sigqueue struct")
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Christian Brauner <christian.brauner@ubuntu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/sched.h | 1 | ||||
-rw-r--r-- | include/linux/signal.h | 1 | ||||
-rw-r--r-- | kernel/exit.c | 1 | ||||
-rw-r--r-- | kernel/fork.c | 1 | ||||
-rw-r--r-- | kernel/signal.c | 59 |
5 files changed, 2 insertions, 61 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 28a98fc4ded4..32813c345115 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -997,7 +997,6 @@ struct task_struct { /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; - struct sigqueue *sigqueue_cache; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ diff --git a/include/linux/signal.h b/include/linux/signal.h index 201f88e3738b..5160fd45e5ca 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -267,7 +267,6 @@ static inline void init_sigpending(struct sigpending *sig) } extern void flush_sigqueue(struct sigpending *queue); -extern void exit_task_sigqueue_cache(struct task_struct *tsk); /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ static inline int valid_signal(unsigned long sig) diff --git a/kernel/exit.c b/kernel/exit.c index fd1c04193e18..65809fac3038 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -162,7 +162,6 @@ static void __exit_signal(struct task_struct *tsk) flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); } - exit_task_sigqueue_cache(tsk); } static void delayed_put_task_struct(struct rcu_head *rhp) diff --git a/kernel/fork.c b/kernel/fork.c index dc06afd725cb..a070caed5c8e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2008,7 +2008,6 @@ static __latent_entropy struct task_struct *copy_process( spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); - p->sigqueue_cache = NULL; p->utime = p->stime = p->gtime = 0; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME diff --git a/kernel/signal.c b/kernel/signal.c index f1ecd8f0c11d..30a0bee5ff9b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -431,22 +431,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, rcu_read_unlock(); if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { - /* - * Preallocation does not hold sighand::siglock so it can't - * use the cache. The lockless caching requires that only - * one consumer and only one producer run at a time. - * - * For the regular allocation case it is sufficient to - * check @q for NULL because this code can only be called - * if the target task @t has not been reaped yet; which - * means this code can never observe the error pointer which is - * written to @t->sigqueue_cache in exit_task_sigqueue_cache(). - */ - q = READ_ONCE(t->sigqueue_cache); - if (!q || sigqueue_flags) - q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); - else - WRITE_ONCE(t->sigqueue_cache, NULL); + q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); } else { print_dropped_signal(sig); } @@ -463,53 +448,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, return q; } -void exit_task_sigqueue_cache(struct task_struct *tsk) -{ - /* Race free because @tsk is mopped up */ - struct sigqueue *q = tsk->sigqueue_cache; - - if (q) { - /* - * Hand it back to the cache as the task might - * be self reaping which would leak the object. - */ - kmem_cache_free(sigqueue_cachep, q); - } - - /* - * Set an error pointer to ensure that @tsk will not cache a - * sigqueue when it is reaping it's child tasks - */ - tsk->sigqueue_cache = ERR_PTR(-1); -} - -static void sigqueue_cache_or_free(struct sigqueue *q) -{ - /* - * Cache one sigqueue per task. This pairs with the consumer side - * in __sigqueue_alloc() and needs READ/WRITE_ONCE() to prevent the - * compiler from store tearing and to tell KCSAN that the data race - * is intentional when run without holding current->sighand->siglock, - * which is fine as current obviously cannot run __sigqueue_free() - * concurrently. - * - * The NULL check is safe even if current has been reaped already, - * in which case exit_task_sigqueue_cache() wrote an error pointer - * into current->sigqueue_cache. - */ - if (!READ_ONCE(current->sigqueue_cache)) - WRITE_ONCE(current->sigqueue_cache, q); - else - kmem_cache_free(sigqueue_cachep, q); -} - static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; if (atomic_dec_and_test(&q->user->sigpending)) free_uid(q->user); - sigqueue_cache_or_free(q); + kmem_cache_free(sigqueue_cachep, q); } void flush_sigqueue(struct sigpending *queue) |