diff options
| author | Christian Brauner <brauner@kernel.org> | 2026-02-26 14:50:59 +0100 |
|---|---|---|
| committer | Christian Brauner <brauner@kernel.org> | 2026-03-11 23:14:02 +0100 |
| commit | 12ae2c81b21cfaa193db2faf035d495807edc3a7 (patch) | |
| tree | e839b0f4f37e51fb23ee98a868fa6cd50f84fb7b | |
| parent | 6de23f81a5e08be8fbf5e8d7e9febc72a5b5f27f (diff) | |
| download | lwn-12ae2c81b21cfaa193db2faf035d495807edc3a7.tar.gz lwn-12ae2c81b21cfaa193db2faf035d495807edc3a7.zip | |
clone: add CLONE_AUTOREAP
Add a new clone3() flag, CLONE_AUTOREAP, that causes a child process to
be auto-reaped on exit without ever becoming a zombie. This is a
per-process property, in contrast to the existing auto-reap mechanism
via SA_NOCLDWAIT or SIG_IGN for SIGCHLD, which applies to all children
of a given parent.
Currently the only way to automatically reap children is to set
SA_NOCLDWAIT or SIG_IGN on SIGCHLD. This is a parent-scoped property
affecting all children which makes it unsuitable for libraries or
applications that need selective auto-reaping of specific children while
still being able to wait() on others.
CLONE_AUTOREAP stores an autoreap flag in the child's signal_struct.
When the child exits do_notify_parent() checks this flag and causes
exit_notify() to transition the task directly to EXIT_DEAD. Since the
flag lives on the child it survives reparenting: if the original parent
exits and the child is reparented to a subreaper or init the child still
auto-reaps when it eventually exits.
CLONE_AUTOREAP can be combined with CLONE_PIDFD to allow the parent to
monitor the child's exit via poll() and retrieve exit status via
PIDFD_GET_INFO. Without CLONE_PIDFD it provides a fire-and-forget
pattern where the parent simply doesn't care about the child's exit
status. No exit signal is delivered so exit_signal must be zero.
CLONE_AUTOREAP is rejected in combination with CLONE_PARENT, and a
process that has signal->autoreap set may not itself call clone() with
CLONE_PARENT. The second restriction is the important one: if a
CLONE_AUTOREAP child were to clone(CLONE_PARENT) the new grandchild
would inherit exit_signal == 0 from the autoreap parent's group leader
but without signal->autoreap. This grandchild would become a zombie that
never sends a signal and is never autoreaped - confusing and arguably
broken behavior.
The flag is not inherited by the autoreap process's own children. Each
child that should be autoreaped must be explicitly created with
CLONE_AUTOREAP.
Link: https://github.com/uapi-group/kernel-features/issues/45
Link: https://patch.msgid.link/20260226-work-pidfs-autoreap-v5-1-d148b984a989@kernel.org
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Christian Brauner <brauner@kernel.org>
| -rw-r--r-- | include/linux/sched/signal.h | 1 | ||||
| -rw-r--r-- | include/uapi/linux/sched.h | 5 | ||||
| -rw-r--r-- | kernel/fork.c | 17 | ||||
| -rw-r--r-- | kernel/ptrace.c | 3 | ||||
| -rw-r--r-- | kernel/signal.c | 4 |
5 files changed, 26 insertions, 4 deletions
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index a22248aebcf9..f842c86b806f 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -132,6 +132,7 @@ struct signal_struct { */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; + unsigned int autoreap:1; #ifdef CONFIG_POSIX_TIMERS diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 359a14cc76a4..69f7b4f9eb0c 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -34,8 +34,9 @@ #define CLONE_IO 0x80000000 /* Clone io context */ /* Flags for the clone3() syscall. */ -#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ -#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ +#define CLONE_CLEAR_SIGHAND (1ULL << 32) /* Clear any signal handler and reset to SIG_DFL. */ +#define CLONE_INTO_CGROUP (1ULL << 33) /* Clone into a specific cgroup given the right permissions. */ +#define CLONE_AUTOREAP (1ULL << 34) /* Auto-reap child on exit. */ /* * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 diff --git a/kernel/fork.c b/kernel/fork.c index e832da9d15a4..10549574fda6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2028,6 +2028,18 @@ __latent_entropy struct task_struct *copy_process( return ERR_PTR(-EINVAL); } + if (clone_flags & CLONE_AUTOREAP) { + if (clone_flags & CLONE_THREAD) + return ERR_PTR(-EINVAL); + if (clone_flags & CLONE_PARENT) + return ERR_PTR(-EINVAL); + if (args->exit_signal) + return ERR_PTR(-EINVAL); + } + + if ((clone_flags & CLONE_PARENT) && current->signal->autoreap) + return ERR_PTR(-EINVAL); + /* * Force any signals received before this point to be delivered * before the fork happens. 
Collect up signals sent to multiple @@ -2435,6 +2447,8 @@ __latent_entropy struct task_struct *copy_process( */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; + if (clone_flags & CLONE_AUTOREAP) + p->signal->autoreap = 1; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); @@ -2897,7 +2911,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ if (kargs->flags & - ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP | + CLONE_AUTOREAP)) return false; /* diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 392ec2f75f01..68c17daef8d4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -549,7 +549,8 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!dead && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) dead = do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) { + else if (ignoring_children(tracer->sighand) || + p->signal->autoreap) { __wake_up_parent(p, tracer); dead = true; } diff --git a/kernel/signal.c b/kernel/signal.c index d65d0fe24bfb..e61f39fa8c8a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2251,6 +2251,10 @@ bool do_notify_parent(struct task_struct *tsk, int sig) if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) sig = 0; } + if (!tsk->ptrace && tsk->signal->autoreap) { + autoreap = true; + sig = 0; + } /* * Send with __send_signal as si_pid and si_uid are in the * parent's namespaces. |
