summaryrefslogtreecommitdiff
path: root/include/linux/rseq.h
diff options
context:
space:
mode:
Diffstat (limited to 'include/linux/rseq.h')
-rw-r--r-- include/linux/rseq.h 237
1 files changed, 155 insertions, 82 deletions
diff --git a/include/linux/rseq.h b/include/linux/rseq.h
index bc8af3eb5598..7ef79b25e714 100644
--- a/include/linux/rseq.h
+++ b/include/linux/rseq.h
@@ -3,129 +3,202 @@
#define _LINUX_RSEQ_H
#ifdef CONFIG_RSEQ
-
-#include <linux/preempt.h>
#include <linux/sched.h>
-/*
- * Map the event mask on the user-space ABI enum rseq_cs_flags
- * for direct mask checks.
- */
-enum rseq_event_mask_bits {
- RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
- RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
- RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
-};
-
-enum rseq_event_mask {
- RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT),
- RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT),
- RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT),
-};
-
-static inline void rseq_set_notify_resume(struct task_struct *t)
-{
- if (t->rseq)
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
-}
+#include <uapi/linux/rseq.h>
-void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);
+void __rseq_handle_slowpath(struct pt_regs *regs);
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
+static __always_inline bool rseq_v2(struct task_struct *t)
{
- if (current->rseq)
- __rseq_handle_notify_resume(ksig, regs);
+ return IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && likely(t->rseq.event.has_rseq > 1);
}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
+/* Invoked from resume_user_mode_work() */
+static inline void rseq_handle_slowpath(struct pt_regs *regs)
{
- preempt_disable();
- __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
- preempt_enable();
- rseq_handle_notify_resume(ksig, regs);
-}
-
-/* rseq_preempt() requires preemption to be disabled. */
-static inline void rseq_preempt(struct task_struct *t)
-{
- __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
+ if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) {
+ if (current->rseq.event.slowpath)
+ __rseq_handle_slowpath(regs);
+ } else {
+ if (current->rseq.event.sched_switch && current->rseq.event.has_rseq)
+ __rseq_handle_slowpath(regs);
+ }
}
-/* rseq_migrate() requires preemption to be disabled. */
-static inline void rseq_migrate(struct task_struct *t)
-{
- __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
- rseq_set_notify_resume(t);
-}
+void __rseq_signal_deliver(int sig, struct pt_regs *regs);
/*
- * If parent process has a registered restartable sequences area, the
- * child inherits. Unregister rseq for a clone with CLONE_VM set.
+ * Invoked from signal delivery to fixup based on the register context before
+ * switching to the signal delivery context.
*/
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs)
{
- if (clone_flags & CLONE_VM) {
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
+ if (rseq_v2(current)) {
+ /* has_rseq is implied in rseq_v2() */
+ if (current->rseq.event.user_irq)
+ __rseq_signal_deliver(ksig->sig, regs);
} else {
- t->rseq = current->rseq;
- t->rseq_len = current->rseq_len;
- t->rseq_sig = current->rseq_sig;
- t->rseq_event_mask = current->rseq_event_mask;
+ if (current->rseq.event.has_rseq)
+ __rseq_signal_deliver(ksig->sig, regs);
}
}
-static inline void rseq_execve(struct task_struct *t)
+static inline void rseq_raise_notify_resume(struct task_struct *t)
{
- t->rseq = NULL;
- t->rseq_len = 0;
- t->rseq_sig = 0;
- t->rseq_event_mask = 0;
+ set_tsk_thread_flag(t, TIF_RSEQ);
}
-#else
+/* Invoked from context switch to force evaluation on exit to user */
+static __always_inline void rseq_sched_switch_event(struct task_struct *t)
+{
+ struct rseq_event *ev = &t->rseq.event;
+
+ /*
+ * Only apply the user_irq optimization for RSEQ ABI V2 registrations.
+ * Legacy users like TCMalloc rely on the original ABI V1 behaviour
+ * which updates IDs on every context switch.
+ */
+ if (rseq_v2(t)) {
+ /*
+ * Avoid a boat load of conditionals by using simple logic to
+ * determine whether TIF_NOTIFY_RESUME or TIF_RSEQ needs to be
+ * raised.
+ *
+ * It's required when the CPU or MM CID has changed or the entry
+ * was via interrupt from user space. ev->has_rseq does not have
+ * to be evaluated here because rseq_v2() implies has_rseq.
+ */
+ bool raise = ev->user_irq | ev->ids_changed;
+
+ if (raise) {
+ ev->sched_switch = true;
+ rseq_raise_notify_resume(t);
+ }
+ } else {
+ if (ev->has_rseq) {
+ t->rseq.event.ids_changed = true;
+ t->rseq.event.sched_switch = true;
+ rseq_raise_notify_resume(t);
+ }
+ }
+}
-static inline void rseq_set_notify_resume(struct task_struct *t)
+/*
+ * Invoked from __set_task_cpu() when a task migrates or from
+ * mm_cid_schedin() when the CID changes to enforce an IDs update.
+ *
+ * This does not raise TIF_NOTIFY_RESUME as that happens in
+ * rseq_sched_switch_event().
+ */
+static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t)
{
+ t->rseq.event.ids_changed = true;
}
-static inline void rseq_handle_notify_resume(struct ksignal *ksig,
- struct pt_regs *regs)
+
+/* Enforce a full update after RSEQ registration and when execve() failed */
+static inline void rseq_force_update(void)
{
+ if (current->rseq.event.has_rseq) {
+ current->rseq.event.ids_changed = true;
+ current->rseq.event.sched_switch = true;
+ rseq_raise_notify_resume(current);
+ }
}
-static inline void rseq_signal_deliver(struct ksignal *ksig,
- struct pt_regs *regs)
+
+/*
+ * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
+ * which clears TIF_NOTIFY_RESUME on architectures that don't use the
+ * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag.
+ *
+ * This avoids updating user space RSEQ in that case just to do it eventually
+ * again before returning to user space, because __rseq_handle_slowpath()
+ * does nothing when invoked with NULL register state.
+ *
+ * After returning from guest mode, before exiting to userspace, hypervisors
+ * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary.
+ */
+static inline void rseq_virt_userspace_exit(void)
{
+ /*
+ * The generic optimization for deferring RSEQ updates until the next
+ * exit relies on having a dedicated TIF_RSEQ.
+ */
+ if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) &&
+ current->rseq.event.sched_switch)
+ rseq_raise_notify_resume(current);
}
-static inline void rseq_preempt(struct task_struct *t)
+
+static inline void rseq_reset(struct task_struct *t)
{
+ /* Protect against preemption and membarrier IPI */
+ guard(irqsave)();
+ memset(&t->rseq, 0, sizeof(t->rseq));
+ t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED;
}
-static inline void rseq_migrate(struct task_struct *t)
+
+static inline void rseq_execve(struct task_struct *t)
{
+ rseq_reset(t);
}
-static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
+
+/*
+ * If parent process has a registered restartable sequences area, the
+ * child inherits. Unregister rseq for a clone with CLONE_VM set.
+ *
+ * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault
+ * on the COW page on exit to user space, when the child stays on the same
+ * CPU as the parent. That's obviously not guaranteed, but in overcommit
+ * scenarios it is more likely and optimizes for the fork/exec case without
+ * taking the fault.
+ */
+static inline void rseq_fork(struct task_struct *t, u64 clone_flags)
{
+ if (clone_flags & CLONE_VM)
+ rseq_reset(t);
+ else
+ t->rseq = current->rseq;
}
-static inline void rseq_execve(struct task_struct *t)
+
+/*
+ * Value returned by getauxval(AT_RSEQ_ALIGN) and expected by rseq
+ * registration. This is the active rseq area size rounded up to next
+ * power of 2, which guarantees that the rseq structure will always be
+ * aligned on the nearest power of two large enough to contain it, even
+ * as it grows.
+ */
+static inline unsigned int rseq_alloc_align(void)
{
+ return 1U << get_count_order(offsetof(struct rseq, end));
}
-#endif
+#else /* CONFIG_RSEQ */
+static inline bool rseq_v2(struct task_struct *t) { return false; }
+static inline void rseq_handle_slowpath(struct pt_regs *regs) { }
+static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { }
+static inline void rseq_sched_switch_event(struct task_struct *t) { }
+static inline void rseq_sched_set_ids_changed(struct task_struct *t) { }
+static inline void rseq_force_update(void) { }
+static inline void rseq_virt_userspace_exit(void) { }
+static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
+static inline void rseq_execve(struct task_struct *t) { }
+#endif /* !CONFIG_RSEQ */
#ifdef CONFIG_DEBUG_RSEQ
-
void rseq_syscall(struct pt_regs *regs);
-
-#else
-
-static inline void rseq_syscall(struct pt_regs *regs)
+#else /* CONFIG_DEBUG_RSEQ */
+static inline void rseq_syscall(struct pt_regs *regs) { }
+#endif /* !CONFIG_DEBUG_RSEQ */
+
+#ifdef CONFIG_RSEQ_SLICE_EXTENSION
+void rseq_syscall_enter_work(long syscall);
+int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3);
+#else /* CONFIG_RSEQ_SLICE_EXTENSION */
+static inline void rseq_syscall_enter_work(long syscall) { }
+static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3)
{
+ return -ENOTSUPP;
}
-
-#endif
+#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */
#endif /* _LINUX_RSEQ_H */