diff options
Diffstat (limited to 'kernel/rseq.c')
| -rw-r--r-- | kernel/rseq.c | 1003 |
1 files changed, 729 insertions, 274 deletions
diff --git a/kernel/rseq.c b/kernel/rseq.c index 9de6e35fe679..e75e3a5e312c 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -8,25 +8,7 @@ * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> */ -#include <linux/sched.h> -#include <linux/uaccess.h> -#include <linux/syscalls.h> -#include <linux/rseq.h> -#include <linux/types.h> -#include <asm/ptrace.h> - -#define CREATE_TRACE_POINTS -#include <trace/events/rseq.h> - -/* The original rseq structure size (including padding) is 32 bytes. */ -#define ORIG_RSEQ_SIZE 32 - -#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) - /* - * * Restartable sequences are a lightweight interface that allows * user-level code to be executed atomically relative to scheduler * preemption and signal delivery. Typically used for implementing @@ -85,350 +67,823 @@ * F1. <failure> */ -static int rseq_update_cpu_node_id(struct task_struct *t) +/* Required to select the proper per_cpu ops for rseq_stats_inc() */ +#define RSEQ_BUILD_SLOW_PATH + +#include <linux/debugfs.h> +#include <linux/hrtimer.h> +#include <linux/percpu.h> +#include <linux/prctl.h> +#include <linux/ratelimit.h> +#include <linux/rseq_entry.h> +#include <linux/sched.h> +#include <linux/syscalls.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <linux/rseq.h> +#include <asm/ptrace.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/rseq.h> + +DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); + +static inline void rseq_control_debug(bool on) { - struct rseq __user *rseq = t->rseq; - u32 cpu_id = raw_smp_processor_id(); - u32 node_id = cpu_to_node(cpu_id); - u32 mm_cid = task_mm_cid(t); + if (on) + static_branch_enable(&rseq_debug_enabled); + else + static_branch_disable(&rseq_debug_enabled); +} - WARN_ON_ONCE((int) mm_cid < 0); - if (!user_write_access_begin(rseq, t->rseq_len)) - goto efault; - unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); - unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); - unsafe_put_user(node_id, &rseq->node_id, efault_end); - unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); - /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally updated only if - * t->rseq_len != ORIG_RSEQ_SIZE. - */ - user_write_access_end(); +static int __init rseq_setup_debug(char *str) +{ + bool on; + + if (kstrtobool(str, &on)) + return -EINVAL; + rseq_control_debug(on); + return 1; +} +__setup("rseq_debug=", rseq_setup_debug); + +#ifdef CONFIG_TRACEPOINTS +/* + * Out of line, so the actual update functions can be in a header to be + * inlined into the exit to user code. + */ +void __rseq_trace_update(struct task_struct *t) +{ trace_rseq_update(t); +} + +void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, + unsigned long offset, unsigned long abort_ip) +{ + trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip); +} +#endif /* CONFIG_TRACEPOINTS */ + +#ifdef CONFIG_RSEQ_STATS +DEFINE_PER_CPU(struct rseq_stats, rseq_stats); + +static int rseq_stats_show(struct seq_file *m, void *p) +{ + struct rseq_stats stats = { }; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); + stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); + stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); + stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu)); + stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); + stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); + stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); + stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + stats.s_granted += data_race(per_cpu(rseq_stats.s_granted, cpu)); + stats.s_expired += data_race(per_cpu(rseq_stats.s_expired, cpu)); + stats.s_revoked += data_race(per_cpu(rseq_stats.s_revoked, cpu)); + stats.s_yielded += data_race(per_cpu(rseq_stats.s_yielded, cpu)); + stats.s_aborted += data_race(per_cpu(rseq_stats.s_aborted, cpu)); + } + } + + seq_printf(m, "exit: %16lu\n", stats.exit); + seq_printf(m, "signal: %16lu\n", stats.signal); + seq_printf(m, "slowp: %16lu\n", stats.slowpath); + seq_printf(m, "fastp: %16lu\n", stats.fastpath); + seq_printf(m, "ids: %16lu\n", stats.ids); + seq_printf(m, "cs: %16lu\n", stats.cs); + seq_printf(m, "clear: %16lu\n", stats.clear); + seq_printf(m, "fixup: %16lu\n", stats.fixup); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) { + seq_printf(m, "sgrant: %16lu\n", stats.s_granted); + seq_printf(m, "sexpir: %16lu\n", stats.s_expired); + seq_printf(m, "srevok: %16lu\n", stats.s_revoked); + seq_printf(m, "syield: %16lu\n", stats.s_yielded); + seq_printf(m, "sabort: %16lu\n", stats.s_aborted); + } return 0; +} -efault_end: - user_write_access_end(); -efault: - return -EFAULT; +static int rseq_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_stats_show, inode->i_private); +} + +static const struct file_operations stat_ops = { + .open = rseq_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init rseq_stats_init(struct dentry *root_dir) +{ + debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops); + return 0; +} +#else +static inline void rseq_stats_init(struct dentry *root_dir) { } +#endif /* CONFIG_RSEQ_STATS */ + +static int rseq_debug_show(struct seq_file *m, void *p) +{ + bool on = static_branch_unlikely(&rseq_debug_enabled); + + seq_printf(m, "%d\n", on); + return 0; +} + +static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + bool on; + + if (kstrtobool_from_user(ubuf, count, &on)) + return -EINVAL; + + rseq_control_debug(on); + return count; +} + +static int rseq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_debug_show, inode->i_private); +} + +static const struct file_operations debug_ops = { + .open = rseq_debug_open, + .read = seq_read, + .write = rseq_debug_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static void rseq_slice_ext_init(struct dentry *root_dir); + +static int __init rseq_debugfs_init(void) +{ + struct dentry *root_dir = debugfs_create_dir("rseq", NULL); + + debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); + rseq_stats_init(root_dir); + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + rseq_slice_ext_init(root_dir); + return 0; } +__initcall(rseq_debugfs_init); -static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) +static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) { - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, - mm_cid = 0; + struct rseq __user *urseq = t->rseq.usrptr; + u64 csaddr; + + scoped_user_read_access(urseq, efault) + unsafe_get_user(csaddr, &urseq->rseq_cs, efault); + if (likely(!csaddr)) + return true; + return rseq_update_user_cs(t, regs, csaddr); +efault: + return false; +} +static void rseq_slowpath_update_usr(struct pt_regs *regs) +{ /* - * Reset cpu_id_start to its initial state (0). + * Preserve has_rseq and user_irq state. The generic entry code clears + * user_irq on the way out, the non-generic entry architectures are not + * setting user_irq. */ - if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) - return -EFAULT; + const struct rseq_event evt_mask = { + .has_rseq = RSEQ_HAS_RSEQ_VERSION_MASK, + .user_irq = true, + }; + struct task_struct *t = current; + struct rseq_ids ids; + bool event; + + if (unlikely(t->flags & PF_EXITING)) + return; + + rseq_stat_inc(rseq_stats.slowpath); + /* - * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming - * in after unregistration can figure out that rseq needs to be - * registered again. + * Read and clear the event pending bit first. If the task + * was not preempted or migrated or a signal is on the way, + * there is no point in doing any of the heavy lifting here + * on production kernels. In that case TIF_NOTIFY_RESUME + * was raised by some other functionality. + * + * This is correct because the read/clear operation is + * guarded against scheduler preemption, which makes it CPU + * local atomic. If the task is preempted right after + * re-enabling preemption then TIF_NOTIFY_RESUME is set + * again and this function is invoked another time _before_ + * the task is able to return to user mode. + * + * On a debug kernel, invoke the fixup code unconditionally + * with the result handed in to allow the detection of + * inconsistencies. */ - if (put_user(cpu_id, &t->rseq->cpu_id)) - return -EFAULT; + scoped_guard(irq) { + event = t->rseq.event.sched_switch; + t->rseq.event.all &= evt_mask.all; + ids.cpu_id = task_cpu(t); + ids.mm_cid = task_mm_cid(t); + } + + if (!event) + return; + + ids.node_id = cpu_to_node(ids.cpu_id); + + if (unlikely(!rseq_update_usr(t, regs, &ids))) { + /* + * Clear the errors just in case this might survive magically, but + * leave the rest intact. + */ + t->rseq.event.error = 0; + force_sig(SIGSEGV); + } +} + +void __rseq_handle_slowpath(struct pt_regs *regs) +{ /* - * Reset node_id to its initial state (0). + * If invoked from hypervisors before entering the guest via + * resume_user_mode_work(), then @regs is a NULL pointer. + * + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises + * it before returning from the ioctl() to user space when + * rseq_event.sched_switch is set. + * + * So it's safe to ignore here instead of pointlessly updating it + * in the vcpu_run() loop. */ - if (put_user(node_id, &t->rseq->node_id)) - return -EFAULT; + if (!regs) + return; + + rseq_slowpath_update_usr(regs); +} + +void __rseq_signal_deliver(int sig, struct pt_regs *regs) +{ + rseq_stat_inc(rseq_stats.signal); + /* - * Reset mm_cid to its initial state (0). + * Don't update IDs yet, they are handled on exit to user if + * necessary. The important thing is to abort a critical section of + * the interrupted context as after this point the instruction + * pointer in @regs points to the signal handler. */ - if (put_user(mm_cid, &t->rseq->mm_cid)) - return -EFAULT; + if (unlikely(!rseq_handle_cs(current, regs))) { + /* + * Clear the errors just in case this might survive + * magically, but leave the rest intact. + */ + current->rseq.event.error = 0; + force_sigsegv(sig); + } + /* - * Additional feature fields added after ORIG_RSEQ_SIZE - * need to be conditionally reset only if - * t->rseq_len != ORIG_RSEQ_SIZE. + * In legacy mode, force the update of IDs before returning to user + * space to stay compatible. */ - return 0; + if (!rseq_v2(current)) + rseq_force_update(); } -static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) +/* + * Terminate the process if a syscall is issued within a restartable + * sequence. + */ +void __rseq_debug_syscall_return(struct pt_regs *regs) { - struct rseq_cs __user *urseq_cs; - u64 ptr; - u32 __user *usig; - u32 sig; - int ret; + struct task_struct *t = current; + u64 csaddr; -#ifdef CONFIG_64BIT - if (get_user(ptr, &t->rseq->rseq_cs)) - return -EFAULT; -#else - if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) - return -EFAULT; + if (!t->rseq.event.has_rseq) + return; + if (get_user(csaddr, &t->rseq.usrptr->rseq_cs)) + goto fail; + if (likely(!csaddr)) + return; + if (unlikely(csaddr >= TASK_SIZE)) + goto fail; + if (rseq_debug_update_user_cs(t, regs, csaddr)) + return; +fail: + force_sig(SIGSEGV); +} + +#ifdef CONFIG_DEBUG_RSEQ +/* Kept around to keep GENERIC_ENTRY=n architectures supported. */ +void rseq_syscall(struct pt_regs *regs) +{ + __rseq_debug_syscall_return(regs); +} #endif - if (!ptr) { - memset(rseq_cs, 0, sizeof(*rseq_cs)); - return 0; + +static bool rseq_reset_ids(void) +{ + struct rseq __user *rseq = current->rseq.usrptr; + + /* + * If this fails, terminate it because this leaves the kernel in + * stupid state as exit to user space will try to fixup the ids + * again. + */ + scoped_user_rw_access(rseq, efault) { + unsafe_put_user(0, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0, &rseq->node_id, efault); + unsafe_put_user(0, &rseq->mm_cid, efault); } - if (ptr >= TASK_SIZE) - return -EINVAL; - urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; - if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) + return true; + +efault: + force_sig(SIGSEGV); + return false; +} + +/* The original rseq structure size (including padding) is 32 bytes. */ +#define ORIG_RSEQ_SIZE 32 + +static long rseq_register(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) +{ + u32 rseqfl = 0; + u8 version = 1; + + if (!access_ok(rseq, rseq_len)) return -EFAULT; - if (rseq_cs->start_ip >= TASK_SIZE || - rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || - rseq_cs->abort_ip >= TASK_SIZE || - rseq_cs->version > 0) - return -EINVAL; - /* Check for overflow. */ - if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) - return -EINVAL; - /* Ensure that abort_ip is not in the critical section. */ - if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) - return -EINVAL; + /* + * Architectures, which use the generic IRQ entry code (at least) enable + * registrations with a size greater than the original v1 fixed sized + * @rseq_len, which has been validated already to utilize the optimized + * v2 ABI mode which also enables extended RSEQ features beyond MMCID. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY) && rseq_len > ORIG_RSEQ_SIZE) + version = 2; + + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION) && version > 1) { + if (rseq_slice_extension_enabled()) { + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (flags & RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) + rseqfl |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + } + } - usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); - ret = get_user(sig, usig); - if (ret) - return ret; + scoped_user_write_access(rseq, efault) { + /* + * If the rseq_cs pointer is non-NULL on registration, clear it to + * avoid a potential segfault on return to user-space. The proper thing + * to do would have been to fail the registration but this would break + * older libcs that reuse the rseq area for new threads without + * clearing the fields. Don't bother reading it, just reset it. + */ + unsafe_put_user(0UL, &rseq->rseq_cs, efault); + unsafe_put_user(rseqfl, &rseq->flags, efault); + /* Initialize IDs in user space */ + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); + unsafe_put_user(0U, &rseq->node_id, efault); + unsafe_put_user(0U, &rseq->mm_cid, efault); - if (current->rseq_sig != sig) { - printk_ratelimited(KERN_WARNING - "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", - sig, current->rseq_sig, current->pid, usig); - return -EINVAL; + /* + * All fields past mm_cid are only valid for non-legacy v2 + * registrations. + */ + if (version > 1) { + if (IS_ENABLED(CONFIG_RSEQ_SLICE_EXTENSION)) + unsafe_put_user(0U, &rseq->slice_ctrl.all, efault); + } } + + /* + * Activate the registration by setting the rseq area address, length + * and signature in the task struct. + */ + current->rseq.usrptr = rseq; + current->rseq.len = rseq_len; + current->rseq.sig = sig; + +#ifdef CONFIG_RSEQ_SLICE_EXTENSION + current->rseq.slice.state.enabled = !!(rseqfl & RSEQ_CS_FLAG_SLICE_EXT_ENABLED); +#endif + + /* + * Ensure the cpu_id_start and cpu_id fields are updated before + * returning to user-space. + */ + current->rseq.event.has_rseq = version; + rseq_force_update(); return 0; + +efault: + return -EFAULT; } -static bool rseq_warn_flags(const char *str, u32 flags) +static long rseq_unregister(struct rseq __user * rseq, u32 rseq_len, int flags, u32 sig) { - u32 test_flags; + if (flags & ~RSEQ_FLAG_UNREGISTER) + return -EINVAL; + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) + return -EINVAL; + if (rseq_len != current->rseq.len) + return -EINVAL; + if (current->rseq.sig != sig) + return -EPERM; + if (!rseq_reset_ids()) + return -EFAULT; + rseq_reset(current); + return 0; +} - if (!flags) +static long rseq_reregister(struct rseq __user * rseq, u32 rseq_len, u32 sig) +{ + /* + * If rseq is already registered, check whether the provided address + * differs from the prior one. + */ + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) + return -EINVAL; + if (current->rseq.sig != sig) + return -EPERM; + /* Already registered. */ + return -EBUSY; +} + +static bool rseq_length_valid(struct rseq __user *rseq, unsigned int rseq_len) +{ + /* + * Ensure the provided rseq is properly aligned, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_ALIGN. If + * rseq_len is the original rseq size, the required alignment is the + * original struct rseq alignment. + * + * In order to be valid, rseq_len is either the original rseq size, or + * large enough to contain all supported fields, as communicated to + * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. + */ + if (rseq_len < ORIG_RSEQ_SIZE) return false; - test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); - test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; - if (test_flags) - pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); - return true; + + if (rseq_len == ORIG_RSEQ_SIZE) + return IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE); + + return IS_ALIGNED((unsigned long)rseq, rseq_alloc_align()) && + rseq_len >= offsetof(struct rseq, end); } -static int rseq_need_restart(struct task_struct *t, u32 cs_flags) +#define RSEQ_FLAGS_SUPPORTED (RSEQ_FLAG_SLICE_EXT_DEFAULT_ON) + +/* + * sys_rseq - Register or unregister restartable sequences for the caller thread. + */ +SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { - u32 flags, event_mask; - int ret; + if (flags & RSEQ_FLAG_UNREGISTER) + return rseq_unregister(rseq, rseq_len, flags, sig); - if (rseq_warn_flags("rseq_cs", cs_flags)) + if (unlikely(flags & ~RSEQ_FLAGS_SUPPORTED)) return -EINVAL; - /* Get thread flags. */ - ret = get_user(flags, &t->rseq->flags); - if (ret) - return ret; + if (current->rseq.usrptr) + return rseq_reregister(rseq, rseq_len, sig); - if (rseq_warn_flags("rseq", flags)) + if (!rseq_length_valid(rseq, rseq_len)) return -EINVAL; + return rseq_register(rseq, rseq_len, flags, sig); +} + +#ifdef CONFIG_RSEQ_SLICE_EXTENSION +struct slice_timer { + struct hrtimer timer; + void *cookie; +}; + +static const unsigned int rseq_slice_ext_nsecs_min = 5 * NSEC_PER_USEC; +static const unsigned int rseq_slice_ext_nsecs_max = 50 * NSEC_PER_USEC; +unsigned int rseq_slice_ext_nsecs __read_mostly = rseq_slice_ext_nsecs_min; +static DEFINE_PER_CPU(struct slice_timer, slice_timer); +DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); + +/* + * When the timer expires and the task is still in user space, the return + * from interrupt will revoke the grant and schedule. If the task already + * entered the kernel via a syscall and the timer fires before the syscall + * work was able to cancel it, then depending on the preemption model this + * will either reschedule on return from interrupt or in the syscall work + * below. + */ +static enum hrtimer_restart rseq_slice_expired(struct hrtimer *tmr) +{ + struct slice_timer *st = container_of(tmr, struct slice_timer, timer); + + /* + * Validate that the task which armed the timer is still on the + * CPU. It could have been scheduled out without canceling the + * timer. + */ + if (st->cookie == current && current->rseq.slice.state.granted) { + rseq_stat_inc(rseq_stats.s_expired); + set_need_resched_current(); + } + return HRTIMER_NORESTART; +} + +bool __rseq_arm_slice_extension_timer(void) +{ + struct slice_timer *st = this_cpu_ptr(&slice_timer); + struct task_struct *curr = current; + + lockdep_assert_irqs_disabled(); + /* - * Load and clear event mask atomically with respect to - * scheduler preemption. + * This check prevents a task, which got a time slice extension + * granted, from exceeding the maximum scheduling latency when the + * grant expired before going out to user space. Don't bother to + * clear the grant here, it will be cleaned up automatically before + * going out to user space after being scheduled back in. */ - preempt_disable(); - event_mask = t->rseq_event_mask; - t->rseq_event_mask = 0; - preempt_enable(); + if ((unlikely(curr->rseq.slice.expires < ktime_get_mono_fast_ns()))) { + set_need_resched_current(); + return true; + } - return !!event_mask; + /* + * Store the task pointer as a cookie for comparison in the timer + * function. This is safe as the timer is CPU local and cannot be + * in the expiry function at this point. + */ + st->cookie = curr; + hrtimer_start(&st->timer, curr->rseq.slice.expires, HRTIMER_MODE_ABS_PINNED_HARD); + /* Arm the syscall entry work */ + set_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); + return false; } -static int clear_rseq_cs(struct task_struct *t) +static void rseq_cancel_slice_extension_timer(void) { + struct slice_timer *st = this_cpu_ptr(&slice_timer); + /* - * The rseq_cs field is set to NULL on preemption or signal - * delivery on top of rseq assembly block, as well as on top - * of code outside of the rseq assembly block. This performs - * a lazy clear of the rseq_cs field. + * st->cookie can be safely read as preemption is disabled and the + * timer is CPU local. * - * Set rseq_cs to NULL. + * As this is most probably the first expiring timer, the cancel is + * expensive as it has to reprogram the hardware, but that's less + * expensive than going through a full hrtimer_interrupt() cycle + * for nothing. + * + * hrtimer_try_to_cancel() is sufficient here as the timer is CPU + * local and once the hrtimer code disabled interrupts the timer + * callback cannot be running. */ -#ifdef CONFIG_64BIT - return put_user(0UL, &t->rseq->rseq_cs); -#else - if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) - return -EFAULT; - return 0; -#endif + if (st->cookie == current) + hrtimer_try_to_cancel(&st->timer); } -/* - * Unsigned comparison will be true when ip >= start_ip, and when - * ip < start_ip + post_commit_offset. - */ -static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) +static inline void rseq_slice_set_need_resched(struct task_struct *curr) { - return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; + /* + * The interrupt guard is required to prevent inconsistent state in + * this case: + * + * set_tsk_need_resched() + * --> Interrupt + * wakeup() + * set_tsk_need_resched() + * set_preempt_need_resched() + * schedule_on_return() + * clear_tsk_need_resched() + * clear_preempt_need_resched() + * set_preempt_need_resched() <- Inconsistent state + * + * This is safe vs. a remote set of TIF_NEED_RESCHED because that + * only sets the already set bit and does not create inconsistent + * state. + */ + scoped_guard(irq) + set_need_resched_current(); } -static int rseq_ip_fixup(struct pt_regs *regs) +static void rseq_slice_validate_ctrl(u32 expected) { - unsigned long ip = instruction_pointer(regs); - struct task_struct *t = current; - struct rseq_cs rseq_cs; - int ret; + u32 __user *sctrl = ¤t->rseq.usrptr->slice_ctrl.all; + u32 uval; - ret = rseq_get_rseq_cs(t, &rseq_cs); - if (ret) - return ret; - - /* - * Handle potentially not being within a critical section. - * If not nested over a rseq critical section, restart is useless. - * Clear the rseq_cs pointer and return. - */ - if (!in_rseq_cs(ip, &rseq_cs)) - return clear_rseq_cs(t); - ret = rseq_need_restart(t, rseq_cs.flags); - if (ret <= 0) - return ret; - ret = clear_rseq_cs(t); - if (ret) - return ret; - trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, - rseq_cs.abort_ip); - instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); - return 0; + if (get_user(uval, sctrl) || uval != expected) + force_sig(SIGSEGV); } /* - * This resume handler must always be executed between any of: - * - preemption, - * - signal delivery, - * and return to user-space. + * Invoked from syscall entry if a time slice extension was granted and the + * kernel did not clear it before user space left the critical section. * - * This is how we can ensure that the entire rseq critical section - * will issue the commit instruction only if executed atomically with - * respect to other threads scheduled on the same CPU, and with respect - * to signal handlers. + * While the recommended way to relinquish the CPU side effect free is + * rseq_slice_yield(2), any syscall within a granted slice terminates the + * grant and immediately reschedules if required. This supports onion layer + * applications, where the code requesting the grant cannot control the + * code within the critical section. */ -void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) +void rseq_syscall_enter_work(long syscall) { - struct task_struct *t = current; - int ret, sig; + struct task_struct *curr = current; + struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted }; - if (unlikely(t->flags & PF_EXITING)) + clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); + + if (static_branch_unlikely(&rseq_debug_enabled)) + rseq_slice_validate_ctrl(ctrl.all); + + /* + * The kernel might have raced, revoked the grant and updated + * userspace, but kept the SLICE work set. + */ + if (!ctrl.granted) return; /* - * regs is NULL if and only if the caller is in a syscall path. Skip - * fixup and leave rseq_cs as is so that rseq_sycall() will detect and - * kill a misbehaving userspace on debug kernels. + * Required to stabilize the per CPU timer pointer and to make + * set_tsk_need_resched() correct on PREEMPT[RT] kernels. + * + * Leaving the scope will reschedule on preemption models FULL, + * LAZY and RT if necessary. */ - if (regs) { - ret = rseq_ip_fixup(regs); - if (unlikely(ret < 0)) - goto error; + scoped_guard(preempt) { + rseq_cancel_slice_extension_timer(); + /* + * Now that preemption is disabled, quickly check whether + * the task was already rescheduled before arriving here. + */ + if (!curr->rseq.event.sched_switch) { + rseq_slice_set_need_resched(curr); + + if (syscall == __NR_rseq_slice_yield) { + rseq_stat_inc(rseq_stats.s_yielded); + /* Update the yielded state for syscall return */ + curr->rseq.slice.yielded = 1; + } else { + rseq_stat_inc(rseq_stats.s_aborted); + } + } } - if (unlikely(rseq_update_cpu_node_id(t))) - goto error; - return; + /* Reschedule on NONE/VOLUNTARY preemption models */ + cond_resched(); -error: - sig = ksig ? ksig->sig : 0; - force_sigsegv(sig); + /* Clear the grant in kernel state and user space */ + curr->rseq.slice.state.granted = false; + if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all)) + force_sig(SIGSEGV); } -#ifdef CONFIG_DEBUG_RSEQ +int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) +{ + switch (arg2) { + case PR_RSEQ_SLICE_EXTENSION_GET: + if (arg3) + return -EINVAL; + return current->rseq.slice.state.enabled ? PR_RSEQ_SLICE_EXT_ENABLE : 0; -/* - * Terminate the process if a syscall is issued within a restartable - * sequence. + case PR_RSEQ_SLICE_EXTENSION_SET: { + u32 rflags, valid = RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + bool enable = !!(arg3 & PR_RSEQ_SLICE_EXT_ENABLE); + + if (arg3 & ~PR_RSEQ_SLICE_EXT_ENABLE) + return -EINVAL; + if (!rseq_slice_extension_enabled()) + return -ENOTSUPP; + if (!current->rseq.usrptr) + return -ENXIO; + if (!rseq_v2(current)) + return -ENOTSUPP; + + /* No change? */ + if (enable == !!current->rseq.slice.state.enabled) + return 0; + + if (get_user(rflags, ¤t->rseq.usrptr->flags)) + goto die; + + if (current->rseq.slice.state.enabled) + valid |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + + if ((rflags & valid) != valid) + goto die; + + rflags &= ~RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + rflags |= RSEQ_CS_FLAG_SLICE_EXT_AVAILABLE; + if (enable) + rflags |= RSEQ_CS_FLAG_SLICE_EXT_ENABLED; + + if (put_user(rflags, ¤t->rseq.usrptr->flags)) + goto die; + + current->rseq.slice.state.enabled = enable; + return 0; + } + default: + return -EINVAL; + } +die: + force_sig(SIGSEGV); + return -EFAULT; +} + +/** + * sys_rseq_slice_yield - yield the current processor side effect free if a + * task granted with a time slice extension is done with + * the critical work before being forced out. + * + * Return: 1 if the task successfully yielded the CPU within the granted slice. + * 0 if the slice extension was either never granted or was revoked by + * going over the granted extension, using a syscall other than this one + * or being scheduled out earlier due to a subsequent interrupt. + * + * The syscall does not schedule because the syscall entry work immediately + * relinquishes the CPU and schedules if required. */ -void rseq_syscall(struct pt_regs *regs) +SYSCALL_DEFINE0(rseq_slice_yield) { - unsigned long ip = instruction_pointer(regs); - struct task_struct *t = current; - struct rseq_cs rseq_cs; + int yielded = !!current->rseq.slice.yielded; - if (!t->rseq) - return; - if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) - force_sig(SIGSEGV); + current->rseq.slice.yielded = 0; + return yielded; } -#endif - -/* - * sys_rseq - setup restartable sequences for caller thread. - */ -SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, - int, flags, u32, sig) +static int rseq_slice_ext_show(struct seq_file *m, void *p) { - int ret; + seq_printf(m, "%d\n", rseq_slice_ext_nsecs); + return 0; +} - if (flags & RSEQ_FLAG_UNREGISTER) { - if (flags & ~RSEQ_FLAG_UNREGISTER) - return -EINVAL; - /* Unregister rseq for current thread. */ - if (current->rseq != rseq || !current->rseq) - return -EINVAL; - if (rseq_len != current->rseq_len) - return -EINVAL; - if (current->rseq_sig != sig) - return -EPERM; - ret = rseq_reset_rseq_cpu_node_id(current); - if (ret) - return ret; - current->rseq = NULL; - current->rseq_sig = 0; - current->rseq_len = 0; - return 0; - } +static ssize_t rseq_slice_ext_write(struct file *file, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned int nsecs; - if (unlikely(flags)) + if (kstrtouint_from_user(ubuf, count, 10, &nsecs)) return -EINVAL; - if (current->rseq) { - /* - * If rseq is already registered, check whether - * the provided address differs from the prior - * one. - */ - if (current->rseq != rseq || rseq_len != current->rseq_len) - return -EINVAL; - if (current->rseq_sig != sig) - return -EPERM; - /* Already registered. */ - return -EBUSY; - } + if (nsecs < rseq_slice_ext_nsecs_min) + return -ERANGE; - /* - * If there was no rseq previously registered, ensure the provided rseq - * is properly aligned, as communcated to user-space through the ELF - * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq - * size, the required alignment is the original struct rseq alignment. - * - * In order to be valid, rseq_len is either the original rseq size, or - * large enough to contain all supported fields, as communicated to - * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. - */ - if (rseq_len < ORIG_RSEQ_SIZE || - (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || - (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || - rseq_len < offsetof(struct rseq, end)))) - return -EINVAL; - if (!access_ok(rseq, rseq_len)) - return -EFAULT; - current->rseq = rseq; - current->rseq_len = rseq_len; - current->rseq_sig = sig; - /* - * If rseq was previously inactive, and has just been - * registered, ensure the cpu_id_start and cpu_id fields - * are updated before returning to user-space. - */ - rseq_set_notify_resume(current); + if (nsecs > rseq_slice_ext_nsecs_max) + return -ERANGE; + rseq_slice_ext_nsecs = nsecs; + + return count; +} + +static int rseq_slice_ext_open(struct inode *inode, struct file *file) +{ + return single_open(file, rseq_slice_ext_show, inode->i_private); +} + +static const struct file_operations slice_ext_ops = { + .open = rseq_slice_ext_open, + .read = seq_read, + .write = rseq_slice_ext_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static void rseq_slice_ext_init(struct dentry *root_dir) +{ + debugfs_create_file("slice_ext_nsec", 0644, root_dir, NULL, &slice_ext_ops); +} + +static int __init rseq_slice_cmdline(char *str) +{ + bool on; + + if (kstrtobool(str, &on)) + return 0; + + if (!on) + static_branch_disable(&rseq_slice_extension_key); + return 1; +} +__setup("rseq_slice_ext=", rseq_slice_cmdline); + +static int __init rseq_slice_init(void) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + hrtimer_setup(per_cpu_ptr(&slice_timer.timer, cpu), rseq_slice_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED_HARD); + } return 0; } +device_initcall(rseq_slice_init); +#else +static void rseq_slice_ext_init(struct dentry *root_dir) { } +#endif /* CONFIG_RSEQ_SLICE_EXTENSION */ |
