diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-14 10:53:44 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-04-14 10:53:44 -0700 |
| commit | f21f7b5162e9dbde6d3d5ce727d4ca2552d76ce9 (patch) | |
| tree | 2c1d858605001adedeff10f66f031e20da1db34d /kernel/time | |
| parent | c1fe867b5bf9c57ab7856486d342720e2b205eed (diff) | |
| parent | 7138a8698a39e81eb153e05500823fff76d5b3bd (diff) | |
| download | lwn-f21f7b5162e9dbde6d3d5ce727d4ca2552d76ce9.tar.gz lwn-f21f7b5162e9dbde6d3d5ce727d4ca2552d76ce9.zip | |
Merge tag 'timers-vdso-2026-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull vdso updates from Thomas Gleixner:
- Make the handling of compat functions consistent and more robust
- Rework the underlying data store so that it is dynamically allocated,
which allows the conversion of the last holdout SPARC64 to the
generic VDSO implementation
- Rework the SPARC64 VDSO to utilize the generic implementation
- Mop up the leftovers of the non-generic VDSO support in the core
code
- Expand the VDSO selftests and make them more robust
- Allow time namespaces to be enabled independently of the generic VDSO
support, which was not possible before due to SPARC64 not using it
- Various cleanups and improvements in the related code
* tag 'timers-vdso-2026-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits)
timens: Use task_lock guard in timens_get*()
timens: Use mutex guard in proc_timens_set_offset()
timens: Simplify some calls to put_time_ns()
timens: Add a __free() wrapper for put_time_ns()
timens: Remove dependency on the vDSO
vdso/timens: Move functions to new file
selftests: vDSO: vdso_test_correctness: Add a test for time()
selftests: vDSO: vdso_test_correctness: Use facilities from parse_vdso.c
selftests: vDSO: vdso_test_correctness: Handle different tv_usec types
selftests: vDSO: vdso_test_correctness: Drop SYS_getcpu fallbacks
selftests: vDSO: vdso_test_gettimeofday: Remove nolibc checks
Revert "selftests: vDSO: parse_vdso: Use UAPI headers instead of libc headers"
random: vDSO: Remove ifdeffery
random: vDSO: Trim vDSO includes
vdso/datapage: Trim down unnecessary includes
vdso/datapage: Remove inclusion of gettimeofday.h
vdso/helpers: Explicitly include vdso/processor.h
vdso/gettimeofday: Add explicit includes
random: vDSO: Add explicit includes
MIPS: vdso: Explicitly include asm/vdso/vdso.h
...
Diffstat (limited to 'kernel/time')
| -rw-r--r-- | kernel/time/Kconfig | 4 | ||||
| -rw-r--r-- | kernel/time/Makefile | 1 | ||||
| -rw-r--r-- | kernel/time/namespace.c | 203 | ||||
| -rw-r--r-- | kernel/time/namespace_internal.h | 28 | ||||
| -rw-r--r-- | kernel/time/namespace_vdso.c | 160 |
5 files changed, 225 insertions, 171 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 6a11964377e6..02aac7c5aa76 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -9,10 +9,6 @@ config CLOCKSOURCE_WATCHDOG bool -# Architecture has extra clocksource data -config ARCH_CLOCKSOURCE_DATA - bool - # Architecture has extra clocksource init called from registration config ARCH_CLOCKSOURCE_INIT bool diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f7d52d9543cc..eaf290c972f9 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -30,5 +30,6 @@ obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o obj-$(CONFIG_TIME_NS) += namespace.o +obj-$(CONFIG_TIME_NS_VDSO) += namespace_vdso.o obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 652744e00eb4..4bca3f78c8ea 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -18,8 +18,9 @@ #include <linux/cred.h> #include <linux/err.h> #include <linux/mm.h> +#include <linux/cleanup.h> -#include <vdso/datapage.h> +#include "namespace_internal.h" ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *ns_offsets) @@ -93,8 +94,8 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!ns->vvar_page) + err = timens_vdso_alloc_vvar_page(ns); + if (err) goto fail_free; err = ns_common_init(ns); @@ -109,7 +110,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, return ns; fail_free_page: - __free_page(ns->vvar_page); + timens_vdso_free_vvar_page(ns); fail_free: kfree(ns); fail_dec: @@ -138,117 +139,7 @@ struct time_namespace *copy_time_ns(u64 flags, return clone_time_ns(user_ns, old_ns); } -static struct timens_offset offset_from_ts(struct 
timespec64 off) -{ - struct timens_offset ret; - - ret.sec = off.tv_sec; - ret.nsec = off.tv_nsec; - - return ret; -} - -/* - * A time namespace VVAR page has the same layout as the VVAR page which - * contains the system wide VDSO data. - * - * For a normal task the VVAR pages are installed in the normal ordering: - * VVAR - * PVCLOCK - * HVCLOCK - * TIMENS <- Not really required - * - * Now for a timens task the pages are installed in the following order: - * TIMENS - * PVCLOCK - * HVCLOCK - * VVAR - * - * The check for vdso_clock->clock_mode is in the unlikely path of - * the seq begin magic. So for the non-timens case most of the time - * 'seq' is even, so the branch is not taken. - * - * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check - * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the - * update to finish and for 'seq' to become even anyway. - * - * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which - * enforces the time namespace handling path. - */ -static void timens_setup_vdso_clock_data(struct vdso_clock *vc, - struct time_namespace *ns) -{ - struct timens_offset *offset = vc->offset; - struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); - struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); - - vc->seq = 1; - vc->clock_mode = VDSO_CLOCKMODE_TIMENS; - offset[CLOCK_MONOTONIC] = monotonic; - offset[CLOCK_MONOTONIC_RAW] = monotonic; - offset[CLOCK_MONOTONIC_COARSE] = monotonic; - offset[CLOCK_BOOTTIME] = boottime; - offset[CLOCK_BOOTTIME_ALARM] = boottime; -} - -struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - if (likely(vma->vm_mm == current->mm)) - return current->nsproxy->time_ns->vvar_page; - - /* - * VM_PFNMAP | VM_IO protect .fault() handler from being called - * through interfaces like /proc/$pid/mem or - * process_vm_{readv,writev}() as long as there's no .access() - * in special_mapping_vmops(). 
- * For more details check_vma_flags() and __access_remote_vm() - */ - - WARN(1, "vvar_page accessed remotely"); - - return NULL; -} - -/* - * Protects possibly multiple offsets writers racing each other - * and tasks entering the namespace. - */ -static DEFINE_MUTEX(offset_lock); - -static void timens_set_vvar_page(struct task_struct *task, - struct time_namespace *ns) -{ - struct vdso_time_data *vdata; - struct vdso_clock *vc; - unsigned int i; - - if (ns == &init_time_ns) - return; - - /* Fast-path, taken by every task in namespace except the first. */ - if (likely(ns->frozen_offsets)) - return; - - mutex_lock(&offset_lock); - /* Nothing to-do: vvar_page has been already initialized. */ - if (ns->frozen_offsets) - goto out; - - ns->frozen_offsets = true; - vdata = page_address(ns->vvar_page); - vc = vdata->clock_data; - - for (i = 0; i < CS_BASES; i++) - timens_setup_vdso_clock_data(&vc[i], ns); - - if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { - for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) - timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); - } - -out: - mutex_unlock(&offset_lock); -} +DEFINE_MUTEX(timens_offset_lock); void free_time_ns(struct time_namespace *ns) { @@ -256,41 +147,39 @@ void free_time_ns(struct time_namespace *ns) dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_common_free(ns); - __free_page(ns->vvar_page); + timens_vdso_free_vvar_page(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) { - struct time_namespace *ns = NULL; + struct time_namespace *ns; struct nsproxy *nsproxy; - task_lock(task); + guard(task_lock)(task); nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->time_ns; - get_time_ns(ns); - } - task_unlock(task); + if (!nsproxy) + return NULL; - return ns ? 
&ns->ns : NULL; + ns = nsproxy->time_ns; + get_time_ns(ns); + return &ns->ns; } static struct ns_common *timens_for_children_get(struct task_struct *task) { - struct time_namespace *ns = NULL; + struct time_namespace *ns; struct nsproxy *nsproxy; - task_lock(task); + guard(task_lock)(task); nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->time_ns_for_children; - get_time_ns(ns); - } - task_unlock(task); + if (!nsproxy) + return NULL; - return ns ? &ns->ns : NULL; + ns = nsproxy->time_ns_for_children; + get_time_ns(ns); + return &ns->ns; } static void timens_put(struct ns_common *ns) @@ -298,12 +187,6 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } -void timens_commit(struct task_struct *tsk, struct time_namespace *ns) -{ - timens_set_vvar_page(tsk, ns); - vdso_join_timens(tsk, ns); -} - static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; @@ -367,36 +250,33 @@ static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) { - struct ns_common *ns; - struct time_namespace *time_ns; + struct time_namespace *time_ns __free(time_ns) = NULL; + struct ns_common *ns = timens_for_children_get(p); - ns = timens_for_children_get(p); if (!ns) return; + time_ns = to_time_ns(ns); show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); - put_time_ns(time_ns); } int proc_timens_set_offset(struct file *file, struct task_struct *p, struct proc_timens_offset *offsets, int noffsets) { - struct ns_common *ns; - struct time_namespace *time_ns; + struct time_namespace *time_ns __free(time_ns) = NULL; + struct ns_common *ns = timens_for_children_get(p); struct timespec64 tp; - int i, err; + int i; - ns = timens_for_children_get(p); if (!ns) return -ESRCH; + time_ns = to_time_ns(ns); - if (!file_ns_capable(file, time_ns->user_ns, 
CAP_SYS_TIME)) { - put_time_ns(time_ns); + if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) return -EPERM; - } for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; @@ -409,15 +289,12 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, ktime_get_boottime_ts64(&tp); break; default: - err = -EINVAL; - goto out; + return -EINVAL; } - err = -ERANGE; - if (off->val.tv_sec > KTIME_SEC_MAX || off->val.tv_sec < -KTIME_SEC_MAX) - goto out; + return -ERANGE; tp = timespec64_add(tp, off->val); /* @@ -425,16 +302,13 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, * still unreachable. */ if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) - goto out; + return -ERANGE; } - mutex_lock(&offset_lock); - if (time_ns->frozen_offsets) { - err = -EACCES; - goto out_unlock; - } + guard(mutex)(&timens_offset_lock); + if (time_ns->frozen_offsets) + return -EACCES; - err = 0; /* Don't report errors after this line */ for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; @@ -452,12 +326,7 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, *offset = off->val; } -out_unlock: - mutex_unlock(&offset_lock); -out: - put_time_ns(time_ns); - - return err; + return 0; } const struct proc_ns_operations timens_operations = { diff --git a/kernel/time/namespace_internal.h b/kernel/time/namespace_internal.h new file mode 100644 index 000000000000..b37ba179f43b --- /dev/null +++ b/kernel/time/namespace_internal.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TIME_NAMESPACE_INTERNAL_H +#define _TIME_NAMESPACE_INTERNAL_H + +#include <linux/mutex.h> + +struct time_namespace; + +/* + * Protects possibly multiple offsets writers racing each other + * and tasks entering the namespace. 
+ */ +extern struct mutex timens_offset_lock; + +#ifdef CONFIG_TIME_NS_VDSO +int timens_vdso_alloc_vvar_page(struct time_namespace *ns); +void timens_vdso_free_vvar_page(struct time_namespace *ns); +#else /* !CONFIG_TIME_NS_VDSO */ +static inline int timens_vdso_alloc_vvar_page(struct time_namespace *ns) +{ + return 0; +} +static inline void timens_vdso_free_vvar_page(struct time_namespace *ns) +{ +} +#endif /* CONFIG_TIME_NS_VDSO */ + +#endif /* _TIME_NAMESPACE_INTERNAL_H */ diff --git a/kernel/time/namespace_vdso.c b/kernel/time/namespace_vdso.c new file mode 100644 index 000000000000..88c075cd16a3 --- /dev/null +++ b/kernel/time/namespace_vdso.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: Andrei Vagin <avagin@openvz.org> + * Author: Dmitry Safonov <dima@arista.com> + */ + +#include <linux/cleanup.h> +#include <linux/mm.h> +#include <linux/time_namespace.h> +#include <linux/time.h> +#include <linux/vdso_datastore.h> + +#include <vdso/clocksource.h> +#include <vdso/datapage.h> + +#include "namespace_internal.h" + +static struct timens_offset offset_from_ts(struct timespec64 off) +{ + struct timens_offset ret; + + ret.sec = off.tv_sec; + ret.nsec = off.tv_nsec; + + return ret; +} + +/* + * A time namespace VVAR page has the same layout as the VVAR page which + * contains the system wide VDSO data. + * + * For a normal task the VVAR pages are installed in the normal ordering: + * VVAR + * PVCLOCK + * HVCLOCK + * TIMENS <- Not really required + * + * Now for a timens task the pages are installed in the following order: + * TIMENS + * PVCLOCK + * HVCLOCK + * VVAR + * + * The check for vdso_clock->clock_mode is in the unlikely path of + * the seq begin magic. So for the non-timens case most of the time + * 'seq' is even, so the branch is not taken. + * + * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check + * for vdso_clock->clock_mode is a non-issue. 
The task is spin waiting for the + * update to finish and for 'seq' to become even anyway. + * + * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * enforces the time namespace handling path. + */ +static void timens_setup_vdso_clock_data(struct vdso_clock *vc, + struct time_namespace *ns) +{ + struct timens_offset *offset = vc->offset; + struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); + struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); + + vc->seq = 1; + vc->clock_mode = VDSO_CLOCKMODE_TIMENS; + offset[CLOCK_MONOTONIC] = monotonic; + offset[CLOCK_MONOTONIC_RAW] = monotonic; + offset[CLOCK_MONOTONIC_COARSE] = monotonic; + offset[CLOCK_BOOTTIME] = boottime; + offset[CLOCK_BOOTTIME_ALARM] = boottime; +} + +struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops(). + * For more details check_vma_flags() and __access_remote_vm() + */ + + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} + +static void timens_set_vvar_page(struct task_struct *task, + struct time_namespace *ns) +{ + struct vdso_time_data *vdata; + struct vdso_clock *vc; + unsigned int i; + + if (ns == &init_time_ns) + return; + + /* Fast-path, taken by every task in namespace except the first. */ + if (likely(ns->frozen_offsets)) + return; + + guard(mutex)(&timens_offset_lock); + /* Nothing to-do: vvar_page has been already initialized. 
*/ + if (ns->frozen_offsets) + return; + + ns->frozen_offsets = true; + vdata = page_address(ns->vvar_page); + vc = vdata->clock_data; + + for (i = 0; i < CS_BASES; i++) + timens_setup_vdso_clock_data(&vc[i], ns); + + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { + for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) + timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); + } +} + +/* + * The vvar page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will be re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_clock_data() for details. + */ +static int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + guard(mmap_read_lock)(mm); + for_each_vma(vmi, vma) { + if (vma_is_special_mapping(vma, &vdso_vvar_mapping)) + zap_vma_pages(vma); + } + return 0; +} + +void timens_commit(struct task_struct *tsk, struct time_namespace *ns) +{ + timens_set_vvar_page(tsk, ns); + vdso_join_timens(tsk, ns); +} + +int timens_vdso_alloc_vvar_page(struct time_namespace *ns) +{ + ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!ns->vvar_page) + return -ENOMEM; + + return 0; +} + +void timens_vdso_free_vvar_page(struct time_namespace *ns) +{ + __free_page(ns->vvar_page); +} |
