summaryrefslogtreecommitdiff
path: root/kernel/time
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-04-14 10:53:44 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-04-14 10:53:44 -0700
commitf21f7b5162e9dbde6d3d5ce727d4ca2552d76ce9 (patch)
tree2c1d858605001adedeff10f66f031e20da1db34d /kernel/time
parentc1fe867b5bf9c57ab7856486d342720e2b205eed (diff)
parent7138a8698a39e81eb153e05500823fff76d5b3bd (diff)
downloadlwn-f21f7b5162e9dbde6d3d5ce727d4ca2552d76ce9.tar.gz
lwn-f21f7b5162e9dbde6d3d5ce727d4ca2552d76ce9.zip
Merge tag 'timers-vdso-2026-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull vdso updates from Thomas Gleixner: - Make the handling of compat functions consistent and more robust - Rework the underlying data store so that it is dynamically allocated, which allows the conversion of the last holdout SPARC64 to the generic VDSO implementation - Rework the SPARC64 VDSO to utilize the generic implementation - Mop up the left overs of the non-generic VDSO support in the core code - Expand the VDSO selftest and make them more robust - Allow time namespaces to be enabled independently of the generic VDSO support, which was not possible before due to SPARC64 not using it - Various cleanups and improvements in the related code * tag 'timers-vdso-2026-04-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (51 commits) timens: Use task_lock guard in timens_get*() timens: Use mutex guard in proc_timens_set_offset() timens: Simplify some calls to put_time_ns() timens: Add a __free() wrapper for put_time_ns() timens: Remove dependency on the vDSO vdso/timens: Move functions to new file selftests: vDSO: vdso_test_correctness: Add a test for time() selftests: vDSO: vdso_test_correctness: Use facilities from parse_vdso.c selftests: vDSO: vdso_test_correctness: Handle different tv_usec types selftests: vDSO: vdso_test_correctness: Drop SYS_getcpu fallbacks selftests: vDSO: vdso_test_gettimeofday: Remove nolibc checks Revert "selftests: vDSO: parse_vdso: Use UAPI headers instead of libc headers" random: vDSO: Remove ifdeffery random: vDSO: Trim vDSO includes vdso/datapage: Trim down unnecessary includes vdso/datapage: Remove inclusion of gettimeofday.h vdso/helpers: Explicitly include vdso/processor.h vdso/gettimeofday: Add explicit includes random: vDSO: Add explicit includes MIPS: vdso: Explicitly include asm/vdso/vdso.h ...
Diffstat (limited to 'kernel/time')
-rw-r--r--kernel/time/Kconfig4
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/namespace.c203
-rw-r--r--kernel/time/namespace_internal.h28
-rw-r--r--kernel/time/namespace_vdso.c160
5 files changed, 225 insertions, 171 deletions
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 6a11964377e6..02aac7c5aa76 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -9,10 +9,6 @@
config CLOCKSOURCE_WATCHDOG
bool
-# Architecture has extra clocksource data
-config ARCH_CLOCKSOURCE_DATA
- bool
-
# Architecture has extra clocksource init called from registration
config ARCH_CLOCKSOURCE_INIT
bool
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index f7d52d9543cc..eaf290c972f9 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -30,5 +30,6 @@ obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
obj-$(CONFIG_TIME_NS) += namespace.o
+obj-$(CONFIG_TIME_NS_VDSO) += namespace_vdso.o
obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o
obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 652744e00eb4..4bca3f78c8ea 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -18,8 +18,9 @@
#include <linux/cred.h>
#include <linux/err.h>
#include <linux/mm.h>
+#include <linux/cleanup.h>
-#include <vdso/datapage.h>
+#include "namespace_internal.h"
ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
struct timens_offsets *ns_offsets)
@@ -93,8 +94,8 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
if (!ns)
goto fail_dec;
- ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!ns->vvar_page)
+ err = timens_vdso_alloc_vvar_page(ns);
+ if (err)
goto fail_free;
err = ns_common_init(ns);
@@ -109,7 +110,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
return ns;
fail_free_page:
- __free_page(ns->vvar_page);
+ timens_vdso_free_vvar_page(ns);
fail_free:
kfree(ns);
fail_dec:
@@ -138,117 +139,7 @@ struct time_namespace *copy_time_ns(u64 flags,
return clone_time_ns(user_ns, old_ns);
}
-static struct timens_offset offset_from_ts(struct timespec64 off)
-{
- struct timens_offset ret;
-
- ret.sec = off.tv_sec;
- ret.nsec = off.tv_nsec;
-
- return ret;
-}
-
-/*
- * A time namespace VVAR page has the same layout as the VVAR page which
- * contains the system wide VDSO data.
- *
- * For a normal task the VVAR pages are installed in the normal ordering:
- * VVAR
- * PVCLOCK
- * HVCLOCK
- * TIMENS <- Not really required
- *
- * Now for a timens task the pages are installed in the following order:
- * TIMENS
- * PVCLOCK
- * HVCLOCK
- * VVAR
- *
- * The check for vdso_clock->clock_mode is in the unlikely path of
- * the seq begin magic. So for the non-timens case most of the time
- * 'seq' is even, so the branch is not taken.
- *
- * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
- * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
- * update to finish and for 'seq' to become even anyway.
- *
- * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
- * enforces the time namespace handling path.
- */
-static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
- struct time_namespace *ns)
-{
- struct timens_offset *offset = vc->offset;
- struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
- struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
-
- vc->seq = 1;
- vc->clock_mode = VDSO_CLOCKMODE_TIMENS;
- offset[CLOCK_MONOTONIC] = monotonic;
- offset[CLOCK_MONOTONIC_RAW] = monotonic;
- offset[CLOCK_MONOTONIC_COARSE] = monotonic;
- offset[CLOCK_BOOTTIME] = boottime;
- offset[CLOCK_BOOTTIME_ALARM] = boottime;
-}
-
-struct page *find_timens_vvar_page(struct vm_area_struct *vma)
-{
- if (likely(vma->vm_mm == current->mm))
- return current->nsproxy->time_ns->vvar_page;
-
- /*
- * VM_PFNMAP | VM_IO protect .fault() handler from being called
- * through interfaces like /proc/$pid/mem or
- * process_vm_{readv,writev}() as long as there's no .access()
- * in special_mapping_vmops().
- * For more details check_vma_flags() and __access_remote_vm()
- */
-
- WARN(1, "vvar_page accessed remotely");
-
- return NULL;
-}
-
-/*
- * Protects possibly multiple offsets writers racing each other
- * and tasks entering the namespace.
- */
-static DEFINE_MUTEX(offset_lock);
-
-static void timens_set_vvar_page(struct task_struct *task,
- struct time_namespace *ns)
-{
- struct vdso_time_data *vdata;
- struct vdso_clock *vc;
- unsigned int i;
-
- if (ns == &init_time_ns)
- return;
-
- /* Fast-path, taken by every task in namespace except the first. */
- if (likely(ns->frozen_offsets))
- return;
-
- mutex_lock(&offset_lock);
- /* Nothing to-do: vvar_page has been already initialized. */
- if (ns->frozen_offsets)
- goto out;
-
- ns->frozen_offsets = true;
- vdata = page_address(ns->vvar_page);
- vc = vdata->clock_data;
-
- for (i = 0; i < CS_BASES; i++)
- timens_setup_vdso_clock_data(&vc[i], ns);
-
- if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
- for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
- timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
- }
-
-out:
- mutex_unlock(&offset_lock);
-}
+DEFINE_MUTEX(timens_offset_lock);
void free_time_ns(struct time_namespace *ns)
{
@@ -256,41 +147,39 @@ void free_time_ns(struct time_namespace *ns)
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_common_free(ns);
- __free_page(ns->vvar_page);
+ timens_vdso_free_vvar_page(ns);
/* Concurrent nstree traversal depends on a grace period. */
kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *timens_get(struct task_struct *task)
{
- struct time_namespace *ns = NULL;
+ struct time_namespace *ns;
struct nsproxy *nsproxy;
- task_lock(task);
+ guard(task_lock)(task);
nsproxy = task->nsproxy;
- if (nsproxy) {
- ns = nsproxy->time_ns;
- get_time_ns(ns);
- }
- task_unlock(task);
+ if (!nsproxy)
+ return NULL;
- return ns ? &ns->ns : NULL;
+ ns = nsproxy->time_ns;
+ get_time_ns(ns);
+ return &ns->ns;
}
static struct ns_common *timens_for_children_get(struct task_struct *task)
{
- struct time_namespace *ns = NULL;
+ struct time_namespace *ns;
struct nsproxy *nsproxy;
- task_lock(task);
+ guard(task_lock)(task);
nsproxy = task->nsproxy;
- if (nsproxy) {
- ns = nsproxy->time_ns_for_children;
- get_time_ns(ns);
- }
- task_unlock(task);
+ if (!nsproxy)
+ return NULL;
- return ns ? &ns->ns : NULL;
+ ns = nsproxy->time_ns_for_children;
+ get_time_ns(ns);
+ return &ns->ns;
}
static void timens_put(struct ns_common *ns)
@@ -298,12 +187,6 @@ static void timens_put(struct ns_common *ns)
put_time_ns(to_time_ns(ns));
}
-void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
-{
- timens_set_vvar_page(tsk, ns);
- vdso_join_timens(tsk, ns);
-}
-
static int timens_install(struct nsset *nsset, struct ns_common *new)
{
struct nsproxy *nsproxy = nsset->nsproxy;
@@ -367,36 +250,33 @@ static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
{
- struct ns_common *ns;
- struct time_namespace *time_ns;
+ struct time_namespace *time_ns __free(time_ns) = NULL;
+ struct ns_common *ns = timens_for_children_get(p);
- ns = timens_for_children_get(p);
if (!ns)
return;
+
time_ns = to_time_ns(ns);
show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic);
show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime);
- put_time_ns(time_ns);
}
int proc_timens_set_offset(struct file *file, struct task_struct *p,
struct proc_timens_offset *offsets, int noffsets)
{
- struct ns_common *ns;
- struct time_namespace *time_ns;
+ struct time_namespace *time_ns __free(time_ns) = NULL;
+ struct ns_common *ns = timens_for_children_get(p);
struct timespec64 tp;
- int i, err;
+ int i;
- ns = timens_for_children_get(p);
if (!ns)
return -ESRCH;
+
time_ns = to_time_ns(ns);
- if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
- put_time_ns(time_ns);
+ if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME))
return -EPERM;
- }
for (i = 0; i < noffsets; i++) {
struct proc_timens_offset *off = &offsets[i];
@@ -409,15 +289,12 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
ktime_get_boottime_ts64(&tp);
break;
default:
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
- err = -ERANGE;
-
if (off->val.tv_sec > KTIME_SEC_MAX ||
off->val.tv_sec < -KTIME_SEC_MAX)
- goto out;
+ return -ERANGE;
tp = timespec64_add(tp, off->val);
/*
@@ -425,16 +302,13 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
* still unreachable.
*/
if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
- goto out;
+ return -ERANGE;
}
- mutex_lock(&offset_lock);
- if (time_ns->frozen_offsets) {
- err = -EACCES;
- goto out_unlock;
- }
+ guard(mutex)(&timens_offset_lock);
+ if (time_ns->frozen_offsets)
+ return -EACCES;
- err = 0;
/* Don't report errors after this line */
for (i = 0; i < noffsets; i++) {
struct proc_timens_offset *off = &offsets[i];
@@ -452,12 +326,7 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
*offset = off->val;
}
-out_unlock:
- mutex_unlock(&offset_lock);
-out:
- put_time_ns(time_ns);
-
- return err;
+ return 0;
}
const struct proc_ns_operations timens_operations = {
diff --git a/kernel/time/namespace_internal.h b/kernel/time/namespace_internal.h
new file mode 100644
index 000000000000..b37ba179f43b
--- /dev/null
+++ b/kernel/time/namespace_internal.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TIME_NAMESPACE_INTERNAL_H
+#define _TIME_NAMESPACE_INTERNAL_H
+
+#include <linux/mutex.h>
+
+struct time_namespace;
+
+/*
+ * Protects possibly multiple offsets writers racing each other
+ * and tasks entering the namespace.
+ */
+extern struct mutex timens_offset_lock;
+
+#ifdef CONFIG_TIME_NS_VDSO
+int timens_vdso_alloc_vvar_page(struct time_namespace *ns);
+void timens_vdso_free_vvar_page(struct time_namespace *ns);
+#else /* !CONFIG_TIME_NS_VDSO */
+static inline int timens_vdso_alloc_vvar_page(struct time_namespace *ns)
+{
+ return 0;
+}
+static inline void timens_vdso_free_vvar_page(struct time_namespace *ns)
+{
+}
+#endif /* CONFIG_TIME_NS_VDSO */
+
+#endif /* _TIME_NAMESPACE_INTERNAL_H */
diff --git a/kernel/time/namespace_vdso.c b/kernel/time/namespace_vdso.c
new file mode 100644
index 000000000000..88c075cd16a3
--- /dev/null
+++ b/kernel/time/namespace_vdso.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: Andrei Vagin <avagin@openvz.org>
+ * Author: Dmitry Safonov <dima@arista.com>
+ */
+
+#include <linux/cleanup.h>
+#include <linux/mm.h>
+#include <linux/time_namespace.h>
+#include <linux/time.h>
+#include <linux/vdso_datastore.h>
+
+#include <vdso/clocksource.h>
+#include <vdso/datapage.h>
+
+#include "namespace_internal.h"
+
+static struct timens_offset offset_from_ts(struct timespec64 off)
+{
+ struct timens_offset ret;
+
+ ret.sec = off.tv_sec;
+ ret.nsec = off.tv_nsec;
+
+ return ret;
+}
+
+/*
+ * A time namespace VVAR page has the same layout as the VVAR page which
+ * contains the system wide VDSO data.
+ *
+ * For a normal task the VVAR pages are installed in the normal ordering:
+ * VVAR
+ * PVCLOCK
+ * HVCLOCK
+ * TIMENS <- Not really required
+ *
+ * Now for a timens task the pages are installed in the following order:
+ * TIMENS
+ * PVCLOCK
+ * HVCLOCK
+ * VVAR
+ *
+ * The check for vdso_clock->clock_mode is in the unlikely path of
+ * the seq begin magic. So for the non-timens case most of the time
+ * 'seq' is even, so the branch is not taken.
+ *
+ * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
+ * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
+ * update to finish and for 'seq' to become even anyway.
+ *
+ * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which
+ * enforces the time namespace handling path.
+ */
+static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
+ struct time_namespace *ns)
+{
+ struct timens_offset *offset = vc->offset;
+ struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
+ struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
+
+ vc->seq = 1;
+ vc->clock_mode = VDSO_CLOCKMODE_TIMENS;
+ offset[CLOCK_MONOTONIC] = monotonic;
+ offset[CLOCK_MONOTONIC_RAW] = monotonic;
+ offset[CLOCK_MONOTONIC_COARSE] = monotonic;
+ offset[CLOCK_BOOTTIME] = boottime;
+ offset[CLOCK_BOOTTIME_ALARM] = boottime;
+}
+
+struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_mm == current->mm))
+ return current->nsproxy->time_ns->vvar_page;
+
+ /*
+ * VM_PFNMAP | VM_IO protect .fault() handler from being called
+ * through interfaces like /proc/$pid/mem or
+ * process_vm_{readv,writev}() as long as there's no .access()
+ * in special_mapping_vmops().
+ * For more details check_vma_flags() and __access_remote_vm()
+ */
+
+ WARN(1, "vvar_page accessed remotely");
+
+ return NULL;
+}
+
+static void timens_set_vvar_page(struct task_struct *task,
+ struct time_namespace *ns)
+{
+ struct vdso_time_data *vdata;
+ struct vdso_clock *vc;
+ unsigned int i;
+
+ if (ns == &init_time_ns)
+ return;
+
+ /* Fast-path, taken by every task in namespace except the first. */
+ if (likely(ns->frozen_offsets))
+ return;
+
+ guard(mutex)(&timens_offset_lock);
+ /* Nothing to-do: vvar_page has been already initialized. */
+ if (ns->frozen_offsets)
+ return;
+
+ ns->frozen_offsets = true;
+ vdata = page_address(ns->vvar_page);
+ vc = vdata->clock_data;
+
+ for (i = 0; i < CS_BASES; i++)
+ timens_setup_vdso_clock_data(&vc[i], ns);
+
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
+ for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
+ timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
+ }
+}
+
+/*
+ * The vvar page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will be re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_clock_data() for details.
+ */
+static int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+ struct mm_struct *mm = task->mm;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ guard(mmap_read_lock)(mm);
+ for_each_vma(vmi, vma) {
+ if (vma_is_special_mapping(vma, &vdso_vvar_mapping))
+ zap_vma_pages(vma);
+ }
+ return 0;
+}
+
+void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
+{
+ timens_set_vvar_page(tsk, ns);
+ vdso_join_timens(tsk, ns);
+}
+
+int timens_vdso_alloc_vvar_page(struct time_namespace *ns)
+{
+ ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+ if (!ns->vvar_page)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void timens_vdso_free_vvar_page(struct time_namespace *ns)
+{
+ __free_page(ns->vvar_page);
+}