summary | refs | log | tree | commit | diff
path: root/kernel/time
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/time')
-rw-r--r-- kernel/time/.kunitconfig 2
-rw-r--r-- kernel/time/Kconfig 45
-rw-r--r-- kernel/time/Makefile 9
-rw-r--r-- kernel/time/alarmtimer.c 98
-rw-r--r-- kernel/time/clockevents.c 80
-rw-r--r-- kernel/time/clocksource-wdtest.c 268
-rw-r--r-- kernel/time/clocksource.c 815
-rw-r--r-- kernel/time/hrtimer.c 1284
-rw-r--r-- kernel/time/itimer.c 3
-rw-r--r-- kernel/time/jiffies.c 223
-rw-r--r-- kernel/time/namespace.c 229
-rw-r--r-- kernel/time/namespace_internal.h 28
-rw-r--r-- kernel/time/namespace_vdso.c 160
-rw-r--r-- kernel/time/ntp.c 75
-rw-r--r-- kernel/time/ntp_internal.h 13
-rw-r--r-- kernel/time/posix-clock.c 29
-rw-r--r-- kernel/time/posix-cpu-timers.c 13
-rw-r--r-- kernel/time/posix-timers.c 580
-rw-r--r-- kernel/time/posix-timers.h 1
-rw-r--r-- kernel/time/sched_clock.c 31
-rw-r--r-- kernel/time/sleep_timeout.c 6
-rw-r--r-- kernel/time/tick-broadcast-hrtimer.c 4
-rw-r--r-- kernel/time/tick-broadcast.c 11
-rw-r--r-- kernel/time/tick-common.c 41
-rw-r--r-- kernel/time/tick-internal.h 4
-rw-r--r-- kernel/time/tick-oneshot.c 22
-rw-r--r-- kernel/time/tick-sched.c 88
-rw-r--r-- kernel/time/time.c 24
-rw-r--r-- kernel/time/time_test.c 4
-rw-r--r-- kernel/time/timecounter.c 37
-rw-r--r-- kernel/time/timekeeping.c 913
-rw-r--r-- kernel/time/timekeeping.h 2
-rw-r--r-- kernel/time/timekeeping_internal.h 3
-rw-r--r-- kernel/time/timer.c 102
-rw-r--r-- kernel/time/timer_list.c 22
-rw-r--r-- kernel/time/timer_migration.c 548
-rw-r--r-- kernel/time/timer_migration.h 2
-rw-r--r-- kernel/time/vsyscall.c 120
38 files changed, 3712 insertions, 2227 deletions
diff --git a/kernel/time/.kunitconfig b/kernel/time/.kunitconfig
new file mode 100644
index 000000000000..d60a611b2853
--- /dev/null
+++ b/kernel/time/.kunitconfig
@@ -0,0 +1,2 @@
+CONFIG_KUNIT=y
+CONFIG_TIME_KUNIT_TEST=y
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b0b97a60aaa6..02aac7c5aa76 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -9,14 +9,13 @@
config CLOCKSOURCE_WATCHDOG
bool
-# Architecture has extra clocksource data
-config ARCH_CLOCKSOURCE_DATA
- bool
-
# Architecture has extra clocksource init called from registration
config ARCH_CLOCKSOURCE_INIT
bool
+config ARCH_WANTS_CLOCKSOURCE_READ_INLINE
+ bool
+
# Timekeeping vsyscall support
config GENERIC_TIME_VSYSCALL
bool
@@ -44,10 +43,23 @@ config GENERIC_CLOCKEVENTS_BROADCAST_IDLE
config GENERIC_CLOCKEVENTS_MIN_ADJUST
bool
+config GENERIC_CLOCKEVENTS_COUPLED
+ bool
+
+config GENERIC_CLOCKEVENTS_COUPLED_INLINE
+ select GENERIC_CLOCKEVENTS_COUPLED
+ bool
+
# Generic update of CMOS clock
config GENERIC_CMOS_UPDATE
bool
+# Deferred rearming of the hrtimer interrupt
+config HRTIMER_REARM_DEFERRED
+ def_bool y
+ depends on GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS
+ depends on HIGH_RES_TIMERS && SCHED_HRTICK
+
# Select to handle posix CPU timers from task_work
# and not from the timer interrupt context
config HAVE_POSIX_CPU_TIMERS_TASK_WORK
@@ -82,9 +94,9 @@ config CONTEXT_TRACKING_IDLE
help
Tracks idle state on behalf of RCU.
-if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
+if GENERIC_CLOCKEVENTS
# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
# only related to the tick functionality. Oneshot clockevent devices
# are supported independent of this.
@@ -196,18 +208,17 @@ config HIGH_RES_TIMERS
hardware is not capable then this option only increases
the size of the kernel image.
-config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
- int "Clocksource watchdog maximum allowable skew (in microseconds)"
- depends on CLOCKSOURCE_WATCHDOG
- range 50 1000
- default 125
+endif
+
+config POSIX_AUX_CLOCKS
+ bool "Enable auxiliary POSIX clocks"
+ depends on POSIX_TIMERS
help
- Specify the maximum amount of allowable watchdog skew in
- microseconds before reporting the clocksource to be unstable.
- The default is based on a half-second clocksource watchdog
- interval and NTP's maximum frequency drift of 500 parts
- per million. If the clocksource is good enough for NTP,
- it is good enough for the clocksource watchdog!
+ Auxiliary POSIX clocks are clocks which can be steered
+ independently of the core timekeeper, which controls the
+ MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to
+ provide e.g. lockless time accessors to independent PTP clocks
+ and other clock domains, which are not correlated to the TAI/NTP
+ notion of time.
endmenu
-endif
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index fe0ae82124fe..eaf290c972f9 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
+
+# Branch profiling isn't noinstr-safe
+ifdef CONFIG_TRACE_BRANCH_PROFILING
+CFLAGS_sched_clock.o += -DDISABLE_BRANCH_PROFILING
+endif
+
obj-y += time.o timer.o hrtimer.o sleep_timeout.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-y += timeconv.o timecounter.o alarmtimer.o
@@ -20,9 +26,10 @@ obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o
ifeq ($(CONFIG_SMP),y)
obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o
endif
-obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o
+obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
obj-$(CONFIG_TIME_NS) += namespace.o
+obj-$(CONFIG_TIME_NS_VDSO) += namespace_vdso.o
obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o
obj-$(CONFIG_TIME_KUNIT_TEST) += time_test.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 0ddccdff119a..6e173d70d825 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -35,7 +35,7 @@
/**
* struct alarm_base - Alarm timer bases
- * @lock: Lock for syncrhonized access to the base
+ * @lock: Lock for synchronized access to the base
* @timerqueue: Timerqueue head managing the list of events
* @get_ktime: Function to read the time correlating to the base
* @get_timespec: Function to read the namespace time correlating to the base
@@ -70,12 +70,10 @@ static DEFINE_SPINLOCK(rtcdev_lock);
*/
struct rtc_device *alarmtimer_get_rtcdev(void)
{
- unsigned long flags;
struct rtc_device *ret;
- spin_lock_irqsave(&rtcdev_lock, flags);
+ guard(spinlock_irqsave)(&rtcdev_lock);
ret = rtcdev;
- spin_unlock_irqrestore(&rtcdev_lock, flags);
return ret;
}
@@ -83,7 +81,6 @@ EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
static int alarmtimer_rtc_add_device(struct device *dev)
{
- unsigned long flags;
struct rtc_device *rtc = to_rtc_device(dev);
struct platform_device *pdev;
int ret = 0;
@@ -101,25 +98,18 @@ static int alarmtimer_rtc_add_device(struct device *dev)
if (!IS_ERR(pdev))
device_init_wakeup(&pdev->dev, true);
- spin_lock_irqsave(&rtcdev_lock, flags);
- if (!IS_ERR(pdev) && !rtcdev) {
- if (!try_module_get(rtc->owner)) {
+ scoped_guard(spinlock_irqsave, &rtcdev_lock) {
+ if (!IS_ERR(pdev) && !rtcdev && try_module_get(rtc->owner)) {
+ rtcdev = rtc;
+ /* hold a reference so it doesn't go away */
+ get_device(dev);
+ pdev = NULL;
+ } else {
ret = -1;
- goto unlock;
}
-
- rtcdev = rtc;
- /* hold a reference so it doesn't go away */
- get_device(dev);
- pdev = NULL;
- } else {
- ret = -1;
}
-unlock:
- spin_unlock_irqrestore(&rtcdev_lock, flags);
platform_device_unregister(pdev);
-
return ret;
}
@@ -198,7 +188,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
struct alarm *alarm = container_of(timer, struct alarm, timer);
struct alarm_base *base = &alarm_bases[alarm->type];
- scoped_guard (spinlock_irqsave, &base->lock)
+ scoped_guard(spinlock_irqsave, &base->lock)
alarmtimer_dequeue(base, alarm);
if (alarm->function)
@@ -228,37 +218,39 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining);
static int alarmtimer_suspend(struct device *dev)
{
ktime_t min, now, expires;
- int i, ret, type;
struct rtc_device *rtc;
- unsigned long flags;
struct rtc_time tm;
+ int i, ret, type;
- spin_lock_irqsave(&freezer_delta_lock, flags);
- min = freezer_delta;
- expires = freezer_expires;
- type = freezer_alarmtype;
- freezer_delta = 0;
- spin_unlock_irqrestore(&freezer_delta_lock, flags);
+ scoped_guard(spinlock_irqsave, &freezer_delta_lock) {
+ min = freezer_delta;
+ expires = freezer_expires;
+ type = freezer_alarmtype;
+ freezer_delta = 0;
+ }
rtc = alarmtimer_get_rtcdev();
/* If we have no rtcdev, just return */
if (!rtc)
return 0;
- /* Find the soonest timer to expire*/
+ /* Find the soonest timer to expire */
for (i = 0; i < ALARM_NUMTYPE; i++) {
struct alarm_base *base = &alarm_bases[i];
struct timerqueue_node *next;
+ ktime_t next_expires;
ktime_t delta;
- spin_lock_irqsave(&base->lock, flags);
- next = timerqueue_getnext(&base->timerqueue);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock) {
+ next = timerqueue_getnext(&base->timerqueue);
+ if (next)
+ next_expires = next->expires;
+ }
if (!next)
continue;
- delta = ktime_sub(next->expires, base->get_ktime());
+ delta = ktime_sub(next_expires, base->get_ktime());
if (!min || (delta < min)) {
- expires = next->expires;
+ expires = next_expires;
min = delta;
type = i;
}
@@ -352,13 +344,12 @@ EXPORT_SYMBOL_GPL(alarm_init);
void alarm_start(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
- spin_lock_irqsave(&base->lock, flags);
- alarm->node.expires = start;
- alarmtimer_enqueue(base, alarm);
- hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock) {
+ alarm->node.expires = start;
+ alarmtimer_enqueue(base, alarm);
+ hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
+ }
trace_alarmtimer_start(alarm, base->get_ktime());
}
@@ -381,13 +372,11 @@ EXPORT_SYMBOL_GPL(alarm_start_relative);
void alarm_restart(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
- spin_lock_irqsave(&base->lock, flags);
+ guard(spinlock_irqsave)(&base->lock);
hrtimer_set_expires(&alarm->timer, alarm->node.expires);
hrtimer_restart(&alarm->timer);
alarmtimer_enqueue(base, alarm);
- spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(alarm_restart);
@@ -401,14 +390,13 @@ EXPORT_SYMBOL_GPL(alarm_restart);
int alarm_try_to_cancel(struct alarm *alarm)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- unsigned long flags;
int ret;
- spin_lock_irqsave(&base->lock, flags);
- ret = hrtimer_try_to_cancel(&alarm->timer);
- if (ret >= 0)
- alarmtimer_dequeue(base, alarm);
- spin_unlock_irqrestore(&base->lock, flags);
+ scoped_guard(spinlock_irqsave, &base->lock) {
+ ret = hrtimer_try_to_cancel(&alarm->timer);
+ if (ret >= 0)
+ alarmtimer_dequeue(base, alarm);
+ }
trace_alarmtimer_cancel(alarm, base->get_ktime());
return ret;
@@ -479,7 +467,6 @@ EXPORT_SYMBOL_GPL(alarm_forward_now);
static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
{
struct alarm_base *base;
- unsigned long flags;
ktime_t delta;
switch(type) {
@@ -498,13 +485,12 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
delta = ktime_sub(absexp, base->get_ktime());
- spin_lock_irqsave(&freezer_delta_lock, flags);
+ guard(spinlock_irqsave)(&freezer_delta_lock);
if (!freezer_delta || (delta < freezer_delta)) {
freezer_delta = delta;
freezer_expires = absexp;
freezer_alarmtype = type;
}
- spin_unlock_irqrestore(&freezer_delta_lock, flags);
}
/**
@@ -515,9 +501,9 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
{
if (clockid == CLOCK_REALTIME_ALARM)
return ALARM_REALTIME;
- if (clockid == CLOCK_BOOTTIME_ALARM)
- return ALARM_BOOTTIME;
- return -1;
+
+ WARN_ON_ONCE(clockid != CLOCK_BOOTTIME_ALARM);
+ return ALARM_BOOTTIME;
}
/**
@@ -558,7 +544,7 @@ static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
- return alarm_forward(alarm, timr->it_interval, now);
+ return alarm_forward(alarm, now, timr->it_interval);
}
/**
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index f3e831f62906..0014d163f989 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -2,7 +2,7 @@
/*
* This file contains functions which manage clock event devices.
*
- * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
*/
@@ -94,6 +94,9 @@ static int __clockevents_switch_state(struct clock_event_device *dev,
if (dev->features & CLOCK_EVT_FEAT_DUMMY)
return 0;
+ /* On state transitions clear the forced flag unconditionally */
+ dev->next_event_forced = 0;
+
/* Transition with new state-specific callbacks */
switch (state) {
case CLOCK_EVT_STATE_DETACHED:
@@ -172,6 +175,7 @@ void clockevents_shutdown(struct clock_event_device *dev)
{
clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
}
/**
@@ -292,6 +296,38 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
#endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE
+#include <asm/clock_inlined.h>
+#else
+static __always_inline void
+arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *dev) { }
+#endif
+
+static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires)
+{
+ u64 cycles;
+
+ if (unlikely(!(dev->features & CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED)))
+ return false;
+
+ if (unlikely(!ktime_expiry_to_cycles(dev->cs_id, expires, &cycles)))
+ return false;
+
+ if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE))
+ arch_inlined_clockevent_set_next_coupled(cycles, dev);
+ else
+ dev->set_next_coupled(cycles, dev);
+ return true;
+}
+
+#else
+static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires)
+{
+ return false;
+}
+#endif
+
/**
* clockevents_program_event - Reprogram the clock event device.
* @dev: device to program
@@ -300,12 +336,10 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
*
* Returns 0 on success, -ETIME when the event is in the past.
*/
-int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
- bool force)
+int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force)
{
- unsigned long long clc;
int64_t delta;
- int rc;
+ u64 cycles;
if (WARN_ON_ONCE(expires < 0))
return -ETIME;
@@ -319,21 +353,37 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n",
clockevent_get_state(dev));
- /* Shortcut for clockevent devices that can deal with ktime. */
- if (dev->features & CLOCK_EVT_FEAT_KTIME)
+ /* ktime_t based reprogramming for the broadcast hrtimer device */
+ if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER))
return dev->set_next_ktime(expires, dev);
+ if (likely(clockevent_set_next_coupled(dev, expires)))
+ return 0;
+
delta = ktime_to_ns(ktime_sub(expires, ktime_get()));
- if (delta <= 0)
- return force ? clockevents_program_min_delta(dev) : -ETIME;
- delta = min(delta, (int64_t) dev->max_delta_ns);
- delta = max(delta, (int64_t) dev->min_delta_ns);
+ /* Required for tick_periodic() during early boot */
+ if (delta <= 0 && !force)
+ return -ETIME;
- clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
- rc = dev->set_next_event((unsigned long) clc, dev);
+ if (delta > (int64_t)dev->min_delta_ns) {
+ delta = min(delta, (int64_t) dev->max_delta_ns);
+ cycles = ((u64)delta * dev->mult) >> dev->shift;
+ if (!dev->set_next_event((unsigned long) cycles, dev)) {
+ dev->next_event_forced = 0;
+ return 0;
+ }
+ }
+
+ if (dev->next_event_forced)
+ return 0;
- return (rc && force) ? clockevents_program_min_delta(dev) : rc;
+ if (dev->set_next_event(dev->min_delta_ticks, dev)) {
+ if (!force || clockevents_program_min_delta(dev))
+ return -ETIME;
+ }
+ dev->next_event_forced = 1;
+ return 0;
}
/*
@@ -633,7 +683,7 @@ void tick_offline_cpu(unsigned int cpu)
raw_spin_lock(&clockevents_lock);
tick_broadcast_offline(cpu);
- tick_shutdown(cpu);
+ tick_shutdown();
/*
* Unregister the clock event devices which were
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
index 38dae590b29f..b4cf17b4aeed 100644
--- a/kernel/time/clocksource-wdtest.c
+++ b/kernel/time/clocksource-wdtest.c
@@ -3,202 +3,196 @@
* Unit test for the clocksource watchdog.
*
* Copyright (C) 2021 Facebook, Inc.
+ * Copyright (C) 2026 Intel Corp.
*
* Author: Paul E. McKenney <paulmck@kernel.org>
+ * Author: Thomas Gleixner <tglx@kernel.org>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/device.h>
#include <linux/clocksource.h>
-#include <linux/init.h>
+#include <linux/delay.h>
#include <linux/module.h>
-#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
-#include <linux/tick.h>
#include <linux/kthread.h>
-#include <linux/delay.h>
-#include <linux/prandom.h>
-#include <linux/cpu.h>
#include "tick-internal.h"
+#include "timekeeping_internal.h"
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clocksource watchdog unit test");
MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
+MODULE_AUTHOR("Thomas Gleixner <tglx@kernel.org>");
+
+enum wdtest_states {
+ WDTEST_INJECT_NONE,
+ WDTEST_INJECT_DELAY,
+ WDTEST_INJECT_POSITIVE,
+ WDTEST_INJECT_NEGATIVE,
+ WDTEST_INJECT_PERCPU = 0x100,
+};
-static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0;
-module_param(holdoff, int, 0444);
-MODULE_PARM_DESC(holdoff, "Time to wait to start test (s).");
+static enum wdtest_states wdtest_state;
+static unsigned long wdtest_test_count;
+static ktime_t wdtest_last_ts, wdtest_offset;
-/* Watchdog kthread's task_struct pointer for debug purposes. */
-static struct task_struct *wdtest_task;
+#define SHIFT_4000PPM 8
-static u64 wdtest_jiffies_read(struct clocksource *cs)
+static ktime_t wdtest_get_offset(struct clocksource *cs)
{
- return (u64)jiffies;
-}
-
-static struct clocksource clocksource_wdtest_jiffies = {
- .name = "wdtest-jiffies",
- .rating = 1, /* lowest valid rating*/
- .uncertainty_margin = TICK_NSEC,
- .read = wdtest_jiffies_read,
- .mask = CLOCKSOURCE_MASK(32),
- .flags = CLOCK_SOURCE_MUST_VERIFY,
- .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
- .shift = JIFFIES_SHIFT,
- .max_cycles = 10,
-};
+ if (wdtest_state < WDTEST_INJECT_PERCPU)
+ return wdtest_test_count & 0x1 ? 0 : wdtest_offset >> SHIFT_4000PPM;
-static int wdtest_ktime_read_ndelays;
-static bool wdtest_ktime_read_fuzz;
+ /* Only affect the readout of the "remote" CPU */
+ return cs->wd_cpu == smp_processor_id() ? 0 : NSEC_PER_MSEC;
+}
static u64 wdtest_ktime_read(struct clocksource *cs)
{
- int wkrn = READ_ONCE(wdtest_ktime_read_ndelays);
- static int sign = 1;
- u64 ret;
+ ktime_t now = ktime_get_raw_fast_ns();
+ ktime_t intv = now - wdtest_last_ts;
- if (wkrn) {
- udelay(cs->uncertainty_margin / 250);
- WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1);
- }
- ret = ktime_get_real_fast_ns();
- if (READ_ONCE(wdtest_ktime_read_fuzz)) {
- sign = -sign;
- ret = ret + sign * 100 * NSEC_PER_MSEC;
+ /*
+ * Only increment the test counter once per watchdog interval and
+ * store the interval for the offset calculation of this step. This
+ * guarantees a consistent behaviour even if the other side needs
+ * to repeat due to a watchdog read timeout.
+ */
+ if (intv > (NSEC_PER_SEC / 4)) {
+ WRITE_ONCE(wdtest_test_count, wdtest_test_count + 1);
+ wdtest_last_ts = now;
+ wdtest_offset = intv;
}
- return ret;
-}
-static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs)
-{
- pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name);
+ switch (wdtest_state & ~WDTEST_INJECT_PERCPU) {
+ case WDTEST_INJECT_POSITIVE:
+ return now + wdtest_get_offset(cs);
+ case WDTEST_INJECT_NEGATIVE:
+ return now - wdtest_get_offset(cs);
+ case WDTEST_INJECT_DELAY:
+ udelay(500);
+ return now;
+ default:
+ return now;
+ }
}
-#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \
- CLOCK_SOURCE_VALID_FOR_HRES | \
- CLOCK_SOURCE_MUST_VERIFY | \
- CLOCK_SOURCE_VERIFY_PERCPU)
+#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \
+ CLOCK_SOURCE_CALIBRATED | \
+ CLOCK_SOURCE_MUST_VERIFY | \
+ CLOCK_SOURCE_WDTEST)
static struct clocksource clocksource_wdtest_ktime = {
.name = "wdtest-ktime",
- .rating = 300,
+ .rating = 10,
.read = wdtest_ktime_read,
.mask = CLOCKSOURCE_MASK(64),
.flags = KTIME_FLAGS,
- .mark_unstable = wdtest_ktime_cs_mark_unstable,
.list = LIST_HEAD_INIT(clocksource_wdtest_ktime.list),
};
-/* Reset the clocksource if needed. */
-static void wdtest_ktime_clocksource_reset(void)
+static void wdtest_clocksource_reset(enum wdtest_states which, bool percpu)
+{
+ clocksource_unregister(&clocksource_wdtest_ktime);
+
+ pr_info("Test: State %d percpu %d\n", which, percpu);
+
+ wdtest_state = which;
+ if (percpu)
+ wdtest_state |= WDTEST_INJECT_PERCPU;
+ wdtest_test_count = 0;
+ wdtest_last_ts = 0;
+
+ clocksource_wdtest_ktime.rating = 10;
+ clocksource_wdtest_ktime.flags = KTIME_FLAGS;
+ if (percpu)
+ clocksource_wdtest_ktime.flags |= CLOCK_SOURCE_WDTEST_PERCPU;
+ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+}
+
+static bool wdtest_execute(enum wdtest_states which, bool percpu, unsigned int expect,
+ unsigned long calls)
{
- if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) {
- clocksource_unregister(&clocksource_wdtest_ktime);
- clocksource_wdtest_ktime.flags = KTIME_FLAGS;
- schedule_timeout_uninterruptible(HZ / 10);
- clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
+ wdtest_clocksource_reset(which, percpu);
+
+ for (; READ_ONCE(wdtest_test_count) < calls; msleep(100)) {
+ unsigned int flags = READ_ONCE(clocksource_wdtest_ktime.flags);
+
+ if (kthread_should_stop())
+ return false;
+
+ if (flags & CLOCK_SOURCE_UNSTABLE) {
+ if (expect & CLOCK_SOURCE_UNSTABLE)
+ return true;
+ pr_warn("Fail: Unexpected unstable\n");
+ return false;
+ }
+ if (flags & CLOCK_SOURCE_VALID_FOR_HRES) {
+ if (expect & CLOCK_SOURCE_VALID_FOR_HRES)
+ return true;
+ pr_warn("Fail: Unexpected valid for highres\n");
+ return false;
+ }
}
+
+ if (!expect)
+ return true;
+
+ pr_warn("Fail: Timed out\n");
+ return false;
}
-/* Run the specified series of watchdog tests. */
-static int wdtest_func(void *arg)
+static bool wdtest_run(bool percpu)
{
- unsigned long j1, j2;
- int i, max_retries;
- char *s;
+ if (!wdtest_execute(WDTEST_INJECT_NONE, percpu, CLOCK_SOURCE_VALID_FOR_HRES, 8))
+ return false;
- schedule_timeout_uninterruptible(holdoff * HZ);
+ if (!wdtest_execute(WDTEST_INJECT_DELAY, percpu, 0, 4))
+ return false;
- /*
- * Verify that jiffies-like clocksources get the manually
- * specified uncertainty margin.
- */
- pr_info("--- Verify jiffies-like uncertainty margin.\n");
- __clocksource_register(&clocksource_wdtest_jiffies);
- WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC);
+ if (!wdtest_execute(WDTEST_INJECT_POSITIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8))
+ return false;
- j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
- schedule_timeout_uninterruptible(HZ);
- j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies);
- WARN_ON_ONCE(j1 == j2);
+ if (!wdtest_execute(WDTEST_INJECT_NEGATIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8))
+ return false;
- clocksource_unregister(&clocksource_wdtest_jiffies);
+ return true;
+}
- /*
- * Verify that tsc-like clocksources are assigned a reasonable
- * uncertainty margin.
- */
- pr_info("--- Verify tsc-like uncertainty margin.\n");
+static int wdtest_func(void *arg)
+{
clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000);
- WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC);
-
- j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
- udelay(1);
- j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
- pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1);
- WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC),
- "Expected at least 1000ns, got %lu.\n", j2 - j1);
-
- /* Verify tsc-like stability with various numbers of errors injected. */
- max_retries = clocksource_get_max_watchdog_retry();
- for (i = 0; i <= max_retries + 1; i++) {
- if (i <= 1 && i < max_retries)
- s = "";
- else if (i <= max_retries)
- s = ", expect message";
- else
- s = ", expect clock skew";
- pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s);
- WRITE_ONCE(wdtest_ktime_read_ndelays, i);
- schedule_timeout_uninterruptible(2 * HZ);
- WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays));
- WARN_ON_ONCE((i <= max_retries) !=
- !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
- wdtest_ktime_clocksource_reset();
+ if (wdtest_run(false)) {
+ if (wdtest_run(true))
+ pr_info("Success: All tests passed\n");
}
-
- /* Verify tsc-like stability with clock-value-fuzz error injection. */
- pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n");
- WRITE_ONCE(wdtest_ktime_read_fuzz, true);
- schedule_timeout_uninterruptible(2 * HZ);
- WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE));
- clocksource_verify_percpu(&clocksource_wdtest_ktime);
- WRITE_ONCE(wdtest_ktime_read_fuzz, false);
-
clocksource_unregister(&clocksource_wdtest_ktime);
- pr_info("--- Done with test.\n");
- return 0;
-}
+ if (!IS_MODULE(CONFIG_TEST_CLOCKSOURCE_WATCHDOG))
+ return 0;
-static void wdtest_print_module_parms(void)
-{
- pr_alert("--- holdoff=%d\n", holdoff);
+ while (!kthread_should_stop())
+ schedule_timeout_interruptible(3600 * HZ);
+ return 0;
}
-/* Cleanup function. */
-static void clocksource_wdtest_cleanup(void)
-{
-}
+static struct task_struct *wdtest_thread;
static int __init clocksource_wdtest_init(void)
{
- int ret = 0;
-
- wdtest_print_module_parms();
+ struct task_struct *t = kthread_run(wdtest_func, NULL, "wdtest");
- /* Create watchdog-test task. */
- wdtest_task = kthread_run(wdtest_func, NULL, "wdtest");
- if (IS_ERR(wdtest_task)) {
- ret = PTR_ERR(wdtest_task);
- pr_warn("%s: Failed to create wdtest kthread.\n", __func__);
- wdtest_task = NULL;
- return ret;
+ if (IS_ERR(t)) {
+ pr_warn("Failed to create wdtest kthread.\n");
+ return PTR_ERR(t);
}
-
+ wdtest_thread = t;
return 0;
}
-
module_init(clocksource_wdtest_init);
+
+static void clocksource_wdtest_cleanup(void)
+{
+ if (wdtest_thread)
+ kthread_stop(wdtest_thread);
+}
module_exit(clocksource_wdtest_cleanup);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 2a7802ec480c..baee13a1f87f 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -7,15 +7,17 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/device.h>
#include <linux/clocksource.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/device.h>
#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
-#include <linux/tick.h>
#include <linux/kthread.h>
+#include <linux/module.h>
#include <linux/prandom.h>
-#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <linux/tick.h>
+#include <linux/topology.h>
#include "tick-internal.h"
#include "timekeeping_internal.h"
@@ -107,48 +109,6 @@ static char override_name[CS_NAME_LEN];
static int finished_booting;
static u64 suspend_start;
-/*
- * Interval: 0.5sec.
- */
-#define WATCHDOG_INTERVAL (HZ >> 1)
-#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ))
-
-/*
- * Threshold: 0.0312s, when doubled: 0.0625s.
- */
-#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5)
-
-/*
- * Maximum permissible delay between two readouts of the watchdog
- * clocksource surrounding a read of the clocksource being validated.
- * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
- * a lower bound for cs->uncertainty_margin values when registering clocks.
- *
- * The default of 500 parts per million is based on NTP's limits.
- * If a clocksource is good enough for NTP, it is good enough for us!
- *
- * In other words, by default, even if a clocksource is extremely
- * precise (for example, with a sub-nanosecond period), the maximum
- * permissible skew between the clocksource watchdog and the clocksource
- * under test is not permitted to go below the 500ppm minimum defined
- * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the
- * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option.
- */
-#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
-#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
-#else
-#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
-#endif
-
-/*
- * Default for maximum permissible skew when cs->uncertainty_margin is
- * not specified, and the lower bound even when cs->uncertainty_margin
- * is specified. This is also the default that is used when registering
- * clocks with unspecifed cs->uncertainty_margin, so this macro is used
- * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels.
- */
-#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
-
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
static void clocksource_watchdog_work(struct work_struct *work);
static void clocksource_select(void);
@@ -160,7 +120,42 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
static int watchdog_running;
static atomic_t watchdog_reset_pending;
-static int64_t watchdog_max_interval;
+
+/* Watchdog interval: 0.5sec. */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_INTERVAL_NS (WATCHDOG_INTERVAL * (NSEC_PER_SEC / HZ))
+
+/* Maximum time between two reference watchdog readouts */
+#define WATCHDOG_READOUT_MAX_NS (50U * NSEC_PER_USEC)
+
+/*
+ * Maximum time between two remote readouts for NUMA=n. On NUMA enabled systems
+ * the timeout is calculated from the numa distance.
+ */
+#define WATCHDOG_DEFAULT_TIMEOUT_NS (50U * NSEC_PER_USEC)
+
+/*
+ * Remote timeout NUMA distance multiplier. The local distance is 10. The
+ * default remote distance is 20. ACPI tables provide more accurate numbers
+ * which are guaranteed to be greater than the local distance.
+ *
+ * This results in a 5us base value, which is equivalent to the above !NUMA
+ * default.
+ */
+#define WATCHDOG_NUMA_MULTIPLIER_NS ((u64)(WATCHDOG_DEFAULT_TIMEOUT_NS / LOCAL_DISTANCE))
+
+/* Limit the NUMA timeout in case the distance values are insanely big */
+#define WATCHDOG_NUMA_MAX_TIMEOUT_NS ((u64)(500U * NSEC_PER_USEC))
+
+/* Shift values to calculate the approximate $N ppm of a given delta. */
+#define SHIFT_500PPM 11
+#define SHIFT_4000PPM 8
+
+/* Number of attempts to read the watchdog */
+#define WATCHDOG_FREQ_RETRIES 3
+
+/* Five reads local and remote for inter CPU skew detection */
+#define WATCHDOG_REMOTE_MAX_SEQ 10
static inline void clocksource_watchdog_lock(unsigned long *flags)
{
@@ -241,210 +236,422 @@ void clocksource_mark_unstable(struct clocksource *cs)
spin_unlock_irqrestore(&watchdog_lock, flags);
}
-static int verify_n_cpus = 8;
-module_param(verify_n_cpus, int, 0644);
+static inline void clocksource_reset_watchdog(void)
+{
+ struct clocksource *cs;
-enum wd_read_status {
- WD_READ_SUCCESS,
- WD_READ_UNSTABLE,
- WD_READ_SKIP
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+enum wd_result {
+ WD_SUCCESS,
+ WD_FREQ_NO_WATCHDOG,
+ WD_FREQ_TIMEOUT,
+ WD_FREQ_RESET,
+ WD_FREQ_SKEWED,
+ WD_CPU_TIMEOUT,
+ WD_CPU_SKEWED,
+};
+
+struct watchdog_cpu_data {
+ /* Keep first as it is 32 byte aligned */
+ call_single_data_t csd;
+ atomic_t remote_inprogress;
+ enum wd_result result;
+ u64 cpu_ts[2];
+ struct clocksource *cs;
+ /* Ensure that the sequence is in a separate cache line */
+ atomic_t seq ____cacheline_aligned;
+ /* Set by the control CPU according to NUMA distance */
+ u64 timeout_ns;
};
-static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow)
-{
- int64_t md = 2 * watchdog->uncertainty_margin;
- unsigned int nretries, max_retries;
- int64_t wd_delay, wd_seq_delay;
- u64 wd_end, wd_end2;
-
- max_retries = clocksource_get_max_watchdog_retry();
- for (nretries = 0; nretries <= max_retries; nretries++) {
- local_irq_disable();
- *wdnow = watchdog->read(watchdog);
- *csnow = cs->read(cs);
- wd_end = watchdog->read(watchdog);
- wd_end2 = watchdog->read(watchdog);
- local_irq_enable();
-
- wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end);
- if (wd_delay <= md + cs->uncertainty_margin) {
- if (nretries > 1 && nretries >= max_retries) {
- pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n",
- smp_processor_id(), watchdog->name, nretries);
+struct watchdog_data {
+ raw_spinlock_t lock;
+ enum wd_result result;
+
+ u64 wd_seq;
+ u64 wd_delta;
+ u64 cs_delta;
+ u64 cpu_ts[2];
+
+ unsigned int curr_cpu;
+} ____cacheline_aligned_in_smp;
+
+static void watchdog_check_skew_remote(void *unused);
+
+static DEFINE_PER_CPU_ALIGNED(struct watchdog_cpu_data, watchdog_cpu_data) = {
+ .csd = CSD_INIT(watchdog_check_skew_remote, NULL),
+};
+
+static struct watchdog_data watchdog_data = {
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(watchdog_data.lock),
+};
+
+static inline void watchdog_set_result(struct watchdog_cpu_data *wd, enum wd_result result)
+{
+ guard(raw_spinlock)(&watchdog_data.lock);
+ if (!wd->result) {
+ atomic_set(&wd->seq, WATCHDOG_REMOTE_MAX_SEQ);
+ WRITE_ONCE(wd->result, result);
+ }
+}
+
+/* Wait for the sequence number to hand over control. */
+static bool watchdog_wait_seq(struct watchdog_cpu_data *wd, u64 start, int seq)
+{
+ for(int cnt = 0; atomic_read(&wd->seq) < seq; cnt++) {
+ /* Bail if the other side set an error result */
+ if (READ_ONCE(wd->result) != WD_SUCCESS)
+ return false;
+
+ /* Prevent endless loops if the other CPU does not react. */
+ if (cnt == 5000) {
+ u64 nsecs = ktime_get_raw_fast_ns();
+
+ if (nsecs - start >=wd->timeout_ns) {
+ watchdog_set_result(wd, WD_CPU_TIMEOUT);
+ return false;
}
- return WD_READ_SUCCESS;
+ cnt = 0;
}
+ cpu_relax();
+ }
+ return seq < WATCHDOG_REMOTE_MAX_SEQ;
+}
- /*
- * Now compute delay in consecutive watchdog read to see if
- * there is too much external interferences that cause
- * significant delay in reading both clocksource and watchdog.
- *
- * If consecutive WD read-back delay > md, report
- * system busy, reinit the watchdog and skip the current
- * watchdog test.
- */
- wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2);
- if (wd_seq_delay > md)
- goto skip_test;
+static void watchdog_check_skew(struct watchdog_cpu_data *wd, int index)
+{
+ u64 prev, now, delta, start = ktime_get_raw_fast_ns();
+ int local = index, remote = (index + 1) & 0x1;
+ struct clocksource *cs = wd->cs;
+
+ /* Set the local timestamp so that the first iteration works correctly */
+ wd->cpu_ts[local] = cs->read(cs);
+
+ /* Signal arrival */
+ atomic_inc(&wd->seq);
+
+ for (int seq = local + 2; seq < WATCHDOG_REMOTE_MAX_SEQ; seq += 2) {
+ if (!watchdog_wait_seq(wd, start, seq))
+ return;
+
+ /* Capture local timestamp before possible non-local coherency overhead */
+ now = cs->read(cs);
+
+ /* Store local timestamp before reading remote to limit coherency stalls */
+ wd->cpu_ts[local] = now;
+
+ prev = wd->cpu_ts[remote];
+ delta = (now - prev) & cs->mask;
+
+ if (delta > cs->max_raw_delta) {
+ watchdog_set_result(wd, WD_CPU_SKEWED);
+ return;
+ }
+
+ /* Hand over to the remote CPU */
+ atomic_inc(&wd->seq);
}
+}
- pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
- smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
- return WD_READ_UNSTABLE;
+static void watchdog_check_skew_remote(void *unused)
+{
+ struct watchdog_cpu_data *wd = this_cpu_ptr(&watchdog_cpu_data);
-skip_test:
- pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n",
- smp_processor_id(), watchdog->name, wd_seq_delay);
- pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n",
- cs->name, wd_delay);
- return WD_READ_SKIP;
+ atomic_inc(&wd->remote_inprogress);
+ watchdog_check_skew(wd, 1);
+ atomic_dec(&wd->remote_inprogress);
}
-static u64 csnow_mid;
-static cpumask_t cpus_ahead;
-static cpumask_t cpus_behind;
-static cpumask_t cpus_chosen;
+static inline bool wd_csd_locked(struct watchdog_cpu_data *wd)
+{
+ return READ_ONCE(wd->csd.node.u_flags) & CSD_FLAG_LOCK;
+}
+
+/*
+ * This is only invoked for remote CPUs. See watchdog_check_cpu_skew().
+ */
+static inline u64 wd_get_remote_timeout(unsigned int remote_cpu)
+{
+ unsigned int n1, n2;
+ u64 ns;
+
+ if (nr_node_ids == 1)
+ return WATCHDOG_DEFAULT_TIMEOUT_NS;
+
+ n1 = cpu_to_node(smp_processor_id());
+ n2 = cpu_to_node(remote_cpu);
+ ns = WATCHDOG_NUMA_MULTIPLIER_NS * node_distance(n1, n2);
+ return min(ns, WATCHDOG_NUMA_MAX_TIMEOUT_NS);
+}
-static void clocksource_verify_choose_cpus(void)
+static void __watchdog_check_cpu_skew(struct clocksource *cs, unsigned int cpu)
{
- int cpu, i, n = verify_n_cpus;
+ struct watchdog_cpu_data *wd;
- if (n < 0) {
- /* Check all of the CPUs. */
- cpumask_copy(&cpus_chosen, cpu_online_mask);
- cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+ wd = per_cpu_ptr(&watchdog_cpu_data, cpu);
+ if (atomic_read(&wd->remote_inprogress) || wd_csd_locked(wd)) {
+ watchdog_data.result = WD_CPU_TIMEOUT;
return;
}
- /* If no checking desired, or no other CPU to check, leave. */
- cpumask_clear(&cpus_chosen);
- if (n == 0 || num_online_cpus() <= 1)
+ atomic_set(&wd->seq, 0);
+ wd->result = WD_SUCCESS;
+ wd->cs = cs;
+ /* Store the current CPU ID for the watchdog test unit */
+ cs->wd_cpu = smp_processor_id();
+
+ wd->timeout_ns = wd_get_remote_timeout(cpu);
+
+ /* Kick the remote CPU into the watchdog function */
+ if (WARN_ON_ONCE(smp_call_function_single_async(cpu, &wd->csd))) {
+ watchdog_data.result = WD_CPU_TIMEOUT;
return;
+ }
+
+ scoped_guard(irq)
+ watchdog_check_skew(wd, 0);
+
+ scoped_guard(raw_spinlock_irq, &watchdog_data.lock) {
+ watchdog_data.result = wd->result;
+ memcpy(watchdog_data.cpu_ts, wd->cpu_ts, sizeof(wd->cpu_ts));
+ }
+}
+
+static void watchdog_check_cpu_skew(struct clocksource *cs)
+{
+ unsigned int cpu = watchdog_data.curr_cpu;
+
+ cpu = cpumask_next_wrap(cpu, cpu_online_mask);
+ watchdog_data.curr_cpu = cpu;
- /* Make sure to select at least one CPU other than the current CPU. */
- cpu = cpumask_first(cpu_online_mask);
+ /* Skip the current CPU. Handles num_online_cpus() == 1 as well */
if (cpu == smp_processor_id())
- cpu = cpumask_next(cpu, cpu_online_mask);
- if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
return;
- cpumask_set_cpu(cpu, &cpus_chosen);
- /* Force a sane value for the boot parameter. */
- if (n > nr_cpu_ids)
- n = nr_cpu_ids;
+ /* Don't interfere with the test mechanics */
+ if ((cs->flags & CLOCK_SOURCE_WDTEST) && !(cs->flags & CLOCK_SOURCE_WDTEST_PERCPU))
+ return;
+
+ __watchdog_check_cpu_skew(cs, cpu);
+}
+
+static bool watchdog_check_freq(struct clocksource *cs, bool reset_pending)
+{
+ unsigned int ppm_shift = SHIFT_4000PPM;
+ u64 wd_ts0, wd_ts1, cs_ts;
+
+ watchdog_data.result = WD_SUCCESS;
+ if (!watchdog) {
+ watchdog_data.result = WD_FREQ_NO_WATCHDOG;
+ return false;
+ }
+
+ if (cs->flags & CLOCK_SOURCE_WDTEST_PERCPU)
+ return true;
/*
- * Randomly select the specified number of CPUs. If the same
- * CPU is selected multiple times, that CPU is checked only once,
- * and no replacement CPU is selected. This gracefully handles
- * situations where verify_n_cpus is greater than the number of
- * CPUs that are currently online.
+ * If both the clocksource and the watchdog claim they are
+ * calibrated use 500ppm limit. Uncalibrated clocksources need a
+ * larger allowance because the firmware supplied frequencies can be
+ * way off.
*/
- for (i = 1; i < n; i++) {
- cpu = get_random_u32_below(nr_cpu_ids);
- cpu = cpumask_next(cpu - 1, cpu_online_mask);
- if (cpu >= nr_cpu_ids)
- cpu = cpumask_first(cpu_online_mask);
- if (!WARN_ON_ONCE(cpu >= nr_cpu_ids))
- cpumask_set_cpu(cpu, &cpus_chosen);
+ if (watchdog->flags & CLOCK_SOURCE_CALIBRATED && cs->flags & CLOCK_SOURCE_CALIBRATED)
+ ppm_shift = SHIFT_500PPM;
+
+ for (int retries = 0; retries < WATCHDOG_FREQ_RETRIES; retries++) {
+ s64 wd_last, cs_last, wd_seq, wd_delta, cs_delta, max_delta;
+
+ scoped_guard(irq) {
+ wd_ts0 = watchdog->read(watchdog);
+ cs_ts = cs->read(cs);
+ wd_ts1 = watchdog->read(watchdog);
+ }
+
+ wd_last = cs->wd_last;
+ cs_last = cs->cs_last;
+
+ /* Validate the watchdog readout window */
+ wd_seq = cycles_to_nsec_safe(watchdog, wd_ts0, wd_ts1);
+ if (wd_seq > WATCHDOG_READOUT_MAX_NS) {
+ /* Store for printout in case all retries fail */
+ watchdog_data.wd_seq = wd_seq;
+ continue;
+ }
+
+ /* Store for subsequent processing */
+ cs->wd_last = wd_ts0;
+ cs->cs_last = cs_ts;
+
+ /* First round or reset pending? */
+ if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || reset_pending)
+ goto reset;
+
+ /* Calculate the nanosecond deltas from the last invocation */
+ wd_delta = cycles_to_nsec_safe(watchdog, wd_last, wd_ts0);
+ cs_delta = cycles_to_nsec_safe(cs, cs_last, cs_ts);
+
+ watchdog_data.wd_delta = wd_delta;
+ watchdog_data.cs_delta = cs_delta;
+
+ /*
+ * Ensure that the deltas are within the readout limits of
+ * the clocksource and the watchdog. Long delays can cause
+ * clocksources to overflow.
+ */
+ max_delta = max(wd_delta, cs_delta);
+ if (max_delta > cs->max_idle_ns || max_delta > watchdog->max_idle_ns)
+ goto reset;
+
+ /*
+ * Calculate and validate the skew against the allowed PPM
+ * value of the maximum delta plus the watchdog readout
+ * time.
+ */
+ if (abs(wd_delta - cs_delta) < (max_delta >> ppm_shift) + wd_seq)
+ return true;
+
+ watchdog_data.result = WD_FREQ_SKEWED;
+ return false;
}
- /* Don't verify ourselves. */
- cpumask_clear_cpu(smp_processor_id(), &cpus_chosen);
+ watchdog_data.result = WD_FREQ_TIMEOUT;
+ return false;
+
+reset:
+ cs->flags |= CLOCK_SOURCE_WATCHDOG;
+ watchdog_data.result = WD_FREQ_RESET;
+ return false;
}
-static void clocksource_verify_one_cpu(void *csin)
+/* Synchronization for sched clock */
+static void clocksource_tick_stable(struct clocksource *cs)
{
- struct clocksource *cs = (struct clocksource *)csin;
-
- csnow_mid = cs->read(cs);
+ if (cs == curr_clocksource && cs->tick_stable)
+ cs->tick_stable(cs);
}
-void clocksource_verify_percpu(struct clocksource *cs)
+/* Conditionally enable high resolution mode */
+static void clocksource_enable_highres(struct clocksource *cs)
{
- int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX;
- u64 csnow_begin, csnow_end;
- int cpu, testcpu;
- s64 delta;
+ if ((cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) ||
+ !(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) ||
+ !watchdog || !(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS))
+ return;
+
+ /* Mark it valid for high-res. */
+ cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
- if (verify_n_cpus == 0)
+ /*
+ * Can't schedule work before finished_booting is
+ * true. clocksource_done_booting will take care of it.
+ */
+ if (!finished_booting)
return;
- cpumask_clear(&cpus_ahead);
- cpumask_clear(&cpus_behind);
- cpus_read_lock();
- migrate_disable();
- clocksource_verify_choose_cpus();
- if (cpumask_empty(&cpus_chosen)) {
- migrate_enable();
- cpus_read_unlock();
- pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
+
+ if (cs->flags & CLOCK_SOURCE_WDTEST)
return;
+
+ /*
+ * If this is not the current clocksource let the watchdog thread
+ * reselect it. Due to the change to high res this clocksource
+ * might be preferred now. If it is the current clocksource let the
+ * tick code know about that change.
+ */
+ if (cs != curr_clocksource) {
+ cs->flags |= CLOCK_SOURCE_RESELECT;
+ schedule_work(&watchdog_work);
+ } else {
+ tick_clock_notify();
}
- testcpu = smp_processor_id();
- pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n",
- cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
- preempt_disable();
- for_each_cpu(cpu, &cpus_chosen) {
- if (cpu == testcpu)
- continue;
- csnow_begin = cs->read(cs);
- smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1);
- csnow_end = cs->read(cs);
- delta = (s64)((csnow_mid - csnow_begin) & cs->mask);
- if (delta < 0)
- cpumask_set_cpu(cpu, &cpus_behind);
- delta = (csnow_end - csnow_mid) & cs->mask;
- if (delta < 0)
- cpumask_set_cpu(cpu, &cpus_ahead);
- cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end);
- if (cs_nsec > cs_nsec_max)
- cs_nsec_max = cs_nsec;
- if (cs_nsec < cs_nsec_min)
- cs_nsec_min = cs_nsec;
+}
+
+static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 2);
+
+static void watchdog_print_freq_timeout(struct clocksource *cs)
+{
+ if (!__ratelimit(&ratelimit_state))
+ return;
+ pr_info("Watchdog %s read timed out. Readout sequence took: %lluns\n",
+ watchdog->name, watchdog_data.wd_seq);
+}
+
+static void watchdog_print_freq_skew(struct clocksource *cs)
+{
+ pr_warn("Marking clocksource %s unstable due to frequency skew\n", cs->name);
+ pr_warn("Watchdog %20s interval: %16lluns\n", watchdog->name, watchdog_data.wd_delta);
+ pr_warn("Clocksource %20s interval: %16lluns\n", cs->name, watchdog_data.cs_delta);
+}
+
+static void watchdog_handle_remote_timeout(struct clocksource *cs)
+{
+ pr_info_once("Watchdog remote CPU %u read timed out\n", watchdog_data.curr_cpu);
+}
+
+static void watchdog_print_remote_skew(struct clocksource *cs)
+{
+ pr_warn("Marking clocksource %s unstable due to inter CPU skew\n", cs->name);
+ if (watchdog_data.cpu_ts[0] < watchdog_data.cpu_ts[1]) {
+ pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", smp_processor_id(),
+ watchdog_data.cpu_ts[0], watchdog_data.curr_cpu, watchdog_data.cpu_ts[1]);
+ } else {
+ pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", watchdog_data.curr_cpu,
+ watchdog_data.cpu_ts[1], smp_processor_id(), watchdog_data.cpu_ts[0]);
}
- preempt_enable();
- migrate_enable();
- cpus_read_unlock();
- if (!cpumask_empty(&cpus_ahead))
- pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
- cpumask_pr_args(&cpus_ahead), testcpu, cs->name);
- if (!cpumask_empty(&cpus_behind))
- pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n",
- cpumask_pr_args(&cpus_behind), testcpu, cs->name);
- if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind))
- pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n",
- testcpu, cs_nsec_min, cs_nsec_max, cs->name);
-}
-EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+}
-static inline void clocksource_reset_watchdog(void)
+static void watchdog_check_result(struct clocksource *cs)
{
- struct clocksource *cs;
+ switch (watchdog_data.result) {
+ case WD_SUCCESS:
+ clocksource_tick_stable(cs);
+ clocksource_enable_highres(cs);
+ return;
- list_for_each_entry(cs, &watchdog_list, wd_list)
+ case WD_FREQ_TIMEOUT:
+ watchdog_print_freq_timeout(cs);
+ /* Try again later and invalidate the reference timestamps. */
cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
+ return;
+ case WD_FREQ_NO_WATCHDOG:
+ case WD_FREQ_RESET:
+ /*
+ * Nothing to do when the reference timestamps were reset
+ * or no watchdog clocksource registered.
+ */
+ return;
+
+ case WD_FREQ_SKEWED:
+ watchdog_print_freq_skew(cs);
+ break;
+
+ case WD_CPU_TIMEOUT:
+ /* Remote check timed out. Try again next cycle. */
+ watchdog_handle_remote_timeout(cs);
+ return;
+
+ case WD_CPU_SKEWED:
+ watchdog_print_remote_skew(cs);
+ break;
+ }
+ __clocksource_unstable(cs);
+}
static void clocksource_watchdog(struct timer_list *unused)
{
- int64_t wd_nsec, cs_nsec, interval;
- u64 csnow, wdnow, cslast, wdlast;
- int next_cpu, reset_pending;
struct clocksource *cs;
- enum wd_read_status read_ret;
- unsigned long extra_wait = 0;
- u32 md;
+ bool reset_pending;
- spin_lock(&watchdog_lock);
+ guard(spinlock)(&watchdog_lock);
if (!watchdog_running)
- goto out;
+ return;
reset_pending = atomic_read(&watchdog_reset_pending);
list_for_each_entry(cs, &watchdog_list, wd_list) {
-
/* Clocksource already marked unstable? */
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
if (finished_booting)
@@ -452,174 +659,42 @@ static void clocksource_watchdog(struct timer_list *unused)
continue;
}
- read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
-
- if (read_ret == WD_READ_UNSTABLE) {
- /* Clock readout unreliable, so give it up. */
- __clocksource_unstable(cs);
- continue;
- }
-
- /*
- * When WD_READ_SKIP is returned, it means the system is likely
- * under very heavy load, where the latency of reading
- * watchdog/clocksource is very big, and affect the accuracy of
- * watchdog check. So give system some space and suspend the
- * watchdog check for 5 minutes.
- */
- if (read_ret == WD_READ_SKIP) {
- /*
- * As the watchdog timer will be suspended, and
- * cs->last could keep unchanged for 5 minutes, reset
- * the counters.
- */
- clocksource_reset_watchdog();
- extra_wait = HZ * 300;
- break;
- }
-
- /* Clocksource initialized ? */
- if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
- atomic_read(&watchdog_reset_pending)) {
- cs->flags |= CLOCK_SOURCE_WATCHDOG;
- cs->wd_last = wdnow;
- cs->cs_last = csnow;
- continue;
+ /* Compare against watchdog clocksource if available */
+ if (watchdog_check_freq(cs, reset_pending)) {
+ /* Check for inter CPU skew */
+ watchdog_check_cpu_skew(cs);
}
- wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow);
- cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow);
- wdlast = cs->wd_last; /* save these in case we print them */
- cslast = cs->cs_last;
- cs->cs_last = csnow;
- cs->wd_last = wdnow;
-
- if (atomic_read(&watchdog_reset_pending))
- continue;
-
- /*
- * The processing of timer softirqs can get delayed (usually
- * on account of ksoftirqd not getting to run in a timely
- * manner), which causes the watchdog interval to stretch.
- * Skew detection may fail for longer watchdog intervals
- * on account of fixed margins being used.
- * Some clocksources, e.g. acpi_pm, cannot tolerate
- * watchdog intervals longer than a few seconds.
- */
- interval = max(cs_nsec, wd_nsec);
- if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) {
- if (system_state > SYSTEM_SCHEDULING &&
- interval > 2 * watchdog_max_interval) {
- watchdog_max_interval = interval;
- pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n",
- cs_nsec, wd_nsec);
- }
- watchdog_timer.expires = jiffies;
- continue;
- }
-
- /* Check the deviation from the watchdog clocksource. */
- md = cs->uncertainty_margin + watchdog->uncertainty_margin;
- if (abs(cs_nsec - wd_nsec) > md) {
- s64 cs_wd_msec;
- s64 wd_msec;
- u32 wd_rem;
-
- pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
- smp_processor_id(), cs->name);
- pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
- watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
- pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
- cs->name, cs_nsec, csnow, cslast, cs->mask);
- cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem);
- wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem);
- pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
- cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
- if (curr_clocksource == cs)
- pr_warn(" '%s' is current clocksource.\n", cs->name);
- else if (curr_clocksource)
- pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name);
- else
- pr_warn(" No current clocksource.\n");
- __clocksource_unstable(cs);
- continue;
- }
-
- if (cs == curr_clocksource && cs->tick_stable)
- cs->tick_stable(cs);
-
- if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
- (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
- (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
- /* Mark it valid for high-res. */
- cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
-
- /*
- * clocksource_done_booting() will sort it if
- * finished_booting is not set yet.
- */
- if (!finished_booting)
- continue;
-
- /*
- * If this is not the current clocksource let
- * the watchdog thread reselect it. Due to the
- * change to high res this clocksource might
- * be preferred now. If it is the current
- * clocksource let the tick code know about
- * that change.
- */
- if (cs != curr_clocksource) {
- cs->flags |= CLOCK_SOURCE_RESELECT;
- schedule_work(&watchdog_work);
- } else {
- tick_clock_notify();
- }
- }
+ watchdog_check_result(cs);
}
- /*
- * We only clear the watchdog_reset_pending, when we did a
- * full cycle through all clocksources.
- */
+ /* Clear after the full clocksource walk */
if (reset_pending)
atomic_dec(&watchdog_reset_pending);
- /*
- * Cycle through CPUs to check if the CPUs stay synchronized
- * to each other.
- */
- next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
- if (next_cpu >= nr_cpu_ids)
- next_cpu = cpumask_first(cpu_online_mask);
-
- /*
- * Arm timer if not already pending: could race with concurrent
- * pair clocksource_stop_watchdog() clocksource_start_watchdog().
- */
+ /* Could have been rearmed by a stop/start cycle */
if (!timer_pending(&watchdog_timer)) {
- watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
- add_timer_on(&watchdog_timer, next_cpu);
+ watchdog_timer.expires += WATCHDOG_INTERVAL;
+ add_timer_local(&watchdog_timer);
}
-out:
- spin_unlock(&watchdog_lock);
}
static inline void clocksource_start_watchdog(void)
{
- if (watchdog_running || !watchdog || list_empty(&watchdog_list))
+ if (watchdog_running || list_empty(&watchdog_list))
return;
- timer_setup(&watchdog_timer, clocksource_watchdog, 0);
+ timer_setup(&watchdog_timer, clocksource_watchdog, TIMER_PINNED);
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
- add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
+
+ add_timer_on(&watchdog_timer, get_boot_cpu_id());
watchdog_running = 1;
}
static inline void clocksource_stop_watchdog(void)
{
- if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
+ if (!watchdog_running || !list_empty(&watchdog_list))
return;
- del_timer(&watchdog_timer);
+ timer_delete(&watchdog_timer);
watchdog_running = 0;
}
@@ -659,6 +734,13 @@ static void clocksource_select_watchdog(bool fallback)
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
continue;
+ /*
+ * If it's not continuous, don't put the fox in charge of
+ * the henhouse.
+ */
+ if (!(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS))
+ continue;
+
/* Skip current if we were requested for a fallback. */
if (fallback && cs == old_wd)
continue;
@@ -698,12 +780,6 @@ static int __clocksource_watchdog_kthread(void)
unsigned long flags;
int select = 0;
- /* Do any required per-CPU skew verification. */
- if (curr_clocksource &&
- curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE &&
- curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU)
- clocksource_verify_percpu(curr_clocksource);
-
spin_lock_irqsave(&watchdog_lock, flags);
list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) {
if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
@@ -1024,6 +1100,8 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur)
continue;
if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES))
continue;
+ if (cs->flags & CLOCK_SOURCE_WDTEST)
+ continue;
return cs;
}
return NULL;
@@ -1048,6 +1126,8 @@ static void __clocksource_select(bool skipcur)
continue;
if (strcmp(cs->name, override_name) != 0)
continue;
+ if (cs->flags & CLOCK_SOURCE_WDTEST)
+ continue;
/*
* Check to make sure we don't switch to a non-highres
* capable clocksource if the tick code is in oneshot
@@ -1177,31 +1257,10 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
NSEC_PER_SEC / scale, sec * scale);
- }
- /*
- * If the uncertainty margin is not specified, calculate it. If
- * both scale and freq are non-zero, calculate the clock period, but
- * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default.
- * However, if either of scale or freq is zero, be very conservative
- * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value
- * for the uncertainty margin. Allow stupidly small uncertainty
- * margins to be specified by the caller for testing purposes,
- * but warn to discourage production use of this capability.
- *
- * Bottom line: The sum of the uncertainty margins of the
- * watchdog clocksource and the clocksource under test will be at
- * least 500ppm by default. For more information, please see the
- * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above.
- */
- if (scale && freq && !cs->uncertainty_margin) {
- cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq);
- if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW)
- cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW;
- } else if (!cs->uncertainty_margin) {
- cs->uncertainty_margin = WATCHDOG_THRESHOLD;
+ /* Update cs::freq_khz */
+ cs->freq_khz = div_u64((u64)freq * scale, 1000);
}
- WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW);
/*
* Ensure clocksources that have large 'mult' values don't overflow
@@ -1249,6 +1308,10 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX))
cs->id = CSID_GENERIC;
+
+ if (WARN_ON_ONCE(!freq && cs->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT))
+ cs->flags &= ~CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT;
+
if (cs->vdso_clock_mode < 0 ||
cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) {
pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
@@ -1510,7 +1573,7 @@ static int __init boot_override_clocksource(char* str)
{
mutex_lock(&clocksource_mutex);
if (str)
- strscpy(override_name, str, sizeof(override_name));
+ strscpy(override_name, str);
mutex_unlock(&clocksource_mutex);
return 1;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index deb1aa32814e..5bd6efe598f0 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
*
@@ -50,6 +50,36 @@
#include "tick-internal.h"
/*
+ * Constants to set the queued state of the timer (INACTIVE, ENQUEUED)
+ *
+ * The callback state is kept separate in the CPU base because having it in
+ * the timer would require touching the timer after the callback, which
+ * makes it impossible to free the timer from the callback function.
+ *
+ * Therefore we track the callback state in:
+ *
+ * timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
+ * queued a signal. Between dropping the lock which protects the posix timer
+ * and reacquiring the base lock of the hrtimer, another CPU can deliver the
+ * signal and rearm the timer.
+ *
+ * All state transitions are protected by cpu_base->lock.
+ */
+#define HRTIMER_STATE_INACTIVE false
+#define HRTIMER_STATE_ENQUEUED true
+
+/*
+ * The resolution of the clocks. The resolution value is returned in
+ * the clock_getres() system call to give application programmers an
+ * idea of the (in)accuracy of timers. Timer values are rounded up to
+ * this resolution value.
+ */
+#define HIGH_RES_NSEC 1
+
+/*
* Masks for selecting the soft and hard context timers from
* cpu_base->active
*/
@@ -59,6 +89,7 @@
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
static void retrigger_next_event(void *arg);
+static ktime_t __hrtimer_cb_get_time(clockid_t clock_id);
/*
* The timer bases:
@@ -68,65 +99,26 @@ static void retrigger_next_event(void *arg);
* to reach a base using a clockid, hrtimer_clockid_to_base()
* is used to convert from clockid to the proper hrtimer_base_type.
*/
+
+#define BASE_INIT(idx, cid) \
+ [idx] = { .index = idx, .clockid = cid }
+
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
- .clock_base =
- {
- {
- .index = HRTIMER_BASE_MONOTONIC,
- .clockid = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
- },
- {
- .index = HRTIMER_BASE_REALTIME,
- .clockid = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
- },
- {
- .index = HRTIMER_BASE_BOOTTIME,
- .clockid = CLOCK_BOOTTIME,
- .get_time = &ktime_get_boottime,
- },
- {
- .index = HRTIMER_BASE_TAI,
- .clockid = CLOCK_TAI,
- .get_time = &ktime_get_clocktai,
- },
- {
- .index = HRTIMER_BASE_MONOTONIC_SOFT,
- .clockid = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
- },
- {
- .index = HRTIMER_BASE_REALTIME_SOFT,
- .clockid = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
- },
- {
- .index = HRTIMER_BASE_BOOTTIME_SOFT,
- .clockid = CLOCK_BOOTTIME,
- .get_time = &ktime_get_boottime,
- },
- {
- .index = HRTIMER_BASE_TAI_SOFT,
- .clockid = CLOCK_TAI,
- .get_time = &ktime_get_clocktai,
- },
+ .clock_base = {
+ BASE_INIT(HRTIMER_BASE_MONOTONIC, CLOCK_MONOTONIC),
+ BASE_INIT(HRTIMER_BASE_REALTIME, CLOCK_REALTIME),
+ BASE_INIT(HRTIMER_BASE_BOOTTIME, CLOCK_BOOTTIME),
+ BASE_INIT(HRTIMER_BASE_TAI, CLOCK_TAI),
+ BASE_INIT(HRTIMER_BASE_MONOTONIC_SOFT, CLOCK_MONOTONIC),
+ BASE_INIT(HRTIMER_BASE_REALTIME_SOFT, CLOCK_REALTIME),
+ BASE_INIT(HRTIMER_BASE_BOOTTIME_SOFT, CLOCK_BOOTTIME),
+ BASE_INIT(HRTIMER_BASE_TAI_SOFT, CLOCK_TAI),
},
.csd = CSD_INIT(retrigger_next_event, NULL)
};
-static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
- /* Make sure we catch unsupported clockids */
- [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
-
- [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
- [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
- [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
- [CLOCK_TAI] = HRTIMER_BASE_TAI,
-};
-
static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
{
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
@@ -135,23 +127,43 @@ static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
return likely(base->online);
}
+#ifdef CONFIG_HIGH_RES_TIMERS
+DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key);
+
+static void hrtimer_hres_workfn(struct work_struct *work)
+{
+ static_branch_enable(&hrtimer_highres_enabled_key);
+}
+
+static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn);
+
+static inline void hrtimer_schedule_hres_work(void)
+{
+ if (!hrtimer_highres_enabled())
+ schedule_work(&hrtimer_hres_work);
+}
+#else
+static inline void hrtimer_schedule_hres_work(void) { }
+#endif
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
*/
#ifdef CONFIG_SMP
-
/*
* We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
* such that hrtimer_callback_running() can unconditionally dereference
* timer->base->cpu_base
*/
static struct hrtimer_cpu_base migration_cpu_base = {
- .clock_base = { {
- .cpu_base = &migration_cpu_base,
- .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
- &migration_cpu_base.lock),
- }, },
+ .clock_base = {
+ [0] = {
+ .cpu_base = &migration_cpu_base,
+ .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
+ &migration_cpu_base.lock),
+ },
+ },
};
#define migration_base migration_cpu_base.clock_base[0]
@@ -168,15 +180,13 @@ static struct hrtimer_cpu_base migration_cpu_base = {
* possible to set timer->base = &migration_base and drop the lock: the timer
* remains locked.
*/
-static
-struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
- unsigned long *flags)
+static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+ unsigned long *flags)
__acquires(&timer->base->lock)
{
- struct hrtimer_clock_base *base;
-
for (;;) {
- base = READ_ONCE(timer->base);
+ struct hrtimer_clock_base *base = READ_ONCE(timer->base);
+
if (likely(base != &migration_base)) {
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
if (likely(base == timer->base))
@@ -218,7 +228,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_
/*
* The offline local CPU can't be the default target if the
* next remote target event is after this timer. Keep the
- * elected new base. An IPI will we issued to reprogram
+ * elected new base. An IPI will be issued to reprogram
* it as a last resort.
*/
if (!hrtimer_base_is_online(this_cpu_base))
@@ -229,7 +239,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_
return expires >= new_base->cpu_base->expires_next;
}
-static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, bool pinned)
{
if (!hrtimer_base_is_online(base)) {
int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
@@ -257,8 +267,7 @@ static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *
* the timer callback is currently running.
*/
static inline struct hrtimer_clock_base *
-switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
- int pinned)
+switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, bool pinned)
{
struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
struct hrtimer_clock_base *new_base;
@@ -271,13 +280,12 @@ again:
if (base != new_base) {
/*
- * We are trying to move timer to new_base.
- * However we can't change timer's base while it is running,
- * so we keep it on the same CPU. No hassle vs. reprogramming
- * the event source in the high resolution case. The softirq
- * code will take care of this when the timer function has
- * completed. There is no conflict as we hold the lock until
- * the timer is enqueued.
+ * We are trying to move timer to new_base. However we can't
+ * change timer's base while it is running, so we keep it on
+ * the same CPU. No hassle vs. reprogramming the event source
+ * in the high resolution case. The remote CPU will take care
+ * of this when the timer function has completed. There is no
+ * conflict as we hold the lock until the timer is enqueued.
*/
if (unlikely(hrtimer_callback_running(timer)))
return base;
@@ -287,8 +295,7 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
- this_cpu_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -307,14 +314,13 @@ again:
#else /* CONFIG_SMP */
-static inline struct hrtimer_clock_base *
-lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+static inline struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
+ unsigned long *flags)
__acquires(&timer->base->cpu_base->lock)
{
struct hrtimer_clock_base *base = timer->base;
raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
-
return base;
}
@@ -349,7 +355,7 @@ s64 __ktime_divns(const ktime_t kt, s64 div)
return dclc < 0 ? -tmp : tmp;
}
EXPORT_SYMBOL_GPL(__ktime_divns);
-#endif /* BITS_PER_LONG >= 64 */
+#endif /* BITS_PER_LONG < 64 */
/*
* Add two ktime values and do a safety check for overflow:
@@ -376,7 +382,7 @@ static const struct debug_obj_descr hrtimer_debug_descr;
static void *hrtimer_debug_hint(void *addr)
{
- return ((struct hrtimer *) addr)->function;
+ return ACCESS_PRIVATE((struct hrtimer *)addr, function);
}
/*
@@ -431,12 +437,37 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
}
}
+/* Stub timer callback for improperly used timers. */
+static enum hrtimer_restart stub_timer(struct hrtimer *unused)
+{
+ WARN_ON_ONCE(1);
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * hrtimer_fixup_assert_init is called when:
+ * - an untracked/uninit-ed object is found
+ */
+static bool hrtimer_fixup_assert_init(void *addr, enum debug_obj_state state)
+{
+ struct hrtimer *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_NOTAVAILABLE:
+ hrtimer_setup(timer, stub_timer, CLOCK_MONOTONIC, 0);
+ return true;
+ default:
+ return false;
+ }
+}
+
static const struct debug_obj_descr hrtimer_debug_descr = {
- .name = "hrtimer",
- .debug_hint = hrtimer_debug_hint,
- .fixup_init = hrtimer_fixup_init,
- .fixup_activate = hrtimer_fixup_activate,
- .fixup_free = hrtimer_fixup_free,
+ .name = "hrtimer",
+ .debug_hint = hrtimer_debug_hint,
+ .fixup_init = hrtimer_fixup_init,
+ .fixup_activate = hrtimer_fixup_activate,
+ .fixup_free = hrtimer_fixup_free,
+ .fixup_assert_init = hrtimer_fixup_assert_init,
};
static inline void debug_hrtimer_init(struct hrtimer *timer)
@@ -449,8 +480,7 @@ static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer)
debug_object_init_on_stack(timer, &hrtimer_debug_descr);
}
-static inline void debug_hrtimer_activate(struct hrtimer *timer,
- enum hrtimer_mode mode)
+static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode)
{
debug_object_activate(timer, &hrtimer_debug_descr);
}
@@ -460,6 +490,11 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
debug_object_deactivate(timer, &hrtimer_debug_descr);
}
+static inline void debug_hrtimer_assert_init(struct hrtimer *timer)
+{
+ debug_object_assert_init(timer, &hrtimer_debug_descr);
+}
+
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
debug_object_free(timer, &hrtimer_debug_descr);
@@ -470,100 +505,98 @@ EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { }
-static inline void debug_hrtimer_activate(struct hrtimer *timer,
- enum hrtimer_mode mode) { }
+static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
+static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { }
#endif
-static inline void
-debug_init(struct hrtimer *timer, clockid_t clockid,
- enum hrtimer_mode mode)
+static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode)
{
debug_hrtimer_init(timer);
- trace_hrtimer_init(timer, clockid, mode);
+ trace_hrtimer_setup(timer, clockid, mode);
}
-static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid,
- enum hrtimer_mode mode)
+static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid,
+ enum hrtimer_mode mode)
{
debug_hrtimer_init_on_stack(timer);
- trace_hrtimer_init(timer, clockid, mode);
+ trace_hrtimer_setup(timer, clockid, mode);
}
-static inline void debug_activate(struct hrtimer *timer,
- enum hrtimer_mode mode)
+static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed)
{
debug_hrtimer_activate(timer, mode);
- trace_hrtimer_start(timer, mode);
+ trace_hrtimer_start(timer, mode, was_armed);
}
-static inline void debug_deactivate(struct hrtimer *timer)
-{
- debug_hrtimer_deactivate(timer);
- trace_hrtimer_cancel(timer);
-}
+#define for_each_active_base(base, cpu_base, active) \
+ for (unsigned int idx = ffs(active); idx--; idx = ffs((active))) \
+ for (bool done = false; !done; active &= ~(1U << idx)) \
+ for (base = &cpu_base->clock_base[idx]; !done; done = true)
+
+#define hrtimer_from_timerqueue_node(_n) container_of_const(_n, struct hrtimer, node)
-static struct hrtimer_clock_base *
-__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
+#if defined(CONFIG_NO_HZ_COMMON)
+/*
+ * Same as hrtimer_bases_next_event() below, but skips the excluded timer and
+ * does not update cpu_base->next_timer/expires.
+ */
+static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_base,
+ const struct hrtimer *exclude,
+ unsigned int active, ktime_t expires_next)
{
- unsigned int idx;
+ struct hrtimer_clock_base *base;
+ ktime_t expires;
- if (!*active)
- return NULL;
+ lockdep_assert_held(&cpu_base->lock);
- idx = __ffs(*active);
- *active &= ~(1U << idx);
+ for_each_active_base(base, cpu_base, active) {
+ expires = ktime_sub(base->expires_next, base->offset);
+ if (expires >= expires_next)
+ continue;
- return &cpu_base->clock_base[idx];
+ /*
+ * If the excluded timer is the first on this base, evaluate the
+ * next timer.
+ */
+ struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active);
+
+ if (unlikely(&exclude->node == node)) {
+ node = timerqueue_linked_next(node);
+ if (!node)
+ continue;
+ expires = ktime_sub(node->expires, base->offset);
+ if (expires >= expires_next)
+ continue;
+ }
+ expires_next = expires;
+ }
+ /* If base->offset changed, the result might be negative */
+ return max(expires_next, 0);
}
+#endif
-#define for_each_active_base(base, cpu_base, active) \
- while ((base = __next_base((cpu_base), &(active))))
+static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base)
+{
+ struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
-static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
- const struct hrtimer *exclude,
- unsigned int active,
- ktime_t expires_next)
+ return hrtimer_from_timerqueue_node(next);
+}
+
+/* Find the base with the earliest expiry */
+static void hrtimer_bases_first(struct hrtimer_cpu_base *cpu_base,unsigned int active,
+ ktime_t *expires_next, struct hrtimer **next_timer)
{
struct hrtimer_clock_base *base;
ktime_t expires;
for_each_active_base(base, cpu_base, active) {
- struct timerqueue_node *next;
- struct hrtimer *timer;
-
- next = timerqueue_getnext(&base->active);
- timer = container_of(next, struct hrtimer, node);
- if (timer == exclude) {
- /* Get to the next timer in the queue. */
- next = timerqueue_iterate_next(next);
- if (!next)
- continue;
-
- timer = container_of(next, struct hrtimer, node);
- }
- expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
- if (expires < expires_next) {
- expires_next = expires;
-
- /* Skip cpu_base update if a timer is being excluded. */
- if (exclude)
- continue;
-
- if (timer->is_soft)
- cpu_base->softirq_next_timer = timer;
- else
- cpu_base->next_timer = timer;
+ expires = ktime_sub(base->expires_next, base->offset);
+ if (expires < *expires_next) {
+ *expires_next = expires;
+ *next_timer = clock_base_next_timer(base);
}
}
- /*
- * clock_was_set() might have changed base->offset of any of
- * the clock bases so the result might be negative. Fix it up
- * to prevent a false positive in clockevents_program_event().
- */
- if (expires_next < 0)
- expires_next = 0;
- return expires_next;
}
/*
@@ -586,30 +619,28 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
* - HRTIMER_ACTIVE_SOFT, or
* - HRTIMER_ACTIVE_HARD.
*/
-static ktime_t
-__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
+static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
- unsigned int active;
struct hrtimer *next_timer = NULL;
ktime_t expires_next = KTIME_MAX;
+ unsigned int active;
+
+ lockdep_assert_held(&cpu_base->lock);
if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
- cpu_base->softirq_next_timer = NULL;
- expires_next = __hrtimer_next_event_base(cpu_base, NULL,
- active, KTIME_MAX);
-
- next_timer = cpu_base->softirq_next_timer;
+ if (active)
+ hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
+ cpu_base->softirq_next_timer = next_timer;
}
if (active_mask & HRTIMER_ACTIVE_HARD) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ if (active)
+ hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer);
cpu_base->next_timer = next_timer;
- expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
- expires_next);
}
-
- return expires_next;
+ return max(expires_next, 0);
}
static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
@@ -649,8 +680,8 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
- ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
- offs_real, offs_boot, offs_tai);
+ ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real,
+ offs_boot, offs_tai);
base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real;
base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot;
@@ -660,7 +691,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
}
/*
- * Is the high resolution mode active ?
+ * Is the high resolution mode active in the CPU base? This cannot use the
+ * static key as the CPUs are switched to high resolution mode
+ * asynchronously.
*/
static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
@@ -668,8 +701,13 @@ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
cpu_base->hres_active : 0;
}
-static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
- struct hrtimer *next_timer,
+static inline void hrtimer_rearm_event(ktime_t expires_next, bool deferred)
+{
+ trace_hrtimer_rearm(expires_next, deferred);
+ tick_program_event(expires_next, 1);
+}
+
+static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer,
ktime_t expires_next)
{
cpu_base->expires_next = expires_next;
@@ -694,20 +732,13 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected)
return;
- tick_program_event(expires_next, 1);
+ hrtimer_rearm_event(expires_next, false);
}
-/*
- * Reprogram the event source with checking both queues for the
- * next event
- * Called with interrupts disabled and base->lock held
- */
-static void
-hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
+/* Reprogram the event source with an evaluation of all clock bases */
+static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, bool skip_equal)
{
- ktime_t expires_next;
-
- expires_next = hrtimer_update_next_event(cpu_base);
+ ktime_t expires_next = hrtimer_update_next_event(cpu_base);
if (skip_equal && expires_next == cpu_base->expires_next)
return;
@@ -718,57 +749,49 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS
-/*
- * High resolution timer enabled ?
- */
+/* High resolution timer enabled ? */
static bool hrtimer_hres_enabled __read_mostly = true;
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);
-/*
- * Enable / Disable high resolution mode
- */
+/* Enable / Disable high resolution mode */
static int __init setup_hrtimer_hres(char *str)
{
return (kstrtobool(str, &hrtimer_hres_enabled) == 0);
}
-
__setup("highres=", setup_hrtimer_hres);
-/*
- * hrtimer_high_res_enabled - query, if the highres mode is enabled
- */
-static inline int hrtimer_is_hres_enabled(void)
+/* hrtimer_is_hres_enabled - query whether the highres mode is enabled */
+static inline bool hrtimer_is_hres_enabled(void)
{
return hrtimer_hres_enabled;
}
-/*
- * Switch to high resolution mode
- */
+/* Switch to high resolution mode */
static void hrtimer_switch_to_hres(void)
{
struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
if (tick_init_highres()) {
- pr_warn("Could not switch to high resolution mode on CPU %u\n",
- base->cpu);
+ pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu);
return;
}
- base->hres_active = 1;
+ base->hres_active = true;
hrtimer_resolution = HIGH_RES_NSEC;
tick_setup_sched_timer(true);
/* "Retrigger" the interrupt to get things going */
retrigger_next_event(NULL);
+ hrtimer_schedule_hres_work();
}
#else
-static inline int hrtimer_is_hres_enabled(void) { return 0; }
+static inline bool hrtimer_is_hres_enabled(void) { return 0; }
static inline void hrtimer_switch_to_hres(void) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
+
/*
* Retrigger next event is called after clock was set with interrupts
* disabled through an SMP function call or directly from low level
@@ -799,17 +822,16 @@ static void retrigger_next_event(void *arg)
* of the next expiring timer is enough. The return from the SMP
* function call will take care of the reprogramming in case the
* CPU was in a NOHZ idle sleep.
+ *
+ * In periodic low resolution mode, the next softirq expiration
+ * must also be updated.
*/
- if (!hrtimer_hres_active(base) && !tick_nohz_active)
- return;
-
- raw_spin_lock(&base->lock);
+ guard(raw_spinlock)(&base->lock);
hrtimer_update_base(base);
if (hrtimer_hres_active(base))
- hrtimer_force_reprogram(base, 0);
+ hrtimer_force_reprogram(base, /* skip_equal */ false);
else
hrtimer_update_next_event(base);
- raw_spin_unlock(&base->lock);
}
/*
@@ -823,10 +845,11 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *base = timer->base;
- ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ ktime_t expires = hrtimer_get_expires(timer);
- WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
+ WARN_ON_ONCE(expires < 0);
+ expires = ktime_sub(expires, base->offset);
/*
* CLOCK_REALTIME timer might be requested with an absolute
* expiry time which is less than base->offset. Set it to 0.
@@ -853,8 +876,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
timer_cpu_base->softirq_next_timer = timer;
timer_cpu_base->softirq_expires_next = expires;
- if (!ktime_before(expires, timer_cpu_base->expires_next) ||
- !reprogram)
+ if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram)
return;
}
@@ -868,11 +890,8 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
if (expires >= cpu_base->expires_next)
return;
- /*
- * If the hrtimer interrupt is running, then it will reevaluate the
- * clock bases and reprogram the clock event device.
- */
- if (cpu_base->in_hrtirq)
+ /* If a deferred rearm is pending skip reprogramming the device */
+ if (cpu_base->deferred_rearm)
return;
cpu_base->next_timer = timer;
@@ -880,8 +899,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
__hrtimer_reprogram(cpu_base, timer, expires);
}
-static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
- unsigned int active)
+static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active)
{
struct hrtimer_clock_base *base;
unsigned int seq;
@@ -907,13 +925,11 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
if (seq == cpu_base->clock_was_set_seq)
return false;
- /*
- * If the remote CPU is currently handling an hrtimer interrupt, it
- * will reevaluate the first expiring timer of all clock bases
- * before reprogramming. Nothing to do here.
- */
- if (cpu_base->in_hrtirq)
+ /* If a deferred rearm is pending the remote CPU will take care of it */
+ if (cpu_base->deferred_rearm) {
+ cpu_base->deferred_needs_update = true;
return false;
+ }
/*
* Walk the affected clock bases and check whether the first expiring
@@ -924,15 +940,15 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
active &= cpu_base->active_bases;
for_each_active_base(base, cpu_base, active) {
- struct timerqueue_node *next;
+ struct timerqueue_linked_node *next;
- next = timerqueue_getnext(&base->active);
+ next = timerqueue_linked_first(&base->active);
expires = ktime_sub(next->expires, base->offset);
if (expires < cpu_base->expires_next)
return true;
/* Extra check for softirq clock bases */
- if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT)
+ if (base->index < HRTIMER_BASE_MONOTONIC_SOFT)
continue;
if (cpu_base->softirq_activated)
continue;
@@ -958,11 +974,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
*/
void clock_was_set(unsigned int bases)
{
- struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases);
cpumask_var_t mask;
- int cpu;
- if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active)
+ if (!hrtimer_highres_enabled() && !tick_nohz_is_active())
goto out_timerfd;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -971,23 +985,19 @@ void clock_was_set(unsigned int bases)
}
/* Avoid interrupting CPUs if possible */
- cpus_read_lock();
- for_each_online_cpu(cpu) {
- unsigned long flags;
-
- cpu_base = &per_cpu(hrtimer_bases, cpu);
- raw_spin_lock_irqsave(&cpu_base->lock, flags);
+ scoped_guard(cpus_read_lock) {
+ int cpu;
- if (update_needs_ipi(cpu_base, bases))
- cpumask_set_cpu(cpu, mask);
+ for_each_online_cpu(cpu) {
+ struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_base->lock);
+ if (update_needs_ipi(cpu_base, bases))
+ cpumask_set_cpu(cpu, mask);
+ }
+ scoped_guard(preempt)
+ smp_call_function_many(mask, retrigger_next_event, NULL, 1);
}
-
- preempt_disable();
- smp_call_function_many(mask, retrigger_next_event, NULL, 1);
- preempt_enable();
- cpus_read_unlock();
free_cpumask_var(mask);
out_timerfd:
@@ -1022,11 +1032,8 @@ void hrtimers_resume_local(void)
retrigger_next_event(NULL);
}
-/*
- * Counterpart to lock_hrtimer_base above:
- */
-static inline
-void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
+/* Counterpart to lock_hrtimer_base above */
+static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
__releases(&timer->base->cpu_base->lock)
{
raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
@@ -1043,7 +1050,7 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
* .. note::
* This only updates the timer expiry value and does not requeue the timer.
*
- * There is also a variant of the function hrtimer_forward_now().
+ * There is also a variant of this function: hrtimer_forward_now().
*
* Context: Can be safely called from the callback function of @timer. If called
* from other contexts @timer must neither be enqueued nor running the
@@ -1053,15 +1060,15 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
*/
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
- u64 orun = 1;
ktime_t delta;
+ u64 orun = 1;
delta = ktime_sub(now, hrtimer_get_expires(timer));
if (delta < 0)
return 0;
- if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
+ if (WARN_ON(timer->is_queued))
return 0;
if (interval < hrtimer_resolution)
@@ -1072,7 +1079,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
orun = ktime_divns(delta, incr);
hrtimer_add_expires_ns(timer, incr * orun);
- if (hrtimer_get_expires_tv64(timer) > now)
+ if (hrtimer_get_expires(timer) > now)
return orun;
/*
* This (and the ktime_add() below) is the
@@ -1090,73 +1097,98 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
* enqueue_hrtimer - internal function to (re)start a timer
*
* The timer is inserted in expiry order. Insertion into the
- * red black tree is O(log(n)). Must hold the base lock.
+ * red black tree is O(log(n)).
*
* Returns true when the new timer is the leftmost timer in the tree.
*/
static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
- enum hrtimer_mode mode)
+ enum hrtimer_mode mode, bool was_armed)
{
- debug_activate(timer, mode);
+ lockdep_assert_held(&base->cpu_base->lock);
+
+ debug_activate(timer, mode, was_armed);
WARN_ON_ONCE(!base->cpu_base->online);
base->cpu_base->active_bases |= 1 << base->index;
/* Pairs with the lockless read in hrtimer_is_queued() */
- WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);
+ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);
+
+ if (!timerqueue_linked_add(&base->active, &timer->node))
+ return false;
+
+ base->expires_next = hrtimer_get_expires(timer);
+ return true;
+}
+
+static inline void base_update_next_timer(struct hrtimer_clock_base *base)
+{
+ struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
- return timerqueue_add(&base->active, &timer->node);
+ base->expires_next = next ? next->expires : KTIME_MAX;
}
/*
* __remove_hrtimer - internal function to remove a timer
*
- * Caller must hold the base lock.
- *
* High resolution timer mode reprograms the clock event device when the
* timer is the one which expires next. The caller can disable this by setting
* reprogram to zero. This is useful, when the context does a reprogramming
* anyway (e.g. timer interrupt)
*/
-static void __remove_hrtimer(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- u8 newstate, int reprogram)
+static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ bool newstate, bool reprogram)
{
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
- u8 state = timer->state;
+ bool was_first;
- /* Pairs with the lockless read in hrtimer_is_queued() */
- WRITE_ONCE(timer->state, newstate);
- if (!(state & HRTIMER_STATE_ENQUEUED))
+ lockdep_assert_held(&cpu_base->lock);
+
+ if (!timer->is_queued)
return;
- if (!timerqueue_del(&base->active, &timer->node))
+ /* Pairs with the lockless read in hrtimer_is_queued() */
+ WRITE_ONCE(timer->is_queued, newstate);
+
+ was_first = !timerqueue_linked_prev(&timer->node);
+
+ if (!timerqueue_linked_del(&base->active, &timer->node))
cpu_base->active_bases &= ~(1 << base->index);
+ /* Nothing to update if this was not the first timer in the base */
+ if (!was_first)
+ return;
+
+ base_update_next_timer(base);
+
/*
- * Note: If reprogram is false we do not update
- * cpu_base->next_timer. This happens when we remove the first
- * timer on a remote cpu. No harm as we never dereference
- * cpu_base->next_timer. So the worst thing what can happen is
- * an superfluous call to hrtimer_force_reprogram() on the
- * remote cpu later on if the same timer gets enqueued again.
+ * If reprogram is false don't update cpu_base->next_timer and do not
+ * touch the clock event device.
+ *
+ * This happens when removing the first timer on a remote CPU, which
+ * will be handled by the remote CPU's interrupt. It also happens when
+ * a local timer is removed to be immediately restarted. That's handled
+ * at the call site.
*/
- if (reprogram && timer == cpu_base->next_timer)
- hrtimer_force_reprogram(cpu_base, 1);
+ if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy)
+ return;
+
+ if (cpu_base->deferred_rearm)
+ cpu_base->deferred_needs_update = true;
+ else
+ hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
}
-/*
- * remove hrtimer, called with base lock held
- */
-static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
- bool restart, bool keep_local)
+static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ bool newstate)
{
- u8 state = timer->state;
+ lockdep_assert_held(&base->cpu_base->lock);
- if (state & HRTIMER_STATE_ENQUEUED) {
+ if (timer->is_queued) {
bool reprogram;
+ debug_hrtimer_deactivate(timer);
+
/*
* Remove the timer and force reprogramming when high
* resolution mode is active and the timer is on the current
@@ -1165,24 +1197,81 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
* reprogramming happens in the interrupt handler. This is a
* rare case and less expensive than a smp call.
*/
- debug_deactivate(timer);
reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
- /*
- * If the timer is not restarted then reprogramming is
- * required if the timer is local. If it is local and about
- * to be restarted, avoid programming it twice (on removal
- * and a moment later when it's requeued).
- */
- if (!restart)
- state = HRTIMER_STATE_INACTIVE;
- else
- reprogram &= !keep_local;
+ __remove_hrtimer(timer, base, newstate, reprogram);
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Update in place has to retrieve the expiry times of the neighbour nodes
+ * if they exist. That is cache line neutral because the dequeue/enqueue
+ * operation is going to need the same cache lines. But there is a big win
+ * when the dequeue/enqueue can be avoided because the RB tree does not
+ * have to be rebalanced twice.
+ */
+static inline bool
+hrtimer_can_update_in_place(struct hrtimer *timer, struct hrtimer_clock_base *base, ktime_t expires)
+{
+ struct timerqueue_linked_node *next = timerqueue_linked_next(&timer->node);
+ struct timerqueue_linked_node *prev = timerqueue_linked_prev(&timer->node);
- __remove_hrtimer(timer, base, state, reprogram);
- return 1;
+ /* If the new expiry goes behind the next timer, requeue is required */
+ if (next && expires > next->expires)
+ return false;
+
+ /* If this is the first timer, update in place */
+ if (!prev)
+ return true;
+
+ /* Update in place when it does not go ahead of the previous one */
+ return expires >= prev->expires;
+}
+
+static inline bool
+remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns)
+{
+ bool was_first = false;
+
+ /* Remove it from the timer queue if active */
+ if (timer->is_queued) {
+ was_first = !timerqueue_linked_prev(&timer->node);
+
+ /* Try to update in place to avoid the de/enqueue dance */
+ if (hrtimer_can_update_in_place(timer, base, expires)) {
+ hrtimer_set_expires_range_ns(timer, expires, delta_ns);
+ trace_hrtimer_start(timer, mode, true);
+ if (was_first)
+ base->expires_next = expires;
+ return was_first;
+ }
+
+ debug_hrtimer_deactivate(timer);
+ timerqueue_linked_del(&base->active, &timer->node);
}
- return 0;
+
+ /* Set the new expiry time */
+ hrtimer_set_expires_range_ns(timer, expires, delta_ns);
+
+ debug_activate(timer, mode, timer->is_queued);
+ base->cpu_base->active_bases |= 1 << base->index;
+
+ /* Pairs with the lockless read in hrtimer_is_queued() */
+ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED);
+
+ /* If it's the first expiring timer now or again, update base */
+ if (timerqueue_linked_add(&base->active, &timer->node)) {
+ base->expires_next = expires;
+ return true;
+ }
+
+ if (was_first)
+ base_update_next_timer(base);
+
+ return false;
}
static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
@@ -1201,55 +1290,93 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
return tim;
}
-static void
-hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
+static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
- ktime_t expires;
+ ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
/*
- * Find the next SOFT expiration.
- */
- expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
-
- /*
- * reprogramming needs to be triggered, even if the next soft
- * hrtimer expires at the same time than the next hard
+ * Reprogramming needs to be triggered, even if the next soft
+ * hrtimer expires at the same time as the next hard
* hrtimer. cpu_base->softirq_expires_next needs to be updated!
*/
if (expires == KTIME_MAX)
return;
/*
- * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
- * cpu_base->*expires_next is only set by hrtimer_reprogram()
+ * cpu_base->next_timer is recomputed by __hrtimer_get_next_event()
+ * cpu_base->expires_next is only set by hrtimer_reprogram()
*/
hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
}
-static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- u64 delta_ns, const enum hrtimer_mode mode,
- struct hrtimer_clock_base *base)
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
+{
+ if (static_branch_likely(&timers_migration_enabled)) {
+ /*
+ * If it is local and the first expiring timer keep it on the local
+ * CPU to optimize reprogramming of the clockevent device. Also
+ * avoid switch_hrtimer_base() overhead when local and pinned.
+ */
+ if (!is_local)
+ return false;
+ if (is_first || is_pinned)
+ return true;
+
+ /* Honour the NOHZ full restrictions */
+ if (!housekeeping_cpu(smp_processor_id(), HK_TYPE_KERNEL_NOISE))
+ return false;
+
+ /*
+ * If the tick is not stopped or need_resched() is set, then
+ * there is no point in moving the timer somewhere else.
+ */
+ return !tick_nohz_tick_stopped() || need_resched();
+ }
+ return is_local;
+}
+#else
+static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned)
+{
+ return is_local;
+}
+#endif
+
+static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool is_first,
+ bool is_pinned)
+{
+ /* If the timer is running the callback it has to stay on its CPU base. */
+ if (unlikely(timer->base->running == timer))
+ return true;
+
+ return hrtimer_prefer_local(is_local, is_first, is_pinned);
+}
+
+static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+ const enum hrtimer_mode mode, struct hrtimer_clock_base *base)
{
struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
- struct hrtimer_clock_base *new_base;
- bool force_local, first;
+ bool is_pinned, first, was_first, keep_base = false;
+ struct hrtimer_cpu_base *cpu_base = base->cpu_base;
- /*
- * If the timer is on the local cpu base and is the first expiring
- * timer then this might end up reprogramming the hardware twice
- * (on removal and on enqueue). To avoid that by prevent the
- * reprogram on removal, keep the timer local to the current CPU
- * and enforce reprogramming after it is queued no matter whether
- * it is the new first expiring timer again or not.
- */
- force_local = base->cpu_base == this_cpu_base;
- force_local &= base->cpu_base->next_timer == timer;
+ was_first = cpu_base->next_timer == timer;
+ is_pinned = !!(mode & HRTIMER_MODE_PINNED);
/*
- * Don't force local queuing if this enqueue happens on a unplugged
- * CPU after hrtimer_cpu_dying() has been invoked.
+ * Don't keep it local if this enqueue happens on an unplugged CPU
+ * after hrtimer_cpu_dying() has been invoked.
*/
- force_local &= this_cpu_base->online;
+ if (likely(this_cpu_base->online)) {
+ bool is_local = cpu_base == this_cpu_base;
+
+ keep_base = hrtimer_keep_base(timer, is_local, was_first, is_pinned);
+ }
+
+ /* Calculate absolute expiry time for relative timers */
+ if (mode & HRTIMER_MODE_REL)
+ tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid));
+ /* Compensate for low resolution granularity */
+ tim = hrtimer_update_lowres(timer, tim, mode);
/*
* Remove an active timer from the queue. In case it is not queued
@@ -1261,32 +1388,41 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* reprogramming later if it was the first expiring timer. This
* avoids programming the underlying clock event twice (once at
* removal and once after enqueue).
+ *
+ * @keep_base is also true if the timer callback is running on a
+ * remote CPU and for local pinned timers.
*/
- remove_hrtimer(timer, base, true, force_local);
+ if (likely(keep_base)) {
+ first = remove_and_enqueue_same_base(timer, base, mode, tim, delta_ns);
+ } else {
+ /* Keep the ENQUEUED state in case it is queued */
+ bool was_armed = remove_hrtimer(timer, base, HRTIMER_STATE_ENQUEUED);
- if (mode & HRTIMER_MODE_REL)
- tim = ktime_add_safe(tim, base->get_time());
+ hrtimer_set_expires_range_ns(timer, tim, delta_ns);
- tim = hrtimer_update_lowres(timer, tim, mode);
+ /* Switch the timer base, if necessary: */
+ base = switch_hrtimer_base(timer, base, is_pinned);
+ cpu_base = base->cpu_base;
- hrtimer_set_expires_range_ns(timer, tim, delta_ns);
+ first = enqueue_hrtimer(timer, base, mode, was_armed);
+ }
- /* Switch the timer base, if necessary: */
- if (!force_local) {
- new_base = switch_hrtimer_base(timer, base,
- mode & HRTIMER_MODE_PINNED);
- } else {
- new_base = base;
+ /* If a deferred rearm is pending skip reprogramming the device */
+ if (cpu_base->deferred_rearm) {
+ cpu_base->deferred_needs_update = true;
+ return false;
}
- first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local) {
+ if (!was_first || cpu_base != this_cpu_base) {
/*
- * If the current CPU base is online, then the timer is
- * never queued on a remote CPU if it would be the first
- * expiring timer there.
+ * If the current CPU base is online, then the timer is never
+ * queued on a remote CPU if it would be the first expiring
+ * timer there unless the timer callback is currently executed
+ * on the remote CPU. In the latter case the remote CPU will
+ * re-evaluate the first expiring timer after completing the
+ * callbacks.
*/
- if (hrtimer_base_is_online(this_cpu_base))
+ if (likely(hrtimer_base_is_online(this_cpu_base)))
return first;
/*
@@ -1294,21 +1430,33 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* already offline. If the timer is the first to expire,
* kick the remote CPU to reprogram the clock event.
*/
- if (first) {
- struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+ if (first)
+ smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd);
+ return false;
+ }
- smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
- }
- return 0;
+ /*
+ * Special case for the HRTICK timer. It is frequently rearmed and most
+ * of the time moves the expiry into the future. That's expensive in
+ * virtual machines and it's better to take the pointless already armed
+ * interrupt than reprogramming the hardware on every context switch.
+ *
+ * If the new expiry is before the armed time, then reprogramming is
+ * required.
+ */
+ if (timer->is_lazy) {
+ if (cpu_base->expires_next <= hrtimer_get_expires(timer))
+ return false;
}
/*
- * Timer was forced to stay on the current CPU to avoid
- * reprogramming on removal and enqueue. Force reprogram the
- * hardware by evaluating the new first expiring timer.
+ * Timer was the first expiring timer and forced to stay on the
+ * current CPU to avoid reprogramming on removal and enqueue. Force
+ * reprogram the hardware by evaluating the new first expiring
+ * timer.
*/
- hrtimer_force_reprogram(new_base->cpu_base, 1);
- return 0;
+ hrtimer_force_reprogram(cpu_base, /* skip_equal */ true);
+ return false;
}
/**
@@ -1320,14 +1468,14 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
* softirq based mode is considered for debug purpose only!
*/
-void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- u64 delta_ns, const enum hrtimer_mode mode)
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns,
+ const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base;
unsigned long flags;
- if (WARN_ON_ONCE(!timer->function))
- return;
+ debug_hrtimer_assert_init(timer);
+
/*
* Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft
* match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard
@@ -1375,8 +1523,11 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
base = lock_hrtimer_base(timer, &flags);
- if (!hrtimer_callback_running(timer))
- ret = remove_hrtimer(timer, base, false, false);
+ if (!hrtimer_callback_running(timer)) {
+ ret = remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE);
+ if (ret)
+ trace_hrtimer_cancel(timer);
+ }
unlock_hrtimer_base(timer, &flags);
@@ -1410,8 +1561,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
* the timer callback to finish. Drop expiry_lock and reacquire it. That
* allows the waiter to acquire the lock and make progress.
*/
-static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
- unsigned long flags)
+static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags)
{
if (atomic_read(&cpu_base->timer_waiters)) {
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1439,7 +1589,7 @@ static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
* running.
*
* This prevents priority inversion: if the soft irq thread is preempted
- * in the middle of a timer callback, then calling del_timer_sync() can
+ * in the middle of a timer callback, then calling hrtimer_cancel() can
* lead to two issues:
*
* - If the caller is on a remote CPU then it has to spin wait for the timer
@@ -1476,14 +1626,10 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer)
spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
}
#else
-static inline void
-hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
-static inline void
-hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
-static inline void
-hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
-static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
- unsigned long flags) { }
+static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
+static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long fl) { }
#endif
/**
@@ -1539,15 +1685,11 @@ u64 hrtimer_get_next_event(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
u64 expires = KTIME_MAX;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&cpu_base->lock, flags);
+ guard(raw_spinlock_irqsave)(&cpu_base->lock);
if (!hrtimer_hres_active(cpu_base))
expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
-
return expires;
}
@@ -1562,48 +1704,65 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
u64 expires = KTIME_MAX;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&cpu_base->lock, flags);
-
- if (hrtimer_hres_active(cpu_base)) {
- unsigned int active;
+ unsigned int active;
- if (!cpu_base->softirq_activated) {
- active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
- expires = __hrtimer_next_event_base(cpu_base, exclude,
- active, KTIME_MAX);
- }
- active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
- expires = __hrtimer_next_event_base(cpu_base, exclude, active,
- expires);
- }
+ guard(raw_spinlock_irqsave)(&cpu_base->lock);
+ if (!hrtimer_hres_active(cpu_base))
+ return expires;
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+ if (active && !cpu_base->softirq_activated)
+ expires = hrtimer_bases_next_event_without(cpu_base, exclude, active, KTIME_MAX);
- return expires;
+ active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
+ if (!active)
+ return expires;
+ return hrtimer_bases_next_event_without(cpu_base, exclude, active, expires);
}
#endif
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
- if (likely(clock_id < MAX_CLOCKS)) {
- int base = hrtimer_clock_to_base_table[clock_id];
+ switch (clock_id) {
+ case CLOCK_MONOTONIC:
+ return HRTIMER_BASE_MONOTONIC;
+ case CLOCK_REALTIME:
+ return HRTIMER_BASE_REALTIME;
+ case CLOCK_BOOTTIME:
+ return HRTIMER_BASE_BOOTTIME;
+ case CLOCK_TAI:
+ return HRTIMER_BASE_TAI;
+ default:
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return HRTIMER_BASE_MONOTONIC;
+ }
+}
- if (likely(base != HRTIMER_MAX_CLOCK_BASES))
- return base;
+static ktime_t __hrtimer_cb_get_time(clockid_t clock_id)
+{
+ switch (clock_id) {
+ case CLOCK_MONOTONIC:
+ return ktime_get();
+ case CLOCK_REALTIME:
+ return ktime_get_real();
+ case CLOCK_BOOTTIME:
+ return ktime_get_boottime();
+ case CLOCK_TAI:
+ return ktime_get_clocktai();
+ default:
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return ktime_get();
}
- WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
- return HRTIMER_BASE_MONOTONIC;
}
-static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused)
+ktime_t hrtimer_cb_get_time(const struct hrtimer *timer)
{
- return HRTIMER_NORESTART;
+ return __hrtimer_cb_get_time(timer->base->clockid);
}
+EXPORT_SYMBOL_GPL(hrtimer_cb_get_time);
-static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
- enum hrtimer_mode mode)
+static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(struct hrtimer *),
+ clockid_t clock_id, enum hrtimer_mode mode)
{
bool softtimer = !!(mode & HRTIMER_MODE_SOFT);
struct hrtimer_cpu_base *cpu_base;
@@ -1634,41 +1793,15 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
base += hrtimer_clockid_to_base(clock_id);
timer->is_soft = softtimer;
timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
+ timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM);
timer->base = &cpu_base->clock_base[base];
- timerqueue_init(&timer->node);
-}
-
-static void __hrtimer_setup(struct hrtimer *timer,
- enum hrtimer_restart (*function)(struct hrtimer *),
- clockid_t clock_id, enum hrtimer_mode mode)
-{
- __hrtimer_init(timer, clock_id, mode);
+ timerqueue_linked_init(&timer->node);
- if (WARN_ON_ONCE(!function))
- timer->function = hrtimer_dummy_timeout;
+ if (WARN_ON_ONCE(!fn))
+ ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout;
else
- timer->function = function;
-}
-
-/**
- * hrtimer_init - initialize a timer to the given clock
- * @timer: the timer to be initialized
- * @clock_id: the clock to be used
- * @mode: The modes which are relevant for initialization:
- * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
- * HRTIMER_MODE_REL_SOFT
- *
- * The PINNED variants of the above can be handed in,
- * but the PINNED bit is ignored as pinning happens
- * when the hrtimer is started
- */
-void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
- enum hrtimer_mode mode)
-{
- debug_init(timer, clock_id, mode);
- __hrtimer_init(timer, clock_id, mode);
+ ACCESS_PRIVATE(timer, function) = fn;
}
-EXPORT_SYMBOL_GPL(hrtimer_init);
/**
* hrtimer_setup - initialize a timer to the given clock
@@ -1686,7 +1819,7 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *),
clockid_t clock_id, enum hrtimer_mode mode)
{
- debug_init(timer, clock_id, mode);
+ debug_setup(timer, clock_id, mode);
__hrtimer_setup(timer, function, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup);
@@ -1705,7 +1838,7 @@ void hrtimer_setup_on_stack(struct hrtimer *timer,
enum hrtimer_restart (*function)(struct hrtimer *),
clockid_t clock_id, enum hrtimer_mode mode)
{
- debug_init_on_stack(timer, clock_id, mode);
+ debug_setup_on_stack(timer, clock_id, mode);
__hrtimer_setup(timer, function, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack);
@@ -1726,12 +1859,10 @@ bool hrtimer_active(const struct hrtimer *timer)
base = READ_ONCE(timer->base);
seq = raw_read_seqcount_begin(&base->seq);
- if (timer->state != HRTIMER_STATE_INACTIVE ||
- base->running == timer)
+ if (timer->is_queued || base->running == timer)
return true;
- } while (read_seqcount_retry(&base->seq, seq) ||
- base != READ_ONCE(timer->base));
+ } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base));
return false;
}
@@ -1745,7 +1876,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
* - callback: the timer is being ran
* - post: the timer is inactive or (re)queued
*
- * On the read side we ensure we observe timer->state and cpu_base->running
+ * On the read side we ensure we observe timer->is_queued and cpu_base->running
* from the same section, if anything changed while we looked at it, we retry.
* This includes timer->base changing because sequence numbers alone are
* insufficient for that.
@@ -1754,11 +1885,9 @@ EXPORT_SYMBOL_GPL(hrtimer_active);
* a false negative if the read side got smeared over multiple consecutive
* __run_hrtimer() invocations.
*/
-
-static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
- struct hrtimer_clock_base *base,
- struct hrtimer *timer, ktime_t *now,
- unsigned long flags) __must_hold(&cpu_base->lock)
+static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base,
+ struct hrtimer *timer, ktime_t now, unsigned long flags)
+ __must_hold(&cpu_base->lock)
{
enum hrtimer_restart (*fn)(struct hrtimer *);
bool expires_in_hardirq;
@@ -1766,20 +1895,20 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
lockdep_assert_held(&cpu_base->lock);
- debug_deactivate(timer);
+ debug_hrtimer_deactivate(timer);
base->running = timer;
/*
- * Separate the ->running assignment from the ->state assignment.
+ * Separate the ->running assignment from the ->is_queued assignment.
*
* As with a regular write barrier, this ensures the read side in
* hrtimer_active() cannot observe base->running == NULL &&
- * timer->state == INACTIVE.
+ * timer->is_queued == INACTIVE.
*/
raw_write_seqcount_barrier(&base->seq);
- __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
- fn = timer->function;
+ __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, false);
+ fn = ACCESS_PRIVATE(timer, function);
/*
* Clear the 'is relative' flag for the TIME_LOW_RES case. If the
@@ -1813,16 +1942,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
* hrtimer_start_range_ns() can have popped in and enqueued the timer
* for us already.
*/
- if (restart != HRTIMER_NORESTART &&
- !(timer->state & HRTIMER_STATE_ENQUEUED))
- enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
+ if (restart == HRTIMER_RESTART && !timer->is_queued)
+ enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false);
/*
- * Separate the ->running assignment from the ->state assignment.
+ * Separate the ->running assignment from the ->is_queued assignment.
*
* As with a regular write barrier, this ensures the read side in
* hrtimer_active() cannot observe base->running.timer == NULL &&
- * timer->state == INACTIVE.
+ * timer->is_queued == INACTIVE.
*/
raw_write_seqcount_barrier(&base->seq);
@@ -1830,23 +1958,24 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
base->running = NULL;
}
+static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base)
+{
+ struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active);
+
+ return next ? hrtimer_from_timerqueue_node(next) : NULL;
+}
+
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
unsigned long flags, unsigned int active_mask)
{
- struct hrtimer_clock_base *base;
unsigned int active = cpu_base->active_bases & active_mask;
+ struct hrtimer_clock_base *base;
for_each_active_base(base, cpu_base, active) {
- struct timerqueue_node *node;
- ktime_t basenow;
-
- basenow = ktime_add(now, base->offset);
-
- while ((node = timerqueue_getnext(&base->active))) {
- struct hrtimer *timer;
-
- timer = container_of(node, struct hrtimer, node);
+ ktime_t basenow = ktime_add(now, base->offset);
+ struct hrtimer *timer;
+ while ((timer = clock_base_next_timer(base))) {
/*
* The immediate goal for using the softexpires is
* minimizing wakeups, not running timers at the
@@ -1859,10 +1988,10 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
* are right-of a not yet expired timer, because that
* timer will have to trigger a wakeup anyway.
*/
- if (basenow < hrtimer_get_softexpires_tv64(timer))
+ if (basenow < hrtimer_get_softexpires(timer))
break;
- __run_hrtimer(cpu_base, base, timer, &basenow, flags);
+ __run_hrtimer(cpu_base, base, timer, basenow, flags);
if (active_mask == HRTIMER_ACTIVE_SOFT)
hrtimer_sync_wait_running(cpu_base, flags);
}
@@ -1881,7 +2010,7 @@ static __latent_entropy void hrtimer_run_softirq(void)
now = hrtimer_update_base(cpu_base);
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);
- cpu_base->softirq_activated = 0;
+ cpu_base->softirq_activated = false;
hrtimer_update_softirq_timer(cpu_base, true);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
@@ -1891,6 +2020,63 @@ static __latent_entropy void hrtimer_run_softirq(void)
#ifdef CONFIG_HIGH_RES_TIMERS
/*
+ * Very similar to hrtimer_force_reprogram(), except it deals with
+ * deferred_rearm and hang_detected.
+ */
+static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred)
+{
+ cpu_base->expires_next = expires_next;
+ cpu_base->deferred_rearm = false;
+
+ if (unlikely(cpu_base->hang_detected)) {
+ /*
+ * Give the system a chance to do something else than looping
+ * on hrtimer interrupts.
+ */
+ expires_next = ktime_add_ns(ktime_get(),
+ min(100 * NSEC_PER_MSEC, cpu_base->max_hang_time));
+ }
+ hrtimer_rearm_event(expires_next, deferred);
+}
+
+#ifdef CONFIG_HRTIMER_REARM_DEFERRED
+void __hrtimer_rearm_deferred(void)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+ ktime_t expires_next;
+
+ if (!cpu_base->deferred_rearm)
+ return;
+
+ guard(raw_spinlock)(&cpu_base->lock);
+ if (cpu_base->deferred_needs_update) {
+ hrtimer_update_base(cpu_base);
+ expires_next = hrtimer_update_next_event(cpu_base);
+ } else {
+ /* No timer added/removed. Use the cached value */
+ expires_next = cpu_base->deferred_expires_next;
+ }
+ hrtimer_rearm(cpu_base, expires_next, true);
+}
+
+static __always_inline void
+hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next)
+{
+ /* hrtimer_interrupt() just re-evaluated the first expiring timer */
+ cpu_base->deferred_needs_update = false;
+ /* Cache the expiry time */
+ cpu_base->deferred_expires_next = expires_next;
+ set_thread_flag(TIF_HRTIMER_REARM);
+}
+#else /* CONFIG_HRTIMER_REARM_DEFERRED */
+static __always_inline void
+hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next)
+{
+ hrtimer_rearm(cpu_base, expires_next, false);
+}
+#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */
+
+/*
* High resolution timer interrupt
* Called with interrupts disabled
*/
@@ -1904,86 +2090,55 @@ void hrtimer_interrupt(struct clock_event_device *dev)
BUG_ON(!cpu_base->hres_active);
cpu_base->nr_events++;
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
raw_spin_lock_irqsave(&cpu_base->lock, flags);
entry_time = now = hrtimer_update_base(cpu_base);
retry:
- cpu_base->in_hrtirq = 1;
+ cpu_base->deferred_rearm = true;
/*
- * We set expires_next to KTIME_MAX here with cpu_base->lock
- * held to prevent that a timer is enqueued in our queue via
- * the migration code. This does not affect enqueueing of
- * timers which run their callback and need to be requeued on
- * this CPU.
+ * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue
+ * timers while __hrtimer_run_queues() is expiring the clock bases.
+ * Timers which are re/enqueued on the local CPU are not affected by
+ * this.
*/
cpu_base->expires_next = KTIME_MAX;
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
- cpu_base->softirq_activated = 1;
+ cpu_base->softirq_activated = true;
raise_timer_softirq(HRTIMER_SOFTIRQ);
}
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
- /* Reevaluate the clock bases for the [soft] next expiry */
- expires_next = hrtimer_update_next_event(cpu_base);
- /*
- * Store the new expiry value so the migration code can verify
- * against it.
- */
- cpu_base->expires_next = expires_next;
- cpu_base->in_hrtirq = 0;
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
-
- /* Reprogramming necessary ? */
- if (!tick_program_event(expires_next, 0)) {
- cpu_base->hang_detected = 0;
- return;
- }
-
/*
* The next timer was already expired due to:
* - tracing
* - long lasting callbacks
* - being scheduled away when running in a VM
*
- * We need to prevent that we loop forever in the hrtimer
- * interrupt routine. We give it 3 attempts to avoid
- * overreacting on some spurious event.
- *
- * Acquire base lock for updating the offsets and retrieving
- * the current time.
+ * We need to prevent that we loop forever in the hrtimer interrupt
+ * routine. We give it 3 attempts to avoid overreacting on some
+ * spurious event.
*/
- raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
- cpu_base->nr_retries++;
- if (++retries < 3)
- goto retry;
- /*
- * Give the system a chance to do something else than looping
- * here. We stored the entry time, so we know exactly how long
- * we spent here. We schedule the next event this amount of
- * time away.
- */
- cpu_base->nr_hangs++;
- cpu_base->hang_detected = 1;
- raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+ expires_next = hrtimer_update_next_event(cpu_base);
+ cpu_base->hang_detected = false;
+ if (expires_next < now) {
+ if (++retries < 3)
+ goto retry;
+
+ delta = ktime_sub(now, entry_time);
+ cpu_base->max_hang_time = max_t(unsigned int, cpu_base->max_hang_time, delta);
+ cpu_base->nr_hangs++;
+ cpu_base->hang_detected = true;
+ }
- delta = ktime_sub(now, entry_time);
- if ((unsigned int)delta > cpu_base->max_hang_time)
- cpu_base->max_hang_time = (unsigned int) delta;
- /*
- * Limit it to a sensible value as we enforce a longer
- * delay. Give the CPU at least 100ms to catch up.
- */
- if (delta > 100 * NSEC_PER_MSEC)
- expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
- else
- expires_next = ktime_add(now, delta);
- tick_program_event(expires_next, 1);
- pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
+ hrtimer_interrupt_rearm(cpu_base, expires_next);
+ raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
+
#endif /* !CONFIG_HIGH_RES_TIMERS */
/*
@@ -2015,7 +2170,7 @@ void hrtimer_run_queues(void)
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
- cpu_base->softirq_activated = 1;
+ cpu_base->softirq_activated = true;
raise_timer_softirq(HRTIMER_SOFTIRQ);
}
@@ -2028,8 +2183,7 @@ void hrtimer_run_queues(void)
*/
static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
{
- struct hrtimer_sleeper *t =
- container_of(timer, struct hrtimer_sleeper, timer);
+ struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer);
struct task_struct *task = t->task;
t->task = NULL;
@@ -2047,14 +2201,13 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
* Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
* to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
*/
-void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
- enum hrtimer_mode mode)
+void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode)
{
/*
* Make the enqueue delivery mode check work on RT. If the sleeper
* was initialized for hard interrupt delivery, force the mode bit.
* This is a special case for hrtimer_sleepers because
- * __hrtimer_init_sleeper() determines the delivery mode on RT so the
+ * __hrtimer_setup_sleeper() determines the delivery mode on RT so the
* fiddling with this decision is avoided at the call sites.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
@@ -2064,8 +2217,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);
-static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
- clockid_t clock_id, enum hrtimer_mode mode)
+static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id,
+ enum hrtimer_mode mode)
{
/*
* On PREEMPT_RT enabled kernels hrtimers which are not explicitly
@@ -2091,8 +2244,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
mode |= HRTIMER_MODE_HARD;
}
- __hrtimer_init(&sl->timer, clock_id, mode);
- sl->timer.function = hrtimer_wakeup;
+ __hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode);
sl->task = current;
}
@@ -2102,11 +2254,11 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
* @clock_id: the clock to be used
* @mode: timer mode abs/rel
*/
-void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl,
- clockid_t clock_id, enum hrtimer_mode mode)
+void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id,
+ enum hrtimer_mode mode)
{
- debug_init_on_stack(&sl->timer, clock_id, mode);
- __hrtimer_init_sleeper(sl, clock_id, mode);
+ debug_setup_on_stack(&sl->timer, clock_id, mode);
+ __hrtimer_setup_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack);
@@ -2170,18 +2322,17 @@ static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
int ret;
hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS);
- hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
+ hrtimer_set_expires(&t.timer, restart->nanosleep.expires);
ret = do_nanosleep(&t, HRTIMER_MODE_ABS);
destroy_hrtimer_on_stack(&t.timer);
return ret;
}
-long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
- const clockid_t clockid)
+long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid)
{
struct restart_block *restart;
struct hrtimer_sleeper t;
- int ret = 0;
+ int ret;
hrtimer_setup_sleeper_on_stack(&t, clockid, mode);
hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns);
@@ -2197,7 +2348,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
restart = &current->restart_block;
restart->nanosleep.clockid = t.timer.base->clockid;
- restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
+ restart->nanosleep.expires = hrtimer_get_expires(&t.timer);
set_restart_fn(restart, hrtimer_nanosleep_restart);
out:
destroy_hrtimer_on_stack(&t.timer);
@@ -2220,8 +2371,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
- CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#endif
@@ -2229,7 +2379,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
- struct old_timespec32 __user *, rmtp)
+ struct old_timespec32 __user *, rmtp)
{
struct timespec64 tu;
@@ -2242,8 +2392,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
current->restart_block.fn = do_no_restart_syscall;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL,
- CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#endif
@@ -2253,14 +2402,13 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
int hrtimers_prepare_cpu(unsigned int cpu)
{
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
- int i;
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+ for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i];
clock_b->cpu_base = cpu_base;
seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock);
- timerqueue_init_head(&clock_b->active);
+ timerqueue_linked_init_head(&clock_b->active);
}
cpu_base->cpu = cpu;
@@ -2274,13 +2422,14 @@ int hrtimers_cpu_starting(unsigned int cpu)
/* Clear out any left over state from a CPU down operation */
cpu_base->active_bases = 0;
- cpu_base->hres_active = 0;
- cpu_base->hang_detected = 0;
+ cpu_base->hres_active = false;
+ cpu_base->hang_detected = false;
cpu_base->next_timer = NULL;
cpu_base->softirq_next_timer = NULL;
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
- cpu_base->online = 1;
+ cpu_base->softirq_activated = false;
+ cpu_base->online = true;
return 0;
}
@@ -2289,20 +2438,20 @@ int hrtimers_cpu_starting(unsigned int cpu)
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
struct hrtimer_clock_base *new_base)
{
+ struct timerqueue_linked_node *node;
struct hrtimer *timer;
- struct timerqueue_node *node;
- while ((node = timerqueue_getnext(&old_base->active))) {
- timer = container_of(node, struct hrtimer, node);
+ while ((node = timerqueue_linked_first(&old_base->active))) {
+ timer = hrtimer_from_timerqueue_node(node);
BUG_ON(hrtimer_callback_running(timer));
- debug_deactivate(timer);
+ debug_hrtimer_deactivate(timer);
/*
* Mark it as ENQUEUED not INACTIVE otherwise the
* timer could be seen as !active and just vanish away
* under us on another CPU
*/
- __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
+ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, false);
timer->base = new_base;
/*
* Enqueue the timers on the new cpu. This does not
@@ -2312,13 +2461,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
* sort out already expired timers and reprogram the
* event device.
*/
- enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
+ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true);
}
}
int hrtimers_cpu_dying(unsigned int dying_cpu)
{
- int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+ int ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
struct hrtimer_cpu_base *old_base, *new_base;
old_base = this_cpu_ptr(&hrtimer_bases);
@@ -2331,21 +2480,14 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
raw_spin_lock(&old_base->lock);
raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
- for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- migrate_hrtimer_list(&old_base->clock_base[i],
- &new_base->clock_base[i]);
- }
+ for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]);
- /*
- * The migration might have changed the first expiring softirq
- * timer on this CPU. Update it.
- */
- __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
/* Tell the other CPU to retrigger the next event */
smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
raw_spin_unlock(&new_base->lock);
- old_base->online = 0;
+ old_base->online = false;
raw_spin_unlock(&old_base->lock);
return 0;
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 876d389b2e21..7c6110e964e7 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -163,8 +163,7 @@ void posixtimer_rearm_itimer(struct task_struct *tsk)
struct hrtimer *tmr = &tsk->signal->real_timer;
if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) {
- hrtimer_forward(tmr, tmr->base->get_time(),
- tsk->signal->it_real_incr);
+ hrtimer_forward_now(tmr, tsk->signal->it_real_incr);
hrtimer_restart(tmr);
}
}
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index bc4db9e5ab70..1c954f330dfe 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -32,7 +32,6 @@ static u64 jiffies_read(struct clocksource *cs)
static struct clocksource clocksource_jiffies = {
.name = "jiffies",
.rating = 1, /* lowest valid rating*/
- .uncertainty_margin = 32 * NSEC_PER_MSEC,
.read = jiffies_read,
.mask = CLOCKSOURCE_MASK(32),
.mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
@@ -75,13 +74,11 @@ struct clocksource * __init __weak clocksource_default_clock(void)
static struct clocksource refined_jiffies;
-int register_refined_jiffies(long cycles_per_second)
+void __init register_refined_jiffies(long cycles_per_second)
{
u64 nsec_per_tick, shift_hz;
long cycles_per_tick;
-
-
refined_jiffies = clocksource_jiffies;
refined_jiffies.name = "refined-jiffies";
refined_jiffies.rating++;
@@ -100,5 +97,221 @@ int register_refined_jiffies(long cycles_per_second)
refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT;
__clocksource_register(&refined_jiffies);
- return 0;
}
+
+#ifdef CONFIG_PROC_SYSCTL
+static ulong mult_hz(const ulong val)
+{
+ return val * HZ; /* plain multiply by HZ; nothing here guards against ulong wrap-around */
+}
+
+static ulong div_hz(const ulong val)
+{
+ return val / HZ; /* inverse of mult_hz(); integer division drops any remainder */
+}
+
+static int sysctl_u2k_int_conv_hz(const bool *negp, const ulong *u_ptr, int *k_ptr)
+{
+ return proc_int_u2k_conv_uop(u_ptr, k_ptr, negp, mult_hz); /* user -> kernel value; mult_hz() is the unit callback */
+}
+
+static int sysctl_k2u_int_conv_hz(bool *negp, ulong *u_ptr, const int *k_ptr)
+{
+ return proc_int_k2u_conv_kop(u_ptr, k_ptr, negp, div_hz); /* kernel -> user value; div_hz() is the unit callback */
+}
+
+static int sysctl_u2k_int_conv_userhz(const bool *negp, const ulong *u_ptr, int *k_ptr)
+{
+ return proc_int_u2k_conv_uop(u_ptr, k_ptr, negp, clock_t_to_jiffies); /* user -> kernel value; clock_t_to_jiffies() is passed directly as the unit callback */
+}
+
+static ulong sysctl_jiffies_to_clock_t(const ulong val)
+{
+ return jiffies_to_clock_t(val); /* ulong -> ulong wrapper, handed to proc_int_k2u_conv_kop() by sysctl_k2u_int_conv_userhz() */
+}
+
+static int sysctl_k2u_int_conv_userhz(bool *negp, ulong *u_ptr, const int *k_ptr)
+{
+ return proc_int_k2u_conv_kop(u_ptr, k_ptr, negp, sysctl_jiffies_to_clock_t); /* kernel -> user value via the jiffies_to_clock_t() wrapper */
+}
+
+static ulong sysctl_msecs_to_jiffies(const ulong val)
+{
+ return msecs_to_jiffies(val); /* ulong -> ulong wrapper; NOTE(review): msecs_to_jiffies() prototype is not visible here - check whether the ulong argument is narrowed */
+}
+
+static int sysctl_u2k_int_conv_ms(const bool *negp, const ulong *u_ptr, int *k_ptr)
+{
+ return proc_int_u2k_conv_uop(u_ptr, k_ptr, negp, sysctl_msecs_to_jiffies); /* user -> kernel value via the msecs_to_jiffies() wrapper */
+}
+
+static ulong sysctl_jiffies_to_msecs(const ulong val)
+{
+ return jiffies_to_msecs(val); /* ulong -> ulong wrapper, handed to proc_int_k2u_conv_kop() by sysctl_k2u_int_conv_ms() */
+}
+
+static int sysctl_k2u_int_conv_ms(bool *negp, ulong *u_ptr, const int *k_ptr)
+{
+ return proc_int_k2u_conv_kop(u_ptr, k_ptr, negp, sysctl_jiffies_to_msecs); /* kernel -> user value via the jiffies_to_msecs() wrapper */
+}
+
+static int do_proc_int_conv_jiffies(bool *negp, ulong *u_ptr, int *k_ptr,
+ int dir, const struct ctl_table *tbl)
+{
+ return proc_int_conv(negp, u_ptr, k_ptr, dir, tbl, false,
+ sysctl_u2k_int_conv_hz, sysctl_k2u_int_conv_hz); /* HZ-scaled pair; converter used by proc_dointvec_jiffies() */
+}
+
+static int do_proc_int_conv_userhz_jiffies(bool *negp, ulong *u_ptr,
+ int *k_ptr, int dir,
+ const struct ctl_table *tbl)
+{
+ return proc_int_conv(negp, u_ptr, k_ptr, dir, tbl, false,
+ sysctl_u2k_int_conv_userhz,
+ sysctl_k2u_int_conv_userhz); /* clock_t <-> jiffies pair; converter used by proc_dointvec_userhz_jiffies() */
+}
+
+static int do_proc_int_conv_ms_jiffies(bool *negp, ulong *u_ptr, int *k_ptr,
+ int dir, const struct ctl_table *tbl)
+{
+ return proc_int_conv(negp, u_ptr, k_ptr, dir, tbl, false,
+ sysctl_u2k_int_conv_ms, sysctl_k2u_int_conv_ms); /* msecs <-> jiffies pair; converter used by proc_dointvec_ms_jiffies() */
+}
+
+/*
+ * Converter behind proc_dointvec_ms_jiffies_minmax(): uses the same
+ * millisecond <-> jiffies callbacks as do_proc_int_conv_ms_jiffies(), but
+ * requests the range check so the two handlers are not identical.
+ * NOTE(review): assumes the sixth proc_int_conv() argument is the flag that
+ * enables checking against table->extra1/extra2 - confirm in its definition.
+ */
+static int do_proc_int_conv_ms_jiffies_minmax(bool *negp, ulong *u_ptr,
+ int *k_ptr, int dir,
+ const struct ctl_table *tbl)
+{
+ return proc_int_conv(negp, u_ptr, k_ptr, dir, tbl, true,
+ sysctl_u2k_int_conv_ms, sysctl_k2u_int_conv_ms);
+}
+
+#else // !CONFIG_PROC_SYSCTL: every converter below is a stub that fails with -ENOSYS
+static int do_proc_int_conv_jiffies(bool *negp, ulong *u_ptr, int *k_ptr,
+ int dir, const struct ctl_table *tbl)
+{
+ return -ENOSYS;
+}
+
+static int do_proc_int_conv_userhz_jiffies(bool *negp, ulong *u_ptr,
+ int *k_ptr, int dir,
+ const struct ctl_table *tbl)
+{
+ return -ENOSYS;
+}
+
+static int do_proc_int_conv_ms_jiffies(bool *negp, ulong *u_ptr, int *k_ptr,
+ int dir, const struct ctl_table *tbl)
+{
+ return -ENOSYS;
+}
+
+static int do_proc_int_conv_ms_jiffies_minmax(bool *negp, ulong *u_ptr,
+ int *k_ptr, int dir,
+ const struct ctl_table *tbl)
+{
+ return -ENOSYS;
+}
+#endif /* CONFIG_PROC_SYSCTL */
+
+/**
+ * proc_dointvec_jiffies - read a vector of integers as seconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * The per-value conversion is done by do_proc_int_conv_jiffies(), which is
+ * handed to proc_dointvec_conv().
+ *
+ * Return: 0 on success.
+ */
+int proc_dointvec_jiffies(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_jiffies);
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
+ * are converted into jiffies.
+ *
+ * The per-value conversion is done by do_proc_int_conv_userhz_jiffies(),
+ * which is handed to proc_dointvec_conv().
+ *
+ * Return: 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_userhz_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as milliseconds
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: the current position in the file
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
+ * are converted into jiffies.
+ *
+ * The per-value conversion is done by do_proc_int_conv_ms_jiffies(), which
+ * is handed to proc_dointvec_conv().
+ *
+ * Return: 0 on success.
+ */
+int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_ms_jiffies);
+}
+EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
+
+int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_dointvec_conv(table, dir, buffer, lenp, ppos,
+ do_proc_int_conv_ms_jiffies_minmax); /* same entry point as proc_dointvec_ms_jiffies(), only the converter differs */
+} /* no EXPORT_SYMBOL() follows this handler, unlike its siblings in this hunk */
+
+/**
+ * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values
+ * @table: the sysctl table
+ * @dir: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long
+ * values from/to the user buffer, treated as an ASCII string. The values
+ * are treated as milliseconds, and converted to jiffies when they are stored.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max).
+ *
+ * Return: 0 on success.
+ */
+int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos,
+ HZ, 1000l); /* presumably the ms -> jiffies scale (multiply by HZ, divide by 1000) - confirm against proc_doulongvec_minmax_conv() */
+}
+EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
+
diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c
index 0775b9ec952a..4bca3f78c8ea 100644
--- a/kernel/time/namespace.c
+++ b/kernel/time/namespace.c
@@ -12,13 +12,15 @@
#include <linux/seq_file.h>
#include <linux/proc_ns.h>
#include <linux/export.h>
+#include <linux/nstree.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/err.h>
#include <linux/mm.h>
+#include <linux/cleanup.h>
-#include <vdso/datapage.h>
+#include "namespace_internal.h"
ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
struct timens_offsets *ns_offsets)
@@ -88,29 +90,27 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns,
goto fail;
err = -ENOMEM;
- ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT);
+ ns = kzalloc_obj(*ns, GFP_KERNEL_ACCOUNT);
if (!ns)
goto fail_dec;
- refcount_set(&ns->ns.count, 1);
-
- ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
- if (!ns->vvar_page)
+ err = timens_vdso_alloc_vvar_page(ns);
+ if (err)
goto fail_free;
- err = ns_alloc_inum(&ns->ns);
+ err = ns_common_init(ns);
if (err)
goto fail_free_page;
ns->ucounts = ucounts;
- ns->ns.ops = &timens_operations;
ns->user_ns = get_user_ns(user_ns);
ns->offsets = old_ns->offsets;
ns->frozen_offsets = false;
+ ns_tree_add(ns);
return ns;
fail_free_page:
- __free_page(ns->vvar_page);
+ timens_vdso_free_vvar_page(ns);
fail_free:
kfree(ns);
fail_dec:
@@ -130,7 +130,7 @@ fail:
*
* Return: timens_for_children namespace or ERR_PTR.
*/
-struct time_namespace *copy_time_ns(unsigned long flags,
+struct time_namespace *copy_time_ns(u64 flags,
struct user_namespace *user_ns, struct time_namespace *old_ns)
{
if (!(flags & CLONE_NEWTIME))
@@ -139,155 +139,47 @@ struct time_namespace *copy_time_ns(unsigned long flags,
return clone_time_ns(user_ns, old_ns);
}
-static struct timens_offset offset_from_ts(struct timespec64 off)
-{
- struct timens_offset ret;
-
- ret.sec = off.tv_sec;
- ret.nsec = off.tv_nsec;
-
- return ret;
-}
-
-/*
- * A time namespace VVAR page has the same layout as the VVAR page which
- * contains the system wide VDSO data.
- *
- * For a normal task the VVAR pages are installed in the normal ordering:
- * VVAR
- * PVCLOCK
- * HVCLOCK
- * TIMENS <- Not really required
- *
- * Now for a timens task the pages are installed in the following order:
- * TIMENS
- * PVCLOCK
- * HVCLOCK
- * VVAR
- *
- * The check for vdso_data->clock_mode is in the unlikely path of
- * the seq begin magic. So for the non-timens case most of the time
- * 'seq' is even, so the branch is not taken.
- *
- * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
- * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the
- * update to finish and for 'seq' to become even anyway.
- *
- * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which
- * enforces the time namespace handling path.
- */
-static void timens_setup_vdso_data(struct vdso_data *vdata,
- struct time_namespace *ns)
-{
- struct timens_offset *offset = vdata->offset;
- struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
- struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
-
- vdata->seq = 1;
- vdata->clock_mode = VDSO_CLOCKMODE_TIMENS;
- offset[CLOCK_MONOTONIC] = monotonic;
- offset[CLOCK_MONOTONIC_RAW] = monotonic;
- offset[CLOCK_MONOTONIC_COARSE] = monotonic;
- offset[CLOCK_BOOTTIME] = boottime;
- offset[CLOCK_BOOTTIME_ALARM] = boottime;
-}
-
-struct page *find_timens_vvar_page(struct vm_area_struct *vma)
-{
- if (likely(vma->vm_mm == current->mm))
- return current->nsproxy->time_ns->vvar_page;
-
- /*
- * VM_PFNMAP | VM_IO protect .fault() handler from being called
- * through interfaces like /proc/$pid/mem or
- * process_vm_{readv,writev}() as long as there's no .access()
- * in special_mapping_vmops().
- * For more details check_vma_flags() and __access_remote_vm()
- */
-
- WARN(1, "vvar_page accessed remotely");
-
- return NULL;
-}
-
-/*
- * Protects possibly multiple offsets writers racing each other
- * and tasks entering the namespace.
- */
-static DEFINE_MUTEX(offset_lock);
-
-static void timens_set_vvar_page(struct task_struct *task,
- struct time_namespace *ns)
-{
- struct vdso_data *vdata;
- unsigned int i;
-
- if (ns == &init_time_ns)
- return;
-
- /* Fast-path, taken by every task in namespace except the first. */
- if (likely(ns->frozen_offsets))
- return;
-
- mutex_lock(&offset_lock);
- /* Nothing to-do: vvar_page has been already initialized. */
- if (ns->frozen_offsets)
- goto out;
-
- ns->frozen_offsets = true;
- vdata = arch_get_vdso_data(page_address(ns->vvar_page));
-
- for (i = 0; i < CS_BASES; i++)
- timens_setup_vdso_data(&vdata[i], ns);
-
-out:
- mutex_unlock(&offset_lock);
-}
+DEFINE_MUTEX(timens_offset_lock);
void free_time_ns(struct time_namespace *ns)
{
+ ns_tree_remove(ns);
dec_time_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
- ns_free_inum(&ns->ns);
- __free_page(ns->vvar_page);
- kfree(ns);
-}
-
-static struct time_namespace *to_time_ns(struct ns_common *ns)
-{
- return container_of(ns, struct time_namespace, ns);
+ ns_common_free(ns);
+ timens_vdso_free_vvar_page(ns);
+ /* Concurrent nstree traversal depends on a grace period. */
+ kfree_rcu(ns, ns.ns_rcu);
}
static struct ns_common *timens_get(struct task_struct *task)
{
- struct time_namespace *ns = NULL;
+ struct time_namespace *ns;
struct nsproxy *nsproxy;
- task_lock(task);
+ guard(task_lock)(task);
nsproxy = task->nsproxy;
- if (nsproxy) {
- ns = nsproxy->time_ns;
- get_time_ns(ns);
- }
- task_unlock(task);
+ if (!nsproxy)
+ return NULL;
- return ns ? &ns->ns : NULL;
+ ns = nsproxy->time_ns;
+ get_time_ns(ns);
+ return &ns->ns;
}
static struct ns_common *timens_for_children_get(struct task_struct *task)
{
- struct time_namespace *ns = NULL;
+ struct time_namespace *ns;
struct nsproxy *nsproxy;
- task_lock(task);
+ guard(task_lock)(task);
nsproxy = task->nsproxy;
- if (nsproxy) {
- ns = nsproxy->time_ns_for_children;
- get_time_ns(ns);
- }
- task_unlock(task);
+ if (!nsproxy)
+ return NULL;
- return ns ? &ns->ns : NULL;
+ ns = nsproxy->time_ns_for_children;
+ get_time_ns(ns);
+ return &ns->ns;
}
static void timens_put(struct ns_common *ns)
@@ -295,12 +187,6 @@ static void timens_put(struct ns_common *ns)
put_time_ns(to_time_ns(ns));
}
-void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
-{
- timens_set_vvar_page(tsk, ns);
- vdso_join_timens(tsk, ns);
-}
-
static int timens_install(struct nsset *nsset, struct ns_common *new)
{
struct nsproxy *nsproxy = nsset->nsproxy;
@@ -364,36 +250,33 @@ static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts)
void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m)
{
- struct ns_common *ns;
- struct time_namespace *time_ns;
+ struct time_namespace *time_ns __free(time_ns) = NULL;
+ struct ns_common *ns = timens_for_children_get(p);
- ns = timens_for_children_get(p);
if (!ns)
return;
+
time_ns = to_time_ns(ns);
show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic);
show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime);
- put_time_ns(time_ns);
}
int proc_timens_set_offset(struct file *file, struct task_struct *p,
struct proc_timens_offset *offsets, int noffsets)
{
- struct ns_common *ns;
- struct time_namespace *time_ns;
+ struct time_namespace *time_ns __free(time_ns) = NULL;
+ struct ns_common *ns = timens_for_children_get(p);
struct timespec64 tp;
- int i, err;
+ int i;
- ns = timens_for_children_get(p);
if (!ns)
return -ESRCH;
+
time_ns = to_time_ns(ns);
- if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) {
- put_time_ns(time_ns);
+ if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME))
return -EPERM;
- }
for (i = 0; i < noffsets; i++) {
struct proc_timens_offset *off = &offsets[i];
@@ -406,15 +289,12 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
ktime_get_boottime_ts64(&tp);
break;
default:
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
- err = -ERANGE;
-
if (off->val.tv_sec > KTIME_SEC_MAX ||
off->val.tv_sec < -KTIME_SEC_MAX)
- goto out;
+ return -ERANGE;
tp = timespec64_add(tp, off->val);
/*
@@ -422,16 +302,13 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
* still unreachable.
*/
if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2)
- goto out;
+ return -ERANGE;
}
- mutex_lock(&offset_lock);
- if (time_ns->frozen_offsets) {
- err = -EACCES;
- goto out_unlock;
- }
+ guard(mutex)(&timens_offset_lock);
+ if (time_ns->frozen_offsets)
+ return -EACCES;
- err = 0;
/* Don't report errors after this line */
for (i = 0; i < noffsets; i++) {
struct proc_timens_offset *off = &offsets[i];
@@ -449,17 +326,11 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p,
*offset = off->val;
}
-out_unlock:
- mutex_unlock(&offset_lock);
-out:
- put_time_ns(time_ns);
-
- return err;
+ return 0;
}
const struct proc_ns_operations timens_operations = {
.name = "time",
- .type = CLONE_NEWTIME,
.get = timens_get,
.put = timens_put,
.install = timens_install,
@@ -469,7 +340,6 @@ const struct proc_ns_operations timens_operations = {
const struct proc_ns_operations timens_for_children_operations = {
.name = "time_for_children",
.real_ns_name = "time",
- .type = CLONE_NEWTIME,
.get = timens_for_children_get,
.put = timens_put,
.install = timens_install,
@@ -477,9 +347,12 @@ const struct proc_ns_operations timens_for_children_operations = {
};
struct time_namespace init_time_ns = {
- .ns.count = REFCOUNT_INIT(3),
+ .ns = NS_COMMON_INIT(init_time_ns),
.user_ns = &init_user_ns,
- .ns.inum = PROC_TIME_INIT_INO,
- .ns.ops = &timens_operations,
.frozen_offsets = true,
};
+
+void __init time_ns_init(void)
+{
+ ns_tree_add(&init_time_ns); /* the statically defined init_time_ns gets the same ns_tree_add() that clone_time_ns() does for new namespaces */
+}
diff --git a/kernel/time/namespace_internal.h b/kernel/time/namespace_internal.h
new file mode 100644
index 000000000000..b37ba179f43b
--- /dev/null
+++ b/kernel/time/namespace_internal.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _TIME_NAMESPACE_INTERNAL_H
+#define _TIME_NAMESPACE_INTERNAL_H
+
+#include <linux/mutex.h>
+
+struct time_namespace;
+
+/*
+ * Protects possibly multiple offsets writers racing each other
+ * and tasks entering the namespace.
+ */
+extern struct mutex timens_offset_lock;
+
+#ifdef CONFIG_TIME_NS_VDSO
+int timens_vdso_alloc_vvar_page(struct time_namespace *ns);
+void timens_vdso_free_vvar_page(struct time_namespace *ns);
+#else /* !CONFIG_TIME_NS_VDSO */
+static inline int timens_vdso_alloc_vvar_page(struct time_namespace *ns)
+{
+ return 0; /* !CONFIG_TIME_NS_VDSO stub: allocates nothing and reports success */
+}
+static inline void timens_vdso_free_vvar_page(struct time_namespace *ns)
+{ /* !CONFIG_TIME_NS_VDSO stub: the alloc stub allocates nothing, so there is nothing to free */
+}
+#endif /* CONFIG_TIME_NS_VDSO */
+
+#endif /* _TIME_NAMESPACE_INTERNAL_H */
diff --git a/kernel/time/namespace_vdso.c b/kernel/time/namespace_vdso.c
new file mode 100644
index 000000000000..0d74d160eec9
--- /dev/null
+++ b/kernel/time/namespace_vdso.c
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author: Andrei Vagin <avagin@openvz.org>
+ * Author: Dmitry Safonov <dima@arista.com>
+ */
+
+#include <linux/cleanup.h>
+#include <linux/mm.h>
+#include <linux/time_namespace.h>
+#include <linux/time.h>
+#include <linux/vdso_datastore.h>
+
+#include <vdso/clocksource.h>
+#include <vdso/datapage.h>
+
+#include "namespace_internal.h"
+
+static struct timens_offset offset_from_ts(struct timespec64 off)
+{
+ struct timens_offset ret;
+
+ ret.sec = off.tv_sec; /* field-by-field copy of the timespec64 into the timens_offset layout */
+ ret.nsec = off.tv_nsec;
+
+ return ret;
+}
+
+/*
+ * A time namespace VVAR page has the same layout as the VVAR page which
+ * contains the system wide VDSO data.
+ *
+ * For a normal task the VVAR pages are installed in the normal ordering:
+ * VVAR
+ * PVCLOCK
+ * HVCLOCK
+ * TIMENS <- Not really required
+ *
+ * Now for a timens task the pages are installed in the following order:
+ * TIMENS
+ * PVCLOCK
+ * HVCLOCK
+ * VVAR
+ *
+ * The check for vdso_clock->clock_mode is in the unlikely path of
+ * the seq begin magic. So for the non-timens case most of the time
+ * 'seq' is even, so the branch is not taken.
+ *
+ * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check
+ * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the
+ * update to finish and for 'seq' to become even anyway.
+ *
+ * The timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS,
+ * which enforces the time namespace handling path.
+ */
+static void timens_setup_vdso_clock_data(struct vdso_clock *vc,
+ struct time_namespace *ns)
+{
+ struct timens_offset *offset = vc->offset;
+ struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic);
+ struct timens_offset boottime = offset_from_ts(ns->offsets.boottime);
+
+ vc->seq = 1; /* odd 'seq': readers take the branch that checks clock_mode (see above) */
+ vc->clock_mode = VDSO_CLOCKMODE_TIMENS;
+ offset[CLOCK_MONOTONIC] = monotonic; /* all three MONOTONIC flavours share the monotonic offset */
+ offset[CLOCK_MONOTONIC_RAW] = monotonic;
+ offset[CLOCK_MONOTONIC_COARSE] = monotonic;
+ offset[CLOCK_BOOTTIME] = boottime; /* both BOOTTIME flavours share the boottime offset */
+ offset[CLOCK_BOOTTIME_ALARM] = boottime;
+}
+
+struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+ if (likely(vma->vm_mm == current->mm)) /* expected case: the VMA belongs to the calling task's own mm */
+ return current->nsproxy->time_ns->vvar_page;
+
+ /*
+ * VM_PFNMAP | VM_IO protect .fault() handler from being called
+ * through interfaces like /proc/$pid/mem or
+ * process_vm_{readv,writev}() as long as there's no .access()
+ * in special_mapping_vmops().
+ * For more details check_vma_flags() and __access_remote_vm()
+ */
+
+ WARN(1, "vvar_page accessed remotely");
+
+ return NULL; /* remote mm: warn and hand back no page */
+}
+
+static void timens_set_vvar_page(struct task_struct *task, /* @task is not referenced in this function */
+ struct time_namespace *ns)
+{
+ struct vdso_time_data *vdata;
+ struct vdso_clock *vc;
+ unsigned int i;
+
+ if (ns == &init_time_ns)
+ return;
+
+ /* Fast-path, taken by every task in namespace except the first. */
+ if (likely(ns->frozen_offsets))
+ return;
+
+ guard(mutex)(&timens_offset_lock);
+ /* Nothing to do: vvar_page has already been initialized. */
+ if (ns->frozen_offsets)
+ return;
+
+ ns->frozen_offsets = true; /* once set, proc_timens_set_offset() rejects changes with -EACCES */
+ vdata = page_address(ns->vvar_page);
+ vc = vdata->clock_data;
+
+ for (i = 0; i < CS_BASES; i++)
+ timens_setup_vdso_clock_data(&vc[i], ns);
+
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) {
+ for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++)
+ timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns);
+ }
+}
+
+/*
+ * The vvar page layout depends on whether a task belongs to the root or
+ * non-root time namespace. Whenever a task changes its namespace, the VVAR
+ * page tables are cleared and then they will be re-faulted with a
+ * corresponding layout.
+ * See also the comment near timens_setup_vdso_clock_data() for details.
+ *
+ * @ns is not referenced here, and the function always returns 0.
+ */
+static int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+ struct mm_struct *mm = task->mm;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ guard(mmap_read_lock)(mm);
+ for_each_vma(vmi, vma) {
+ if (vma_is_special_mapping(vma, &vdso_vvar_mapping)) /* only the vvar special mapping is zapped */
+ zap_vma(vma);
+ }
+ return 0;
+}
+
+void timens_commit(struct task_struct *tsk, struct time_namespace *ns)
+{
+ timens_set_vvar_page(tsk, ns); /* fill in the namespace's vvar page first (no-op once offsets are frozen) */
+ vdso_join_timens(tsk, ns); /* return value ignored; the function only ever returns 0 */
+}
+
+int timens_vdso_alloc_vvar_page(struct time_namespace *ns)
+{
+ ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); /* one zeroed page, stored in ns->vvar_page */
+ if (!ns->vvar_page)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void timens_vdso_free_vvar_page(struct time_namespace *ns)
+{
+ __free_page(ns->vvar_page); /* releases the page set up by timens_vdso_alloc_vvar_page(); ns->vvar_page is not cleared */
+}
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 163e7a2033b6..97fa99b96dd0 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -18,6 +18,7 @@
#include <linux/module.h>
#include <linux/rtc.h>
#include <linux/audit.h>
+#include <linux/timekeeper_internal.h>
#include "ntp_internal.h"
#include "timekeeping_internal.h"
@@ -86,14 +87,16 @@ struct ntp_data {
#endif
};
-static struct ntp_data tk_ntp_data = {
- .tick_usec = USER_TICK_USEC,
- .time_state = TIME_OK,
- .time_status = STA_UNSYNC,
- .time_constant = 2,
- .time_maxerror = NTP_PHASE_LIMIT,
- .time_esterror = NTP_PHASE_LIMIT,
- .ntp_next_leap_sec = TIME64_MAX,
+static struct ntp_data tk_ntp_data[TIMEKEEPERS_MAX] = {
+ [ 0 ... TIMEKEEPERS_MAX - 1 ] = {
+ .tick_usec = USER_TICK_USEC,
+ .time_state = TIME_OK,
+ .time_status = STA_UNSYNC,
+ .time_constant = 2,
+ .time_maxerror = NTP_PHASE_LIMIT,
+ .time_esterror = NTP_PHASE_LIMIT,
+ .ntp_next_leap_sec = TIME64_MAX,
+ },
};
#define SECS_PER_DAY 86400
@@ -300,7 +303,7 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset)
* Select how the frequency is to be controlled
* and in which mode (PLL or FLL).
*/
- real_secs = __ktime_get_real_seconds();
+ real_secs = ktime_get_ntp_seconds(ntpdata - tk_ntp_data);
secs = (long)(real_secs - ntpdata->time_reftime);
if (unlikely(ntpdata->time_status & STA_FREQHOLD))
secs = 0;
@@ -348,33 +351,38 @@ static void __ntp_clear(struct ntp_data *ntpdata)
/**
* ntp_clear - Clears the NTP state variables
+ * @tkid: Timekeeper ID to be able to select proper ntp data array member
*/
-void ntp_clear(void)
+void ntp_clear(unsigned int tkid)
{
- __ntp_clear(&tk_ntp_data);
+ __ntp_clear(&tk_ntp_data[tkid]);
}
-u64 ntp_tick_length(void)
+u64 ntp_tick_length(unsigned int tkid)
{
- return tk_ntp_data.tick_length;
+ return tk_ntp_data[tkid].tick_length;
}
/**
* ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
+ * @tkid: Timekeeper ID
*
- * Provides the time of the next leapsecond against CLOCK_REALTIME in
- * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
+ * Returns: For @tkid == TIMEKEEPER_CORE this provides the time of the next
+ * leap second against CLOCK_REALTIME in a ktime_t format if a
+ * leap second is pending. KTIME_MAX otherwise.
*/
-ktime_t ntp_get_next_leap(void)
+ktime_t ntp_get_next_leap(unsigned int tkid)
{
- struct ntp_data *ntpdata = &tk_ntp_data;
- ktime_t ret;
+ struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE];
+
+ if (tkid != TIMEKEEPER_CORE)
+ return KTIME_MAX;
if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS))
return ktime_set(ntpdata->ntp_next_leap_sec, 0);
- ret = KTIME_MAX;
- return ret;
+
+ return KTIME_MAX;
}
/*
@@ -387,9 +395,9 @@ ktime_t ntp_get_next_leap(void)
*
* Also handles leap second processing, and returns leap offset
*/
-int second_overflow(time64_t secs)
+int second_overflow(unsigned int tkid, time64_t secs)
{
- struct ntp_data *ntpdata = &tk_ntp_data;
+ struct ntp_data *ntpdata = &tk_ntp_data[tkid];
s64 delta;
int leap = 0;
s32 rem;
@@ -605,7 +613,7 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns
*/
static inline bool ntp_synced(void)
{
- return !(tk_ntp_data.time_status & STA_UNSYNC);
+ return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC);
}
/*
@@ -678,8 +686,7 @@ void ntp_notify_cmos_timer(bool offset_set)
static void __init ntp_init_cmos_sync(void)
{
- hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
- sync_hrtimer.function = sync_timer_callback;
+ hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS);
}
#else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */
static inline void __init ntp_init_cmos_sync(void) { }
@@ -703,7 +710,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k
* reference time to current time.
*/
if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL))
- ntpdata->time_reftime = __ktime_get_real_seconds();
+ ntpdata->time_reftime = ktime_get_ntp_seconds(ntpdata - tk_ntp_data);
/* only set allowed bits */
ntpdata->time_status &= STA_RONLY;
@@ -760,10 +767,10 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct
* adjtimex() mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. used by xntpd.
*/
-int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts,
- s32 *time_tai, struct audit_ntp_data *ad)
+int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai, struct audit_ntp_data *ad)
{
- struct ntp_data *ntpdata = &tk_ntp_data;
+ struct ntp_data *ntpdata = &tk_ntp_data[tkid];
int result;
if (txc->modes & ADJ_ADJTIME) {
@@ -1032,8 +1039,8 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error)
*/
void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
+ struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE];
struct pps_normtime pts_norm, freq_norm;
- struct ntp_data *ntpdata = &tk_ntp_data;
pts_norm = pps_normalize_ts(*phase_ts);
@@ -1084,18 +1091,18 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t
static int __init ntp_tick_adj_setup(char *str)
{
- int rc = kstrtos64(str, 0, &tk_ntp_data.ntp_tick_adj);
+ int rc = kstrtos64(str, 0, &tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj);
if (rc)
return rc;
- tk_ntp_data.ntp_tick_adj <<= NTP_SCALE_SHIFT;
+ tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj <<= NTP_SCALE_SHIFT;
return 1;
}
-
__setup("ntp_tick_adj=", ntp_tick_adj_setup);
void __init ntp_init(void)
{
- ntp_clear();
+ for (int id = 0; id < TIMEKEEPERS_MAX; id++)
+ __ntp_clear(tk_ntp_data + id);
ntp_init_cmos_sync();
}
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index 5a633dce9057..7084d839c207 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -3,14 +3,13 @@
#define _LINUX_NTP_INTERNAL_H
extern void ntp_init(void);
-extern void ntp_clear(void);
+extern void ntp_clear(unsigned int tkid);
/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
-extern u64 ntp_tick_length(void);
-extern ktime_t ntp_get_next_leap(void);
-extern int second_overflow(time64_t secs);
-extern int __do_adjtimex(struct __kernel_timex *txc,
- const struct timespec64 *ts,
- s32 *time_tai, struct audit_ntp_data *ad);
+extern u64 ntp_tick_length(unsigned int tkid);
+extern ktime_t ntp_get_next_leap(unsigned int tkid);
+extern int second_overflow(unsigned int tkid, time64_t secs);
+extern int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts,
+ s32 *time_tai, struct audit_ntp_data *ad);
extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts);
#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC)
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index 1af0bb2cc45c..dab37295c8c2 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp,
return err;
}
-#ifdef CONFIG_COMPAT
-static long posix_clock_compat_ioctl(struct file *fp,
- unsigned int cmd, unsigned long arg)
-{
- struct posix_clock_context *pccontext = fp->private_data;
- struct posix_clock *clk = get_posix_clock(fp);
- int err = -ENOTTY;
-
- if (!clk)
- return -ENODEV;
-
- if (clk->ops.ioctl)
- err = clk->ops.ioctl(pccontext, cmd, arg);
-
- put_posix_clock(clk);
-
- return err;
-}
-#endif
-
static int posix_clock_open(struct inode *inode, struct file *fp)
{
int err;
@@ -123,12 +103,13 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = -ENODEV;
goto out;
}
- pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL);
+ pccontext = kzalloc_obj(*pccontext);
if (!pccontext) {
err = -ENOMEM;
goto out;
}
pccontext->clk = clk;
+ pccontext->fp = fp;
if (clk->ops.open) {
err = clk->ops.open(pccontext, fp->f_mode);
if (err) {
@@ -171,11 +152,9 @@ static const struct file_operations posix_clock_file_operations = {
.read = posix_clock_read,
.poll = posix_clock_poll,
.unlocked_ioctl = posix_clock_ioctl,
+ .compat_ioctl = posix_clock_ioctl,
.open = posix_clock_open,
.release = posix_clock_release,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = posix_clock_compat_ioctl,
-#endif
};
int posix_clock_register(struct posix_clock *clk, struct device *dev)
@@ -251,7 +230,7 @@ static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx)
if (err)
return err;
- if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+ if (tx->modes && (cd.fp->f_mode & FMODE_WRITE) == 0) {
err = -EACCES;
goto out;
}
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 50e8d04ab661..0de2bb7cbec0 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1406,6 +1406,15 @@ void run_posix_cpu_timers(void)
lockdep_assert_irqs_disabled();
/*
+ * Ensure that release_task(tsk) can't happen while
+ * handle_posix_cpu_timers() is running. Otherwise, a concurrent
+ * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and
+ * miss timer->it.cpu.firing != 0.
+ */
+ if (tsk->exit_state)
+ return;
+
+ /*
* If the actual expiry is deferred to task work context and the
* work is already scheduled there is no point to do anything here.
*/
@@ -1548,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
* Report back to the user the time still remaining.
*/
restart = &current->restart_block;
- restart->nanosleep.expires = expires;
+ restart->nanosleep.expires = ns_to_ktime(expires);
if (restart->nanosleep.type != TT_NONE)
error = nanosleep_copyout(restart, &it.it_value);
}
@@ -1590,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
clockid_t which_clock = restart_block->nanosleep.clockid;
struct timespec64 t;
- t = ns_to_timespec64(restart_block->nanosleep.expires);
+ t = ktime_to_timespec64(restart_block->nanosleep.expires);
return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t);
}
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 1b675aee99a9..9331e1614124 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -9,34 +9,27 @@
*
* These are all the functions necessary to implement POSIX clocks & timers
*/
-#include <linux/mm.h>
+#include <linux/compat.h>
+#include <linux/compiler.h>
+#include <linux/init.h>
+#include <linux/jhash.h>
#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/time.h>
-#include <linux/mutex.h>
-#include <linux/sched/task.h>
-
-#include <linux/uaccess.h>
#include <linux/list.h>
-#include <linux/init.h>
-#include <linux/compiler.h>
-#include <linux/hash.h>
+#include <linux/memblock.h>
+#include <linux/nospec.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
+#include <linux/prctl.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
#include <linux/syscalls.h>
-#include <linux/wait.h>
-#include <linux/workqueue.h>
-#include <linux/export.h>
-#include <linux/hashtable.h>
-#include <linux/compat.h>
-#include <linux/nospec.h>
+#include <linux/time.h>
#include <linux/time_namespace.h>
+#include <linux/uaccess.h>
#include "timekeeping.h"
#include "posix-timers.h"
-static struct kmem_cache *posix_timers_cache;
-
/*
* Timers are managed in a hash table for lockless lookup. The hash key is
* constructed from current::signal and the timer ID and the timer is
@@ -46,39 +39,60 @@ static struct kmem_cache *posix_timers_cache;
* This allows checkpoint/restore to reconstruct the exact timer IDs for
* a process.
*/
-static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
-static DEFINE_SPINLOCK(hash_lock);
+struct timer_hash_bucket {
+ spinlock_t lock;
+ struct hlist_head head;
+};
+
+static struct {
+ struct timer_hash_bucket *buckets;
+ unsigned long mask;
+ struct kmem_cache *cache;
+} __timer_data __ro_after_init __aligned(4*sizeof(long));
+
+#define timer_buckets (__timer_data.buckets)
+#define timer_hashmask (__timer_data.mask)
+#define posix_timers_cache (__timer_data.cache)
static const struct k_clock * const posix_clocks[];
static const struct k_clock *clockid_to_kclock(const clockid_t id);
static const struct k_clock clock_realtime, clock_monotonic;
+#define TIMER_ANY_ID INT_MIN
+
/* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */
#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \
~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD))
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif
-static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
+static struct k_itimer *lock_timer(timer_t timer_id);
+static inline void unlock_timer(struct k_itimer *timr)
+{
+ if (likely((timr)))
+ spin_unlock_irq(&timr->it_lock);
+}
+
+#define scoped_timer_get_or_fail(_id) \
+ scoped_cond_guard(lock_timer, return -EINVAL, _id)
-#define lock_timer(tid, flags) \
-({ struct k_itimer *__timr; \
- __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \
- __timr; \
-})
+#define scoped_timer (scope)
-static int hash(struct signal_struct *sig, unsigned int nr)
+DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), lock_timer(id), timer_t id);
+DEFINE_CLASS_IS_COND_GUARD(lock_timer);
+
+static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr)
{
- return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
+ return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask];
}
-static struct k_itimer *__posix_timers_find(struct hlist_head *head,
- struct signal_struct *sig,
- timer_t id)
+static struct k_itimer *posix_timer_by_id(timer_t id)
{
+ struct signal_struct *sig = current->signal;
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
struct k_itimer *timer;
- hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) {
+ hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) {
/* timer->it_signal can be set concurrently */
if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id))
return timer;
@@ -86,46 +100,88 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head,
return NULL;
}
-static struct k_itimer *posix_timer_by_id(timer_t id)
+static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer)
{
- struct signal_struct *sig = current->signal;
- struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
+ unsigned long val = (unsigned long)timer->it_signal;
- return __posix_timers_find(head, sig, id);
+ /*
+ * Mask out bit 0, which acts as an invalid marker to prevent
+ * posix_timer_by_id() from detecting it as valid.
+ */
+ return (struct signal_struct *)(val & ~1UL);
}
-static int posix_timer_add(struct k_itimer *timer)
+static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig,
+ timer_t id)
{
- struct signal_struct *sig = current->signal;
- struct hlist_head *head;
- unsigned int cnt, id;
+ struct hlist_head *head = &bucket->head;
+ struct k_itimer *timer;
- /*
- * FIXME: Replace this by a per signal struct xarray once there is
- * a plan to handle the resulting CRIU regression gracefully.
- */
- for (cnt = 0; cnt <= INT_MAX; cnt++) {
- spin_lock(&hash_lock);
- id = sig->next_posix_timer_id;
+ hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) {
+ if ((posix_sig_owner(timer) == sig) && (timer->it_id == id))
+ return true;
+ }
+ return false;
+}
- /* Write the next ID back. Clamp it to the positive space */
- sig->next_posix_timer_id = (id + 1) & INT_MAX;
+static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id)
+{
+ struct timer_hash_bucket *bucket = hash_bucket(sig, id);
- head = &posix_timers_hashtable[hash(sig, id)];
- if (!__posix_timers_find(head, sig, id)) {
- hlist_add_head_rcu(&timer->t_hash, head);
- spin_unlock(&hash_lock);
- return id;
+ scoped_guard (spinlock, &bucket->lock) {
+ /*
+ * Validate under the lock as this could have raced against
+ * another thread ending up with the same ID, which is
+ * highly unlikely, but possible.
+ */
+ if (!posix_timer_hashed(bucket, sig, id)) {
+ /*
+ * Set the timer ID and the signal pointer to make
+ * it identifiable in the hash table. The signal
+ * pointer has bit 0 set to indicate that it is not
+ * yet fully initialized. posix_timer_hashed()
+ * masks this bit out, but the syscall lookup fails
+ * to match due to it being set. This guarantees
+ * that there can't be duplicate timer IDs handed
+ * out.
+ */
+ timer->it_id = (timer_t)id;
+ timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL);
+ hlist_add_head_rcu(&timer->t_hash, &bucket->head);
+ return true;
}
- spin_unlock(&hash_lock);
}
- /* POSIX return code when no timer ID could be allocated */
- return -EAGAIN;
+ return false;
}
-static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
+static int posix_timer_add(struct k_itimer *timer, int req_id)
{
- spin_unlock_irqrestore(&timr->it_lock, flags);
+ struct signal_struct *sig = current->signal;
+
+ if (unlikely(req_id != TIMER_ANY_ID)) {
+ if (!posix_timer_add_at(timer, sig, req_id))
+ return -EBUSY;
+
+ /*
+ * Move the ID counter past the requested ID, so that after
+ * switching back to normal mode the IDs are outside of the
+ * exact allocated region. That avoids ID collisions on the
+ * next regular timer_create() invocations.
+ */
+ atomic_set(&sig->next_posix_timer_id, req_id + 1);
+ return req_id;
+ }
+
+ for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) {
+ /* Get the next timer ID and clamp it to positive space */
+ unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX;
+
+ if (posix_timer_add_at(timer, sig, id))
+ return id;
+ cond_resched();
+ }
+ /* POSIX return code when no timer ID could be allocated */
+ return -EAGAIN;
}
static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp)
@@ -220,15 +276,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp)
return 0;
}
-static __init int init_posix_timers(void)
-{
- posix_timers_cache = kmem_cache_create("posix_timers_cache",
- sizeof(struct k_itimer), 0,
- SLAB_PANIC | SLAB_ACCOUNT, NULL);
- return 0;
-}
-__initcall(init_posix_timers);
-
/*
* The siginfo si_overrun field and the return value of timer_getoverrun(2)
* are of type int. Clamp the overrun value to INT_MAX
@@ -245,8 +292,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr)
{
struct hrtimer *timer = &timr->it.real.timer;
- timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(),
- timr->it_interval);
+ timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval);
hrtimer_restart(timer);
}
@@ -259,7 +305,7 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it
* since the signal was queued. In either case, don't rearm and
* drop the signal.
*/
- if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal))
+ if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr)))
return false;
if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING))
@@ -304,6 +350,9 @@ void posix_timer_queue_signal(struct k_itimer *timr)
{
lockdep_assert_held(&timr->it_lock);
+ if (!posixtimer_valid(timr))
+ return;
+
timr->it_status = timr->it_interval ? POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED;
posixtimer_send_sigqueue(timr);
}
@@ -324,6 +373,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+long posixtimer_create_prctl(unsigned long ctrl)
+{
+ switch (ctrl) {
+ case PR_TIMER_CREATE_RESTORE_IDS_OFF:
+ current->signal->timer_create_restore_ids = 0;
+ return 0;
+ case PR_TIMER_CREATE_RESTORE_IDS_ON:
+ current->signal->timer_create_restore_ids = 1;
+ return 0;
+ case PR_TIMER_CREATE_RESTORE_IDS_GET:
+ return current->signal->timer_create_restore_ids;
+ }
+ return -EINVAL;
+}
+
static struct pid *good_sigevent(sigevent_t * event)
{
struct pid *pid = task_tgid(current);
@@ -350,8 +414,12 @@ static struct pid *good_sigevent(sigevent_t * event)
static struct k_itimer *alloc_posix_timer(void)
{
- struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
+ struct k_itimer *tmr;
+ if (unlikely(!posix_timers_cache))
+ return NULL;
+
+ tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL);
if (!tmr)
return tmr;
@@ -373,15 +441,16 @@ void posixtimer_free_timer(struct k_itimer *tmr)
static void posix_timer_unhash_and_free(struct k_itimer *tmr)
{
- spin_lock(&hash_lock);
- hlist_del_rcu(&tmr->t_hash);
- spin_unlock(&hash_lock);
+ struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id);
+
+ scoped_guard (spinlock, &bucket->lock)
+ hlist_del_rcu(&tmr->t_hash);
posixtimer_putref(tmr);
}
static int common_timer_create(struct k_itimer *new_timer)
{
- hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
+ hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0);
return 0;
}
@@ -390,6 +459,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
timer_t __user *created_timer_id)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
+ timer_t req_id = TIMER_ANY_ID;
struct k_itimer *new_timer;
int error, new_timer_id;
@@ -398,6 +468,15 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
if (!kc->timer_create)
return -EOPNOTSUPP;
+ /* Special case for CRIU to restore timers with a given timer ID. */
+ if (unlikely(current->signal->timer_create_restore_ids)) {
+ if (copy_from_user(&req_id, created_timer_id, sizeof(req_id)))
+ return -EFAULT;
+ /* Valid IDs are 0..INT_MAX */
+ if ((unsigned int)req_id > INT_MAX)
+ return -EINVAL;
+ }
+
new_timer = alloc_posix_timer();
if (unlikely(!new_timer))
return -EAGAIN;
@@ -406,24 +485,21 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
/*
* Add the timer to the hash table. The timer is not yet valid
- * because new_timer::it_signal is still NULL. The timer id is also
- * not yet visible to user space.
+ * after insertion, but has a unique ID allocated.
*/
- new_timer_id = posix_timer_add(new_timer);
+ new_timer_id = posix_timer_add(new_timer, req_id);
if (new_timer_id < 0) {
posixtimer_free_timer(new_timer);
return new_timer_id;
}
- new_timer->it_id = (timer_t) new_timer_id;
new_timer->it_clock = which_clock;
new_timer->kclock = kc;
new_timer->it_overrun = -1LL;
if (event) {
- rcu_read_lock();
- new_timer->it_pid = get_pid(good_sigevent(event));
- rcu_read_unlock();
+ scoped_guard (rcu)
+ new_timer->it_pid = get_pid(good_sigevent(event));
if (!new_timer->it_pid) {
error = -EINVAL;
goto out;
@@ -434,7 +510,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
} else {
new_timer->it_sigev_notify = SIGEV_SIGNAL;
new_timer->sigq.info.si_signo = SIGALRM;
- memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t));
new_timer->sigq.info.si_value.sival_int = new_timer->it_id;
new_timer->it_pid = get_pid(task_tgid(current));
}
@@ -452,8 +527,8 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
goto out;
}
/*
- * After succesful copy out, the timer ID is visible to user space
- * now but not yet valid because new_timer::signal is still NULL.
+ * After successful copy out, the timer ID is visible to user space
+ * now but not yet valid because bit 0 of new_timer::it_signal is set.
*
* Complete the initialization with the clock specific create
* callback.
@@ -462,14 +537,25 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event,
if (error)
goto out;
- spin_lock_irq(&current->sighand->siglock);
- /* This makes the timer valid in the hash table */
- WRITE_ONCE(new_timer->it_signal, current->signal);
- hlist_add_head(&new_timer->list, &current->signal->posix_timers);
- spin_unlock_irq(&current->sighand->siglock);
/*
- * After unlocking sighand::siglock @new_timer is subject to
- * concurrent removal and cannot be touched anymore
+ * timer::it_lock ensures that lock_timer() observes a fully
+ * initialized timer when it observes a valid timer::it_signal.
+ *
+ * sighand::siglock is required to protect signal::posix_timers.
+ */
+ scoped_guard (spinlock_irq, &new_timer->it_lock) {
+ guard(spinlock)(&current->sighand->siglock);
+ /*
+ * new_timer::it_signal contains the signal pointer with
+ * bit 0 set, which makes it invalid for syscall operations.
+ * Store the unmodified signal pointer to make it valid.
+ */
+ WRITE_ONCE(new_timer->it_signal, current->signal);
+ hlist_add_head_rcu(&new_timer->list, &current->signal->posix_timers);
+ }
+ /*
+ * After unlocking, @new_timer is subject to concurrent removal and
+ * cannot be touched anymore.
*/
return 0;
out:
@@ -507,7 +593,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock,
}
#endif
-static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
+static struct k_itimer *lock_timer(timer_t timer_id)
{
struct k_itimer *timr;
@@ -522,11 +608,11 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
* The hash lookup and the timers are RCU protected.
*
* Timers are added to the hash in invalid state where
- * timr::it_signal == NULL. timer::it_signal is only set after the
- * rest of the initialization succeeded.
+ * timr::it_signal is marked invalid. timr::it_signal is only made
+ * valid after the rest of the initialization succeeded.
*
* Timer destruction happens in steps:
- * 1) Set timr::it_signal to NULL with timr::it_lock held
+ * 1) Mark timr::it_signal invalid with timr::it_lock held
* 2) Release timr::it_lock
* 3) Remove from the hash under hash_lock
* 4) Put the reference count.
@@ -543,25 +629,21 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
*
* The lookup validates locklessly that timr::it_signal ==
* current::it_signal and timr::it_id == @timer_id. timr::it_id
- * can't change, but timr::it_signal becomes NULL during
- * destruction.
+ * can't change, but timr::it_signal can become invalid during
+ * destruction, which makes the locked check fail.
*/
- rcu_read_lock();
+ guard(rcu)();
timr = posix_timer_by_id(timer_id);
if (timr) {
- spin_lock_irqsave(&timr->it_lock, *flags);
+ spin_lock_irq(&timr->it_lock);
/*
* Validate under timr::it_lock that timr::it_signal is
* still valid. Pairs with #1 above.
*/
- if (timr->it_signal == current->signal) {
- rcu_read_unlock();
+ if (timr->it_signal == current->signal)
return timr;
- }
- spin_unlock_irqrestore(&timr->it_lock, *flags);
+ spin_unlock_irq(&timr->it_lock);
}
- rcu_read_unlock();
-
return NULL;
}
@@ -652,24 +734,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting)
static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
{
- const struct k_clock *kc;
- struct k_itimer *timr;
- unsigned long flags;
- int ret = 0;
-
- timr = lock_timer(timer_id, &flags);
- if (!timr)
- return -EINVAL;
-
memset(setting, 0, sizeof(*setting));
- kc = timr->kclock;
- if (WARN_ON_ONCE(!kc || !kc->timer_get))
- ret = -EINVAL;
- else
- kc->timer_get(timr, setting);
-
- unlock_timer(timr, flags);
- return ret;
+ scoped_timer_get_or_fail(timer_id)
+ scoped_timer->kclock->timer_get(scoped_timer, setting);
+ return 0;
}
/* Get the time remaining on a POSIX.1b interval timer. */
@@ -723,18 +791,8 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id,
*/
SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id)
{
- struct k_itimer *timr;
- unsigned long flags;
- int overrun;
-
- timr = lock_timer(timer_id, &flags);
- if (!timr)
- return -EINVAL;
-
- overrun = timer_overrun_to_int(timr);
- unlock_timer(timr, flags);
-
- return overrun;
+ scoped_timer_get_or_fail(timer_id)
+ return timer_overrun_to_int(scoped_timer);
}
static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
@@ -747,7 +805,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
/*
* Posix magic: Relative CLOCK_REALTIME timers are not affected by
* clock modifications, so they become CLOCK_MONOTONIC based under the
- * hood. See hrtimer_init(). Update timr->kclock, so the generic
+ * hood. See hrtimer_setup(). Update timr->kclock, so the generic
* functions which use timr->kclock->clock_get_*() work.
*
* Note: it_clock stays unmodified, because the next timer_set() might
@@ -756,11 +814,10 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires,
if (timr->it_clock == CLOCK_REALTIME)
timr->kclock = absolute ? &clock_realtime : &clock_monotonic;
- hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
- timr->it.real.timer.function = posix_timer_fn;
+ hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode);
if (!absolute)
- expires = ktime_add_safe(expires, timer->base->get_time());
+ expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer));
hrtimer_set_expires(timer, expires);
if (!sigev_none)
@@ -791,26 +848,13 @@ static void common_timer_wait_running(struct k_itimer *timer)
* when the task which tries to delete or disarm the timer has preempted
* the task which runs the expiry in task work context.
*/
-static struct k_itimer *timer_wait_running(struct k_itimer *timer,
- unsigned long *flags)
+static void timer_wait_running(struct k_itimer *timer)
{
- const struct k_clock *kc = READ_ONCE(timer->kclock);
- timer_t timer_id = READ_ONCE(timer->it_id);
-
- /* Prevent kfree(timer) after dropping the lock */
- rcu_read_lock();
- unlock_timer(timer, *flags);
-
/*
* kc->timer_wait_running() might drop RCU lock. So @timer
* cannot be touched anymore after the function returns!
*/
- if (!WARN_ON_ONCE(!kc->timer_wait_running))
- kc->timer_wait_running(timer);
-
- rcu_read_unlock();
- /* Relock the timer. It might be not longer hashed. */
- return lock_timer(timer_id, flags);
+ timer->kclock->timer_wait_running(timer);
}
/*
@@ -865,15 +909,9 @@ int common_timer_set(struct k_itimer *timr, int flags,
return 0;
}
-static int do_timer_settime(timer_t timer_id, int tmr_flags,
- struct itimerspec64 *new_spec64,
+static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64,
struct itimerspec64 *old_spec64)
{
- const struct k_clock *kc;
- struct k_itimer *timr;
- unsigned long flags;
- int error;
-
if (!timespec64_valid(&new_spec64->it_interval) ||
!timespec64_valid(&new_spec64->it_value))
return -EINVAL;
@@ -881,33 +919,28 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags,
if (old_spec64)
memset(old_spec64, 0, sizeof(*old_spec64));
- timr = lock_timer(timer_id, &flags);
-retry:
- if (!timr)
- return -EINVAL;
+ for (; ; old_spec64 = NULL) {
+ struct k_itimer *timr;
- if (old_spec64)
- old_spec64->it_interval = ktime_to_timespec64(timr->it_interval);
+ scoped_timer_get_or_fail(timer_id) {
+ timr = scoped_timer;
- /* Prevent signal delivery and rearming. */
- timr->it_signal_seq++;
+ if (old_spec64)
+ old_spec64->it_interval = ktime_to_timespec64(timr->it_interval);
- kc = timr->kclock;
- if (WARN_ON_ONCE(!kc || !kc->timer_set))
- error = -EINVAL;
- else
- error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64);
-
- if (error == TIMER_RETRY) {
- // We already got the old time...
- old_spec64 = NULL;
- /* Unlocks and relocks the timer if it still exists */
- timr = timer_wait_running(timr, &flags);
- goto retry;
- }
- unlock_timer(timr, flags);
+ /* Prevent signal delivery and rearming. */
+ timr->it_signal_seq++;
- return error;
+ int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64);
+ if (ret != TIMER_RETRY)
+ return ret;
+
+ /* Protect the timer from being freed when leaving the lock scope */
+ rcu_read_lock();
+ }
+ timer_wait_running(timr);
+ rcu_read_unlock();
+ }
}
/* Set a POSIX.1b interval timer */
@@ -978,110 +1011,58 @@ static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr)
}
}
-static inline int timer_delete_hook(struct k_itimer *timer)
+static void posix_timer_delete(struct k_itimer *timer)
{
- const struct k_clock *kc = timer->kclock;
-
- /* Prevent signal delivery and rearming. */
+ /*
+ * Invalidate the timer, remove it from the linked list and remove
+ * it from the ignored list if pending.
+ *
+ * The invalidation must be written with siglock held so that the
+ * signal code observes the invalidated timer::it_signal in
+ * do_sigaction(), which prevents it from moving a pending signal
+ * of a deleted timer to the ignore list.
+ *
+ * The invalidation also prevents signal queueing, signal delivery
+ * and therefore rearming from the signal delivery path.
+ *
+ * A concurrent lookup can still find the timer in the hash, but it
+ * will check timer::it_signal with timer::it_lock held and observe
+ * bit 0 set, which invalidates it. That also prevents the timer ID
+ * from being handed out before this timer is completely gone.
+ */
timer->it_signal_seq++;
- if (WARN_ON_ONCE(!kc || !kc->timer_del))
- return -EINVAL;
- return kc->timer_del(timer);
+ scoped_guard (spinlock, &current->sighand->siglock) {
+ unsigned long sig = (unsigned long)timer->it_signal | 1UL;
+
+ WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig);
+ hlist_del_rcu(&timer->list);
+ posix_timer_cleanup_ignored(timer);
+ }
+
+ while (timer->kclock->timer_del(timer) == TIMER_RETRY) {
+ guard(rcu)();
+ spin_unlock_irq(&timer->it_lock);
+ timer_wait_running(timer);
+ spin_lock_irq(&timer->it_lock);
+ }
}
/* Delete a POSIX.1b interval timer. */
SYSCALL_DEFINE1(timer_delete, timer_t, timer_id)
{
struct k_itimer *timer;
- unsigned long flags;
-
- timer = lock_timer(timer_id, &flags);
-retry_delete:
- if (!timer)
- return -EINVAL;
-
- if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) {
- /* Unlocks and relocks the timer if it still exists */
- timer = timer_wait_running(timer, &flags);
- goto retry_delete;
+ scoped_timer_get_or_fail(timer_id) {
+ timer = scoped_timer;
+ posix_timer_delete(timer);
}
-
- spin_lock(&current->sighand->siglock);
- hlist_del(&timer->list);
- posix_timer_cleanup_ignored(timer);
- /*
- * A concurrent lookup could check timer::it_signal lockless. It
- * will reevaluate with timer::it_lock held and observe the NULL.
- *
- * It must be written with siglock held so that the signal code
- * observes timer->it_signal == NULL in do_sigaction(SIG_IGN),
- * which prevents it from moving a pending signal of a deleted
- * timer to the ignore list.
- */
- WRITE_ONCE(timer->it_signal, NULL);
- spin_unlock(&current->sighand->siglock);
-
- unlock_timer(timer, flags);
+ /* Remove it from the hash, which frees up the timer ID */
posix_timer_unhash_and_free(timer);
return 0;
}
/*
- * Delete a timer if it is armed, remove it from the hash and schedule it
- * for RCU freeing.
- */
-static void itimer_delete(struct k_itimer *timer)
-{
- unsigned long flags;
-
- /*
- * irqsave is required to make timer_wait_running() work.
- */
- spin_lock_irqsave(&timer->it_lock, flags);
-
-retry_delete:
- /*
- * Even if the timer is not longer accessible from other tasks
- * it still might be armed and queued in the underlying timer
- * mechanism. Worse, that timer mechanism might run the expiry
- * function concurrently.
- */
- if (timer_delete_hook(timer) == TIMER_RETRY) {
- /*
- * Timer is expired concurrently, prevent livelocks
- * and pointless spinning on RT.
- *
- * timer_wait_running() drops timer::it_lock, which opens
- * the possibility for another task to delete the timer.
- *
- * That's not possible here because this is invoked from
- * do_exit() only for the last thread of the thread group.
- * So no other task can access and delete that timer.
- */
- if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer))
- return;
-
- goto retry_delete;
- }
- hlist_del(&timer->list);
-
- posix_timer_cleanup_ignored(timer);
-
- /*
- * Setting timer::it_signal to NULL is technically not required
- * here as nothing can access the timer anymore legitimately via
- * the hash table. Set it to NULL nevertheless so that all deletion
- * paths are consistent.
- */
- WRITE_ONCE(timer->it_signal, NULL);
-
- spin_unlock_irqrestore(&timer->it_lock, flags);
- posix_timer_unhash_and_free(timer);
-}
-
-/*
* Invoked from do_exit() when the last thread of a thread group exits.
* At that point no other task can access the timers of the dying
* task anymore.
@@ -1089,21 +1070,29 @@ retry_delete:
void exit_itimers(struct task_struct *tsk)
{
struct hlist_head timers;
+ struct hlist_node *next;
+ struct k_itimer *timer;
+
+ /* Clear restore mode for exec() */
+ tsk->signal->timer_create_restore_ids = 0;
if (hlist_empty(&tsk->signal->posix_timers))
return;
/* Protect against concurrent read via /proc/$PID/timers */
- spin_lock_irq(&tsk->sighand->siglock);
- hlist_move_list(&tsk->signal->posix_timers, &timers);
- spin_unlock_irq(&tsk->sighand->siglock);
+ scoped_guard (spinlock_irq, &tsk->sighand->siglock)
+ hlist_move_list(&tsk->signal->posix_timers, &timers);
/* The timers are not longer accessible via tsk::signal */
- while (!hlist_empty(&timers))
- itimer_delete(hlist_entry(timers.first, struct k_itimer, list));
+ hlist_for_each_entry_safe(timer, next, &timers, list) {
+ scoped_guard (spinlock_irq, &timer->it_lock)
+ posix_timer_delete(timer);
+ posix_timer_unhash_and_free(timer);
+ cond_resched();
+ }
/*
- * There should be no timers on the ignored list. itimer_delete() has
+ * There should be no timers on the ignored list. posix_timer_delete() has
* mopped them up.
*/
if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers)))
@@ -1246,7 +1235,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
* sys_clock_settime(). The kernel internal timekeeping is always using
* nanoseconds precision independent of the clocksource device which is
* used to read the time from. The resolution of that device only
- * affects the presicion of the time returned by sys_clock_gettime().
+ * affects the precision of the time returned by sys_clock_gettime().
*
* Returns:
* 0 Success. @tp contains the resolution
@@ -1529,6 +1518,9 @@ static const struct k_clock * const posix_clocks[] = {
[CLOCK_REALTIME_ALARM] = &alarm_clock,
[CLOCK_BOOTTIME_ALARM] = &alarm_clock,
[CLOCK_TAI] = &clock_tai,
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+ [CLOCK_AUX ... CLOCK_AUX_LAST] = &clock_aux,
+#endif
};
static const struct k_clock *clockid_to_kclock(const clockid_t id)
@@ -1545,3 +1537,31 @@ static const struct k_clock *clockid_to_kclock(const clockid_t id)
return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))];
}
+
+static int __init posixtimer_init(void)
+{
+ unsigned long i, size;
+ unsigned int shift;
+
+ posix_timers_cache = kmem_cache_create("posix_timers_cache",
+ sizeof(struct k_itimer),
+ __alignof__(struct k_itimer),
+ SLAB_ACCOUNT, NULL);
+
+ if (IS_ENABLED(CONFIG_BASE_SMALL))
+ size = 512;
+ else
+ size = roundup_pow_of_two(512 * num_possible_cpus());
+
+ timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets),
+ size, 0, 0, &shift, NULL, size, size);
+ size = 1UL << shift;
+ timer_hashmask = size - 1;
+
+ for (i = 0; i < size; i++) {
+ spin_lock_init(&timer_buckets[i].lock);
+ INIT_HLIST_HEAD(&timer_buckets[i].head);
+ }
+ return 0;
+}
+core_initcall(posixtimer_init);
diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h
index 61906f0688c1..7f259e845d24 100644
--- a/kernel/time/posix-timers.h
+++ b/kernel/time/posix-timers.h
@@ -41,6 +41,7 @@ extern const struct k_clock clock_posix_dynamic;
extern const struct k_clock clock_process;
extern const struct k_clock clock_thread;
extern const struct k_clock alarm_clock;
+extern const struct k_clock clock_aux;
void posix_timer_queue_signal(struct k_itimer *timr);
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index fcca4e72f1ef..f3aaef695b8c 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -174,8 +174,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
return HRTIMER_RESTART;
}
-void __init
-sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
+void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
u64 res, wrap, new_mask, new_epoch, cyc, ns;
u32 new_mult, new_shift;
@@ -216,7 +215,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
update_clock_read_data(&rd);
- if (sched_clock_timer.function != NULL) {
+ if (ACCESS_PRIVATE(&sched_clock_timer, function) != NULL) {
/* update timeout for clock wrap */
hrtimer_start(&sched_clock_timer, cd.wrap_kt,
HRTIMER_MODE_REL_HARD);
@@ -247,6 +246,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
pr_debug("Registered %pS as sched_clock source\n", read);
}
+EXPORT_SYMBOL_GPL(sched_clock_register);
void __init generic_sched_clock_init(void)
{
@@ -263,8 +263,7 @@ void __init generic_sched_clock_init(void)
* Start the timer to keep sched_clock() properly updated and
* sets the initial epoch.
*/
- hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- sched_clock_timer.function = sched_clock_poll;
+ hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
}
@@ -297,6 +296,11 @@ int sched_clock_suspend(void)
return 0;
}
+static int sched_clock_syscore_suspend(void *data)
+{
+ return sched_clock_suspend();
+}
+
void sched_clock_resume(void)
{
struct clock_read_data *rd = &cd.read_data[0];
@@ -306,14 +310,23 @@ void sched_clock_resume(void)
rd->read_sched_clock = cd.actual_read_sched_clock;
}
-static struct syscore_ops sched_clock_ops = {
- .suspend = sched_clock_suspend,
- .resume = sched_clock_resume,
+static void sched_clock_syscore_resume(void *data)
+{
+ sched_clock_resume();
+}
+
+static const struct syscore_ops sched_clock_syscore_ops = {
+ .suspend = sched_clock_syscore_suspend,
+ .resume = sched_clock_syscore_resume,
+};
+
+static struct syscore sched_clock_syscore = {
+ .ops = &sched_clock_syscore_ops,
};
static int __init sched_clock_syscore_init(void)
{
- register_syscore_ops(&sched_clock_ops);
+ register_syscore(&sched_clock_syscore);
return 0;
}
diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c
index dfe939f6e4ec..3c90574bd904 100644
--- a/kernel/time/sleep_timeout.c
+++ b/kernel/time/sleep_timeout.c
@@ -22,7 +22,7 @@ struct process_timer {
static void process_timeout(struct timer_list *t)
{
- struct process_timer *timeout = from_timer(timeout, t, timer);
+ struct process_timer *timeout = timer_container_of(timeout, t, timer);
wake_up_process(timeout->task);
}
@@ -97,10 +97,10 @@ signed long __sched schedule_timeout(signed long timeout)
timer.timer.expires = expire;
add_timer(&timer.timer);
schedule();
- del_timer_sync(&timer.timer);
+ timer_delete_sync(&timer.timer);
/* Remove the timer from the object tracker */
- destroy_timer_on_stack(&timer.timer);
+ timer_destroy_on_stack(&timer.timer);
timeout = expire - jiffies;
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index e28f9210f8a1..51f6a1032c83 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -78,7 +78,6 @@ static struct clock_event_device ce_broadcast_hrtimer = {
.set_state_shutdown = bc_shutdown,
.set_next_ktime = bc_set_next,
.features = CLOCK_EVT_FEAT_ONESHOT |
- CLOCK_EVT_FEAT_KTIME |
CLOCK_EVT_FEAT_HRTIMER,
.rating = 0,
.bound_on = -1,
@@ -100,7 +99,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
void tick_setup_hrtimer_broadcast(void)
{
- hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- bctimer.function = bc_handler;
+ hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
clockevents_register_device(&ce_broadcast_hrtimer);
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 0207868c8b4d..115e0bf01276 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -3,7 +3,7 @@
* This file contains functions which emulate a local clock-event
* device via a broadcast event source.
*
- * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
*/
@@ -76,8 +76,10 @@ const struct clock_event_device *tick_get_wakeup_device(int cpu)
*/
static void tick_broadcast_start_periodic(struct clock_event_device *bc)
{
- if (bc)
+ if (bc) {
+ bc->next_event_forced = 0;
tick_setup_periodic(bc, 1);
+ }
}
/*
@@ -106,6 +108,7 @@ static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu)
static void tick_oneshot_wakeup_handler(struct clock_event_device *wd)
{
+ wd->next_event_forced = 0;
/*
* If we woke up early and the tick was reprogrammed in the
* meantime then this may be spurious but harmless.
@@ -403,6 +406,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
bool bc_local;
raw_spin_lock(&tick_broadcast_lock);
+ tick_broadcast_device.evtdev->next_event_forced = 0;
/* Handle spurious interrupts gracefully */
if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) {
@@ -696,6 +700,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
raw_spin_lock(&tick_broadcast_lock);
dev->next_event = KTIME_MAX;
+ tick_broadcast_device.evtdev->next_event_forced = 0;
next_event = KTIME_MAX;
cpumask_clear(tmpmask);
now = ktime_get();
@@ -1063,6 +1068,7 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc,
bc->event_handler = tick_handle_oneshot_broadcast;
+ bc->next_event_forced = 0;
bc->next_event = KTIME_MAX;
/*
@@ -1175,6 +1181,7 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
}
/* This moves the broadcast assignment to this CPU: */
+ bc->next_event_forced = 0;
clockevents_program_event(bc, bc->next_event, 1);
}
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index a47bcf71defc..6a9198a4279b 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -3,7 +3,7 @@
* This file contains the base functions to manage periodic tick
* related events.
*
- * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
*/
@@ -110,6 +110,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
int cpu = smp_processor_id();
ktime_t next = dev->next_event;
+ dev->next_event_forced = 0;
tick_periodic(cpu);
/*
@@ -411,24 +412,18 @@ int tick_cpu_dying(unsigned int dying_cpu)
}
/*
- * Shutdown an event device on a given cpu:
+ * Shutdown an event device on the outgoing CPU:
*
- * This is called on a life CPU, when a CPU is dead. So we cannot
- * access the hardware device itself.
- * We just set the mode and remove it from the lists.
+ * Called by the dying CPU during teardown, with clockevents_lock held
+ * and interrupts disabled.
*/
-void tick_shutdown(unsigned int cpu)
+void tick_shutdown(void)
{
- struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
struct clock_event_device *dev = td->evtdev;
td->mode = TICKDEV_MODE_PERIODIC;
if (dev) {
- /*
- * Prevent that the clock events layer tries to call
- * the set mode function!
- */
- clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
clockevents_exchange_device(dev, NULL);
dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
@@ -509,6 +504,7 @@ void tick_resume(void)
#ifdef CONFIG_SUSPEND
static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
+static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP);
static unsigned int tick_freeze_depth;
/**
@@ -528,9 +524,22 @@ void tick_freeze(void)
if (tick_freeze_depth == num_online_cpus()) {
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), true);
+ /*
+ * All other CPUs have their interrupts disabled and are
+ * suspended to idle. Other tasks have been frozen so there
+ * is no scheduling happening. This means that there is no
+ * concurrency in the system at this point. Therefore it is
+ * okay to acquire a sleeping lock on PREEMPT_RT, such as a
+ * spinlock, because the lock cannot be held by other CPUs
+ * or threads and acquiring it cannot block.
+ *
+ * Inform lockdep about the situation.
+ */
+ lock_map_acquire_try(&tick_freeze_map);
system_state = SYSTEM_SUSPEND;
sched_clock_suspend();
timekeeping_suspend();
+ lock_map_release(&tick_freeze_map);
} else {
tick_suspend_local();
}
@@ -552,8 +561,16 @@ void tick_unfreeze(void)
raw_spin_lock(&tick_freeze_lock);
if (tick_freeze_depth == num_online_cpus()) {
+ /*
+ * Similar to tick_freeze(). On resumption the first CPU may
+ * acquire uncontended sleeping locks while other CPUs block on
+ * tick_freeze_lock.
+ */
+ lock_map_acquire_try(&tick_freeze_map);
timekeeping_resume();
sched_clock_resume();
+ lock_map_release(&tick_freeze_map);
+
system_state = SYSTEM_RUNNING;
trace_suspend_resume(TPS("timekeeping_freeze"),
smp_processor_id(), false);
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index faac36de35b9..597d816d22e8 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -26,7 +26,7 @@ extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
extern void tick_check_new_device(struct clock_event_device *dev);
extern void tick_offline_cpu(unsigned int cpu);
-extern void tick_shutdown(unsigned int cpu);
+extern void tick_shutdown(void);
extern void tick_suspend(void);
extern void tick_resume(void);
extern bool tick_check_replacement(struct clock_event_device *curdev,
@@ -156,7 +156,6 @@ static inline void tick_nohz_init(void) { }
#endif
#ifdef CONFIG_NO_HZ_COMMON
-extern unsigned long tick_nohz_active;
extern void timers_update_nohz(void);
extern u64 get_jiffies_update(unsigned long *basej);
# ifdef CONFIG_SMP
@@ -171,7 +170,6 @@ extern void timer_expire_remote(unsigned int cpu);
# endif
#else /* CONFIG_NO_HZ_COMMON */
static inline void timers_update_nohz(void) { }
-#define tick_nohz_active (0)
#endif
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 5e2c2c26b3cc..7472597f3225 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -3,7 +3,7 @@
* This file contains functions which manage high resolution tick
* related events.
*
- * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
*/
@@ -19,6 +19,10 @@
/**
* tick_program_event - program the CPU local timer device for the next event
+ * @expires: the time at which the next timer event should occur
+ * @force: flag to force reprogramming even if the event time hasn't changed
+ *
+ * Return: 0 on success, negative error code on failure
*/
int tick_program_event(ktime_t expires, int force)
{
@@ -57,6 +61,13 @@ void tick_resume_oneshot(void)
/**
* tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
+ * @newdev: Pointer to the clock event device to configure
+ * @handler: Function to be called when the event device triggers an interrupt
+ * @next_event: Initial expiry time for the next event (in ktime)
+ *
+ * Configures the specified clock event device for onshot mode,
+ * assigns the given handler as its event callback, and programs
+ * the device to trigger at the specified next event time.
*/
void tick_setup_oneshot(struct clock_event_device *newdev,
void (*handler)(struct clock_event_device *),
@@ -69,6 +80,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
/**
* tick_switch_to_oneshot - switch to oneshot mode
+ * @handler: function to call when an event occurs on the tick device
+ *
+ * Return: 0 on success, -EINVAL if the tick device is not present,
+ * not functional, or does not support oneshot mode.
*/
int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
{
@@ -101,7 +116,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
/**
* tick_oneshot_mode_active - check whether the system is in oneshot mode
*
- * returns 1 when either nohz or highres are enabled. otherwise 0.
+ * Return: 1 when either nohz or highres are enabled, otherwise 0.
*/
int tick_oneshot_mode_active(void)
{
@@ -120,6 +135,9 @@ int tick_oneshot_mode_active(void)
* tick_init_highres - switch to high resolution mode
*
* Called with interrupts disabled.
+ *
+ * Return: 0 on success, -EINVAL if the tick device cannot switch
+ * to oneshot/high-resolution mode.
*/
int tick_init_highres(void)
{
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index fa058510af9c..cbbb87a0c6e7 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org>
* Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
* Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner
*
@@ -201,6 +201,27 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts,
ts->flags &= ~flag;
}
+/*
+ * Allow only one non-timekeeper CPU at a time to update jiffies from
+ * the timer tick.
+ *
+ * Returns true if update was run.
+ */
+static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now)
+{
+ static atomic_t in_progress;
+ int inp;
+
+ inp = atomic_read(&in_progress);
+ if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1))
+ return false;
+
+ if (ts->last_tick_jiffies == jiffies)
+ tick_do_update_jiffies64(now);
+ atomic_set(&in_progress, 0);
+ return true;
+}
+
#define MAX_STALLED_JIFFIES 5
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
@@ -239,10 +260,11 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
ts->stalled_jiffies = 0;
ts->last_tick_jiffies = READ_ONCE(jiffies);
} else {
- if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
- tick_do_update_jiffies64(now);
- ts->stalled_jiffies = 0;
- ts->last_tick_jiffies = READ_ONCE(jiffies);
+ if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) {
+ if (tick_limited_update_jiffies64(ts, now)) {
+ ts->stalled_jiffies = 0;
+ ts->last_tick_jiffies = READ_ONCE(jiffies);
+ }
}
}
@@ -322,6 +344,9 @@ static bool check_tick_dependency(atomic_t *dep)
{
int val = atomic_read(dep);
+ if (likely(!tracepoint_enabled(tick_stop)))
+ return !!val;
+
if (val & TICK_DEP_MASK_POSIX_TIMER) {
trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
return true;
@@ -671,7 +696,7 @@ void __init tick_nohz_init(void)
* NO HZ enabled ?
*/
bool tick_nohz_enabled __read_mostly = true;
-unsigned long tick_nohz_active __read_mostly;
+static unsigned long tick_nohz_active __read_mostly;
/*
* Enable / Disable tickless mode
*/
@@ -682,6 +707,12 @@ static int __init setup_tick_nohz(char *str)
__setup("nohz=", setup_tick_nohz);
+bool tick_nohz_is_active(void)
+{
+ return tick_nohz_active;
+}
+EXPORT_SYMBOL_GPL(tick_nohz_is_active);
+
bool tick_nohz_tick_stopped(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
@@ -833,19 +864,32 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
+/* Simplified variant of hrtimer_forward_now() */
+static ktime_t tick_forward_now(ktime_t expires, ktime_t now)
+{
+ ktime_t delta = now - expires;
+
+ if (likely(delta < TICK_NSEC))
+ return expires + TICK_NSEC;
+
+ expires += TICK_NSEC * ktime_divns(delta, TICK_NSEC);
+ if (expires > now)
+ return expires;
+ return expires + TICK_NSEC;
+}
+
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
- hrtimer_cancel(&ts->sched_timer);
- hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
+ ktime_t expires = ts->last_tick;
- /* Forward the time to expire in the future */
- hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
+ if (now >= expires)
+ expires = tick_forward_now(expires, now);
if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
- hrtimer_start_expires(&ts->sched_timer,
- HRTIMER_MODE_ABS_PINNED_HARD);
+ hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD);
} else {
- tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+ hrtimer_set_expires(&ts->sched_timer, expires);
+ tick_program_event(expires, 1);
}
/*
@@ -1152,16 +1196,15 @@ static bool report_idle_softirq(void)
return false;
}
- if (ratelimit >= 10)
- return false;
-
/* On RT, softirq handling may be waiting on some lock */
if (local_bh_blocked())
return false;
- pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
- pending);
- ratelimit++;
+ if (ratelimit < 10) {
+ pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
+ pending);
+ ratelimit++;
+ }
return true;
}
@@ -1483,6 +1526,7 @@ static void tick_nohz_lowres_handler(struct clock_event_device *dev)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
dev->next_event = KTIME_MAX;
+ dev->next_event_forced = 0;
if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART))
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
@@ -1573,12 +1617,10 @@ void tick_setup_sched_timer(bool hrtimer)
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
/* Emulate tick processing via per-CPU hrtimers: */
- hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
+ hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
- if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) {
+ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
tick_sched_flag_set(ts, TS_FLAG_HIGHRES);
- ts->sched_timer.function = tick_nohz_handler;
- }
/* Get the next period (per-CPU) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 1b69caa87480..771cef87ad3b 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -207,7 +207,7 @@ SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv,
get_user(new_ts.tv_nsec, &tv->tv_usec))
return -EFAULT;
- if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0)
+ if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
return -EINVAL;
new_ts.tv_nsec *= NSEC_PER_USEC;
@@ -365,20 +365,16 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
}
#endif
+#if HZ > MSEC_PER_SEC || (MSEC_PER_SEC % HZ)
/**
* jiffies_to_msecs - Convert jiffies to milliseconds
* @j: jiffies value
*
- * Avoid unnecessary multiplications/divisions in the
- * two most common HZ cases.
- *
* Return: milliseconds value
*/
unsigned int jiffies_to_msecs(const unsigned long j)
{
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
- return (MSEC_PER_SEC / HZ) * j;
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+#if HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
@@ -390,7 +386,9 @@ unsigned int jiffies_to_msecs(const unsigned long j)
#endif
}
EXPORT_SYMBOL(jiffies_to_msecs);
+#endif
+#if (USEC_PER_SEC % HZ)
/**
* jiffies_to_usecs - Convert jiffies to microseconds
* @j: jiffies value
@@ -405,17 +403,14 @@ unsigned int jiffies_to_usecs(const unsigned long j)
*/
BUILD_BUG_ON(HZ > USEC_PER_SEC);
-#if !(USEC_PER_SEC % HZ)
- return (USEC_PER_SEC / HZ) * j;
-#else
-# if BITS_PER_LONG == 32
+#if BITS_PER_LONG == 32
return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
-# else
+#else
return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
-# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_usecs);
+#endif
/**
* mktime64 - Converts date to seconds.
@@ -702,7 +697,7 @@ EXPORT_SYMBOL(clock_t_to_jiffies);
*
* Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
*/
-u64 jiffies_64_to_clock_t(u64 x)
+notrace u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
# if HZ < USER_HZ
@@ -858,6 +853,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
return res;
}
+EXPORT_SYMBOL_GPL(timespec64_add_safe);
/**
* get_timespec64 - get user's time value into kernel space
diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c
index 2889763165e5..1b99180da288 100644
--- a/kernel/time/time_test.c
+++ b/kernel/time/time_test.c
@@ -4,7 +4,9 @@
#include <linux/time.h>
/*
- * Traditional implementation of leap year evaluation.
+ * Traditional implementation of leap year evaluation, but note that long
+ * is a signed type and the tests do cover negative year values. So this
+ * can't use the is_leap_year() helper from rtc.h.
*/
static bool is_leap(long year)
{
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
index e6285288d765..2e64dbb6302d 100644
--- a/kernel/time/timecounter.c
+++ b/kernel/time/timecounter.c
@@ -6,7 +6,7 @@
#include <linux/timecounter.h>
void timecounter_init(struct timecounter *tc,
- const struct cyclecounter *cc,
+ struct cyclecounter *cc,
u64 start_tstamp)
{
tc->cc = cc;
@@ -62,38 +62,3 @@ u64 timecounter_read(struct timecounter *tc)
}
EXPORT_SYMBOL_GPL(timecounter_read);
-/*
- * This is like cyclecounter_cyc2ns(), but it is used for computing a
- * time previous to the time stored in the cycle counter.
- */
-static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc,
- u64 cycles, u64 mask, u64 frac)
-{
- u64 ns = (u64) cycles;
-
- ns = ((ns * cc->mult) - frac) >> cc->shift;
-
- return ns;
-}
-
-u64 timecounter_cyc2time(const struct timecounter *tc,
- u64 cycle_tstamp)
-{
- u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
- u64 nsec = tc->nsec, frac = tc->frac;
-
- /*
- * Instead of always treating cycle_tstamp as more recent
- * than tc->cycle_last, detect when it is too far in the
- * future and treat it as old time stamp instead.
- */
- if (delta > tc->cc->mask / 2) {
- delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
- nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac);
- } else {
- nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac);
- }
-
- return nsec;
-}
-EXPORT_SYMBOL_GPL(timecounter_cyc2time);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1e67d076f195..c493a4010305 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -3,31 +3,30 @@
* Kernel timekeeping code and accessor functions. Based on code from
* timer.c, moved in commit 8524070b7982.
*/
-#include <linux/timekeeper_internal.h>
+#include <linux/audit.h>
+#include <linux/clocksource.h>
+#include <linux/compiler.h>
+#include <linux/jiffies.h>
+#include <linux/kobject.h>
#include <linux/module.h>
-#include <linux/interrupt.h>
-#include <linux/percpu.h>
-#include <linux/init.h>
-#include <linux/mm.h>
#include <linux/nmi.h>
-#include <linux/sched.h>
-#include <linux/sched/loadavg.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/random.h>
#include <linux/sched/clock.h>
+#include <linux/sched/loadavg.h>
+#include <linux/static_key.h>
+#include <linux/stop_machine.h>
#include <linux/syscore_ops.h>
-#include <linux/clocksource.h>
-#include <linux/jiffies.h>
+#include <linux/tick.h>
#include <linux/time.h>
#include <linux/timex.h>
-#include <linux/tick.h>
-#include <linux/stop_machine.h>
-#include <linux/pvclock_gtod.h>
-#include <linux/compiler.h>
-#include <linux/audit.h>
-#include <linux/random.h>
+#include <linux/timekeeper_internal.h>
+
+#include <vdso/auxclock.h>
#include "tick-internal.h"
-#include "ntp_internal.h"
#include "timekeeping_internal.h"
+#include "ntp_internal.h"
#define TK_CLEAR_NTP (1 << 0)
#define TK_CLOCK_WAS_SET (1 << 1)
@@ -53,7 +52,38 @@ struct tk_data {
raw_spinlock_t lock;
} ____cacheline_aligned;
-static struct tk_data tk_core;
+static struct tk_data timekeeper_data[TIMEKEEPERS_MAX];
+
+/* The core timekeeper */
+#define tk_core (timekeeper_data[TIMEKEEPER_CORE])
+
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
+{
+ return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts);
+}
+
+static inline bool tk_is_aux(const struct timekeeper *tk)
+{
+ return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST;
+}
+#else
+static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts)
+{
+ return false;
+}
+
+static inline bool tk_is_aux(const struct timekeeper *tk)
+{
+ return false;
+}
+#endif
+
+static inline void tk_update_aux_offs(struct timekeeper *tk, ktime_t offs)
+{
+ tk->offs_aux = offs;
+ tk->monotonic_to_aux = ktime_to_timespec64(offs);
+}
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -113,6 +143,16 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = {
.base[1] = FAST_TK_INIT,
};
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+static __init void tk_aux_setup(void);
+static void tk_aux_update_clocksource(void);
+static void tk_aux_advance(void);
+#else
+static inline void tk_aux_setup(void) { }
+static inline void tk_aux_update_clocksource(void) { }
+static inline void tk_aux_advance(void) { }
+#endif
+
unsigned long timekeeper_lock_irqsave(void)
{
unsigned long flags;
@@ -164,10 +204,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
return ts;
}
+static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk)
+{
+ struct timespec64 ts;
+
+ ts.tv_sec = tk->xtime_sec;
+ ts.tv_nsec = tk->coarse_nsec;
+ return ts;
+}
+
+/*
+ * Update the nanoseconds part for the coarse time keepers. They can't rely
+ * on xtime_nsec because xtime_nsec could be adjusted by a small negative
+ * amount when the multiplication factor of the clock is adjusted, which
+ * could cause the coarse clocks to go slightly backwards. See
+ * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse
+ * clockids which is only updated when the clock has been set or we have
+ * accumulated time.
+ */
+static inline void tk_update_coarse_nsecs(struct timekeeper *tk)
+{
+ tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+}
+
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
+ tk_update_coarse_nsecs(tk);
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
@@ -175,6 +239,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
tk->xtime_sec += ts->tv_sec;
tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
tk_normalize_xtime(tk);
+ tk_update_coarse_nsecs(tk);
}
static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
@@ -206,6 +271,11 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot);
}
+#ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE
+#include <asm/clock_inlined.h>
+
+static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined);
+
/*
* tk_clock_read - atomic clocksource read() helper
*
@@ -219,12 +289,35 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
* a read of the fast-timekeeper tkrs (which is protected by its own locking
* and update logic).
*/
-static inline u64 tk_clock_read(const struct tk_read_base *tkr)
+static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr)
+{
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ if (static_branch_likely(&clocksource_read_inlined))
+ return arch_inlined_clocksource_read(clock);
+
+ return clock->read(clock);
+}
+
+static inline void clocksource_disable_inline_read(void)
+{
+ static_branch_disable(&clocksource_read_inlined);
+}
+
+static inline void clocksource_enable_inline_read(void)
+{
+ static_branch_enable(&clocksource_read_inlined);
+}
+#else
+static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
struct clocksource *clock = READ_ONCE(tkr->clock);
return clock->read(clock);
}
+static inline void clocksource_disable_inline_read(void) { }
+static inline void clocksource_enable_inline_read(void) { }
+#endif
/**
* tk_setup_internals - Set up internals to use clocksource clock.
@@ -298,6 +391,27 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
tk->tkr_raw.mult = clock->mult;
tk->ntp_err_mult = 0;
tk->skip_second_overflow = 0;
+
+ tk->cs_id = clock->id;
+
+ /* Coupled clockevent data */
+ if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) &&
+ clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT) {
+ /*
+ * Aim for a one hour maximum delta and use KHz to handle
+ * clocksources with a frequency above 4GHz correctly as
+ * the frequency argument of clocks_calc_mult_shift() is u32.
+ */
+ clocks_calc_mult_shift(&tk->cs_ns_to_cyc_mult, &tk->cs_ns_to_cyc_shift,
+ NSEC_PER_MSEC, clock->freq_khz, 3600 * 1000);
+ /*
+ * Initialize the conversion limit as the previous clocksource
+ * might have the same shift/mult pair so the quick check in
+ * tk_update_ns_to_cyc() fails to update it after a clocksource
+ * change leaving it effectively zero.
+ */
+ tk->cs_ns_to_cyc_maxns = div_u64(clock->mask, tk->cs_ns_to_cyc_mult);
+ }
}
/* Timekeeper helper functions. */
@@ -306,7 +420,7 @@ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
}
-static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
+static __always_inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
/* Calculate the delta since the last update_wall_time() */
u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask;
@@ -576,7 +690,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
*/
static inline void tk_update_leap_state(struct timekeeper *tk)
{
- tk->next_leap_ktime = ntp_get_next_leap();
+ tk->next_leap_ktime = ntp_get_next_leap(tk->id);
if (tk->next_leap_ktime != KTIME_MAX)
/* Convert to monotonic time */
tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
@@ -627,6 +741,36 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
+static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc)
+{
+ struct tk_read_base *tkrs = &tks->tkr_mono;
+ struct tk_read_base *tkrc = &tkc->tkr_mono;
+ unsigned int shift;
+
+ if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) ||
+ !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT))
+ return;
+
+ if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift)
+ return;
+ /*
+ * The conversion math is simple:
+ *
+ * CS::MULT (1 << NS_TO_CYC_SHIFT)
+ * --------------- = ----------------------
+ * (1 << CS:SHIFT) NS_TO_CYC_MULT
+ *
+ * Ergo:
+ *
+ * NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT
+ *
+ * NS_TO_CYC_SHIFT has been set up in tk_setup_internals()
+ */
+ shift = tkrs->shift + tks->cs_ns_to_cyc_shift;
+ tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult);
+ tks->cs_ns_to_cyc_maxns = div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult);
+}
+
/*
* Restore the shadow timekeeper from the real timekeeper.
*/
@@ -638,7 +782,7 @@ static void timekeeping_restore_shadow(struct tk_data *tkd)
static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action)
{
- struct timekeeper *tk = &tk_core.shadow_timekeeper;
+ struct timekeeper *tk = &tkd->shadow_timekeeper;
lockdep_assert_held(&tkd->lock);
@@ -653,18 +797,23 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act
if (action & TK_CLEAR_NTP) {
tk->ntp_error = 0;
- ntp_clear();
+ ntp_clear(tk->id);
}
tk_update_leap_state(tk);
tk_update_ktime_data(tk);
+ tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
- update_vsyscall(tk);
- update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
+ if (tk->id == TIMEKEEPER_CORE) {
+ tk_update_ns_to_cyc(tk, &tkd->timekeeper);
+ update_vsyscall(tk);
+ update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
- tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
- update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
- update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+ update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
+ update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw);
+ } else if (tk_is_aux(tk)) {
+ vdso_time_update_aux(tk);
+ }
if (action & TK_CLOCK_WAS_SET)
tk->clock_was_set_seq++;
@@ -708,6 +857,72 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk_normalize_xtime(tk);
delta -= incr;
}
+ tk_update_coarse_nsecs(tk);
+}
+
+/*
+ * ktime_expiry_to_cycles - Convert an expiry time to clocksource cycles
+ * @id: Clocksource ID which is required for validity
+ * @expires_ns: Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted
+ * @cycles: Pointer to storage for corresponding absolute cycles value
+ *
+ * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value
+ * based on the correlated clocksource of the clockevent device by using
+ * the base nanoseconds and cycles values of the last timekeeper update and
+ * converting the delta between @expires_ns and base nanoseconds to cycles.
+ *
+ * This only works for clockevent devices which are using a less than or
+ * equal comparator against the clocksource.
+ *
+ * Utilizing this avoids two clocksource reads for such devices, the
+ * ktime_get() in clockevents_program_event() to calculate the delta expiry
+ * value and the readout in the device::set_next_event() callback to
+ * convert the delta back to an absolute comparator value.
+ *
+ * Returns: True if @id matches the current clocksource ID, false otherwise
+ */
+bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ struct tk_read_base *tkrm = &tk->tkr_mono;
+ ktime_t base_ns, delta_ns, max_ns;
+ u64 base_cycles, delta_cycles;
+ unsigned int seq;
+ u32 mult, shift;
+
+ /*
+ * Racy check to avoid the seqcount overhead when ID does not match. If
+ * the relevant clocksource is installed concurrently, then this will
+ * just delay the switch over to this mechanism until the next event is
+ * programmed. If the ID is not matching the clock events code will use
+ * the regular relative set_next_event() callback as before.
+ */
+ if (data_race(tk->cs_id) != id)
+ return false;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+
+ if (tk->cs_id != id)
+ return false;
+
+ base_cycles = tkrm->cycle_last;
+ base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift);
+
+ mult = tk->cs_ns_to_cyc_mult;
+ shift = tk->cs_ns_to_cyc_shift;
+ max_ns = tk->cs_ns_to_cyc_maxns;
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ /* Prevent negative deltas and multiplication overflows */
+ delta_ns = min(expires_ns - base_ns, max_ns);
+ delta_ns = max(delta_ns, 0);
+
+ /* Convert to cycles */
+ delta_cycles = ((u64)delta_ns * mult) >> shift;
+ *cycles = base_cycles + delta_cycles;
+ return true;
}
/**
@@ -774,7 +989,7 @@ u32 ktime_get_resolution_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
-static ktime_t *offsets[TK_OFFS_MAX] = {
+static const ktime_t *const offsets[TK_OFFS_MAX] = {
[TK_OFFS_REAL] = &tk_core.timekeeper.offs_real,
[TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot,
[TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai,
@@ -783,8 +998,9 @@ static ktime_t *offsets[TK_OFFS_MAX] = {
ktime_t ktime_get_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
+ const ktime_t *offset = offsets[offs];
unsigned int seq;
- ktime_t base, *offset = offsets[offs];
+ ktime_t base;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -804,8 +1020,9 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset);
ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
struct timekeeper *tk = &tk_core.timekeeper;
+ const ktime_t *offset = offsets[offs];
unsigned int seq;
- ktime_t base, *offset = offsets[offs];
+ ktime_t base;
u64 nsecs;
WARN_ON(timekeeping_suspended);
@@ -813,7 +1030,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
do {
seq = read_seqcount_begin(&tk_core.seq);
base = ktime_add(tk->tkr_mono.base, *offset);
- nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsecs = tk->coarse_nsec;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -828,7 +1045,7 @@ EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);
*/
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
- ktime_t *offset = offsets[offs];
+ const ktime_t *offset = offsets[offs];
unsigned int seq;
ktime_t tconv;
@@ -949,9 +1166,14 @@ time64_t ktime_get_real_seconds(void)
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);
/**
- * __ktime_get_real_seconds - The same as ktime_get_real_seconds
- * but without the sequence counter protect. This internal function
- * is called just when timekeeping lock is already held.
+ * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds
+ *
+ * The same as ktime_get_real_seconds() but without the sequence counter
+ * protection. This function is used in restricted contexts like the x86 MCE
+ * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half
+ * completed modification and only to be used for such critical contexts.
+ *
+ * Returns: Racy snapshot of the CLOCK_REALTIME seconds value
*/
noinstr time64_t __ktime_get_real_seconds(void)
{
@@ -1230,7 +1452,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
struct system_time_snapshot *history_begin,
struct system_device_crosststamp *xtstamp)
{
- struct system_counterval_t system_counterval;
+ struct system_counterval_t system_counterval = {};
struct timekeeper *tk = &tk_core.timekeeper;
u64 cycles, now, interval_start;
unsigned int clock_was_set_seq = 0;
@@ -1386,41 +1608,73 @@ int do_settimeofday64(const struct timespec64 *ts)
}
EXPORT_SYMBOL(do_settimeofday64);
+static inline bool timekeeper_is_core_tk(struct timekeeper *tk)
+{
+ return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE;
+}
+
/**
- * timekeeping_inject_offset - Adds or subtracts from the current time.
+ * __timekeeping_inject_offset - Adds or subtracts from the current time.
+ * @tkd: Pointer to the timekeeper to modify
* @ts: Pointer to the timespec variable containing the offset
*
* Adds or subtracts an offset value from the current time.
*/
-static int timekeeping_inject_offset(const struct timespec64 *ts)
+static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts)
{
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+ struct timespec64 tmp;
+
if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
- scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
- struct timekeeper *tks = &tk_core.shadow_timekeeper;
- struct timespec64 tmp;
-
- timekeeping_forward_now(tks);
+ timekeeping_forward_now(tks);
+ if (timekeeper_is_core_tk(tks)) {
/* Make sure the proposed value is valid */
tmp = timespec64_add(tk_xtime(tks), *ts);
if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 ||
!timespec64_valid_settod(&tmp)) {
- timekeeping_restore_shadow(&tk_core);
+ timekeeping_restore_shadow(tkd);
return -EINVAL;
}
tk_xtime_add(tks, ts);
tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts));
- timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
+ } else {
+ struct tk_read_base *tkr_mono = &tks->tkr_mono;
+ ktime_t now, offs;
+
+ /* Get the current time */
+ now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono));
+ /* Add the relative offset change */
+ offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts));
+
+ /* Prevent that the resulting time becomes negative */
+ if (ktime_add(now, offs) < 0) {
+ timekeeping_restore_shadow(tkd);
+ return -EINVAL;
+ }
+ tk_update_aux_offs(tks, offs);
}
- /* Signal hrtimers about time change */
- clock_was_set(CLOCK_SET_WALL);
+ timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
return 0;
}
+static int timekeeping_inject_offset(const struct timespec64 *ts)
+{
+ int ret;
+
+ scoped_guard (raw_spinlock_irqsave, &tk_core.lock)
+ ret = __timekeeping_inject_offset(&tk_core, ts);
+
+ /* Signal hrtimers about time change */
+ if (!ret)
+ clock_was_set(CLOCK_SET_WALL);
+ return ret;
+}
+
/*
* Indicates if there is an offset between the system clock and the hardware
* clock/persistent clock/rtc.
@@ -1496,6 +1750,8 @@ static int change_clocksource(void *data)
timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
}
+ tk_aux_update_clocksource();
+
if (old) {
if (old->disable)
old->disable(old);
@@ -1518,7 +1774,19 @@ int timekeeping_notify(struct clocksource *clock)
if (tk->tkr_mono.clock == clock)
return 0;
+
+ /* Disable inlined reads across the clocksource switch */
+ clocksource_disable_inline_read();
+
stop_machine(change_clocksource, clock, NULL);
+
+ /*
+ * If the clocksource has been selected and supports inlined reads
+ * enable the branch.
+ */
+ if (tk->tkr_mono.clock == clock && clock->flags & CLOCK_SOURCE_CAN_INLINE_READ)
+ clocksource_enable_inline_read();
+
tick_clock_notify();
return tk->tkr_mono.clock == clock ? 0 : -1;
}
@@ -1547,6 +1815,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL(ktime_get_raw_ts64);
+/**
+ * ktime_get_clock_ts64 - Returns time of a clock in a timespec
+ * @id: POSIX clock ID of the clock to read
+ * @ts: Pointer to the timespec64 to be set
+ *
+ * The timestamp is invalidated (@ts->tv_sec is set to -1) if the
+ * clock @id is not available.
+ */
+void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts)
+{
+ /* Invalidate time stamp */
+ ts->tv_sec = -1;
+ ts->tv_nsec = 0;
+
+ switch (id) {
+ case CLOCK_REALTIME:
+ ktime_get_real_ts64(ts);
+ return;
+ case CLOCK_MONOTONIC:
+ ktime_get_ts64(ts);
+ return;
+ case CLOCK_MONOTONIC_RAW:
+ ktime_get_raw_ts64(ts);
+ return;
+ case CLOCK_AUX ... CLOCK_AUX_LAST:
+ if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS))
+ ktime_get_aux_ts64(id, ts);
+ return;
+ default:
+ WARN_ON_ONCE(1);
+ }
+}
+EXPORT_SYMBOL_GPL(ktime_get_clock_ts64);
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
@@ -1623,10 +1924,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
*boot_offset = ns_to_timespec64(local_clock());
}
-static __init void tkd_basic_setup(struct tk_data *tkd)
+static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid)
{
raw_spin_lock_init(&tkd->lock);
seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock);
+ tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id;
+ tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid;
}
/*
@@ -1656,7 +1959,8 @@ void __init timekeeping_init(void)
struct timekeeper *tks = &tk_core.shadow_timekeeper;
struct clocksource *clock;
- tkd_basic_setup(&tk_core);
+ tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true);
+ tk_aux_setup();
read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
if (timespec64_valid_settod(&wall_time) &&
@@ -1845,6 +2149,11 @@ void timekeeping_resume(void)
timerfd_resume();
}
+static void timekeeping_syscore_resume(void *data)
+{
+ timekeeping_resume();
+}
+
int timekeeping_suspend(void)
{
struct timekeeper *tks = &tk_core.shadow_timekeeper;
@@ -1912,15 +2221,24 @@ int timekeeping_suspend(void)
return 0;
}
+static int timekeeping_syscore_suspend(void *data)
+{
+ return timekeeping_suspend();
+}
+
/* sysfs resume/suspend bits for timekeeping */
-static struct syscore_ops timekeeping_syscore_ops = {
- .resume = timekeeping_resume,
- .suspend = timekeeping_suspend,
+static const struct syscore_ops timekeeping_syscore_ops = {
+ .resume = timekeeping_syscore_resume,
+ .suspend = timekeeping_syscore_suspend,
+};
+
+static struct syscore timekeeping_syscore = {
+ .ops = &timekeeping_syscore_ops,
};
static int __init timekeeping_init_ops(void)
{
- register_syscore_ops(&timekeeping_syscore_ops);
+ register_syscore(&timekeeping_syscore);
return 0;
}
device_initcall(timekeeping_init_ops);
@@ -2008,7 +2326,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*/
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
- u64 ntp_tl = ntp_tick_length();
+ u64 ntp_tl = ntp_tick_length(tk->id);
u32 mult;
/*
@@ -2089,7 +2407,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
}
/* Figure out if its a leap sec and apply if needed */
- leap = second_overflow(tk->xtime_sec);
+ leap = second_overflow(tk->id, tk->xtime_sec);
if (unlikely(leap)) {
struct timespec64 ts;
@@ -2155,15 +2473,13 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
* timekeeping_advance - Updates the timekeeper to the current time and
* current NTP tick length
*/
-static bool timekeeping_advance(enum timekeeping_adv_mode mode)
+static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode)
{
- struct timekeeper *tk = &tk_core.shadow_timekeeper;
- struct timekeeper *real_tk = &tk_core.timekeeper;
+ struct timekeeper *tk = &tkd->shadow_timekeeper;
+ struct timekeeper *real_tk = &tkd->timekeeper;
unsigned int clock_set = 0;
int shift = 0, maxshift;
- u64 offset;
-
- guard(raw_spinlock_irqsave)(&tk_core.lock);
+ u64 offset, orig_offset;
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
@@ -2172,7 +2488,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
tk->tkr_mono.clock->max_raw_delta);
-
+ orig_offset = offset;
/* Check if there's really nothing to do */
if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
return false;
@@ -2188,7 +2504,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
shift = ilog2(offset) - ilog2(tk->cycle_interval);
shift = max(0, shift);
/* Bound shift to one less than what overflows tick_length */
- maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
+ maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1;
shift = min(shift, maxshift);
while (offset >= tk->cycle_interval) {
offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
@@ -2205,19 +2521,35 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode)
*/
clock_set |= accumulate_nsecs_to_secs(tk);
- timekeeping_update_from_shadow(&tk_core, clock_set);
+ /*
+ * To avoid inconsistencies caused by adjtimex TK_ADV_FREQ calls
+ * making small negative adjustments to the base xtime_nsec
+ * value, only update the coarse clocks if we accumulated time
+ */
+ if (orig_offset != offset)
+ tk_update_coarse_nsecs(tk);
+
+ timekeeping_update_from_shadow(tkd, clock_set);
return !!clock_set;
}
+static bool timekeeping_advance(enum timekeeping_adv_mode mode)
+{
+ guard(raw_spinlock_irqsave)(&tk_core.lock);
+ return __timekeeping_advance(&tk_core, mode);
+}
+
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
+ * It also updates the enabled auxiliary clock timekeepers
*/
void update_wall_time(void)
{
if (timekeeping_advance(TK_ADV_TICK))
clock_was_set_delayed();
+ tk_aux_advance();
}
/**
@@ -2248,7 +2580,7 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
} while (read_seqcount_retry(&tk_core.seq, seq));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);
@@ -2271,7 +2603,7 @@ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- *ts = tk_xtime(tk);
+ *ts = tk_xtime_coarse(tk);
offset = tk_core.timekeeper.offs_real;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -2350,12 +2682,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tk_xtime(tk);
+ now = tk_xtime_coarse(tk);
mono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
- now.tv_nsec + mono.tv_nsec);
+ now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);
@@ -2415,7 +2747,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
/*
* timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
-static int timekeeping_validate_timex(const struct __kernel_timex *txc)
+static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock)
{
if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
@@ -2474,6 +2806,22 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc)
return -EINVAL;
}
+ if (aux_clock) {
+ /* Auxiliary clocks are similar to TAI and do not have leap seconds */
+ if (txc->modes & ADJ_STATUS &&
+ txc->status & (STA_INS | STA_DEL))
+ return -EINVAL;
+
+ /* No TAI offset setting */
+ if (txc->modes & ADJ_TAI)
+ return -EINVAL;
+
+ /* No PPS support either */
+ if (txc->modes & ADJ_STATUS &&
+ txc->status & (STA_PPSFREQ | STA_PPSTIME))
+ return -EINVAL;
+ }
+
return 0;
}
@@ -2492,74 +2840,103 @@ unsigned long random_get_entropy_fallback(void)
}
EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
-/**
- * do_adjtimex() - Accessor function to NTP __do_adjtimex function
- * @txc: Pointer to kernel_timex structure containing NTP parameters
- */
-int do_adjtimex(struct __kernel_timex *txc)
+struct adjtimex_result {
+ struct audit_ntp_data ad;
+ struct timespec64 delta;
+ bool clock_set;
+};
+
+static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc,
+ struct adjtimex_result *result)
{
- struct audit_ntp_data ad;
- bool offset_set = false;
- bool clock_set = false;
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+ bool aux_clock = !timekeeper_is_core_tk(tks);
struct timespec64 ts;
+ s32 orig_tai, tai;
int ret;
/* Validate the data before disabling interrupts */
- ret = timekeeping_validate_timex(txc);
+ ret = timekeeping_validate_timex(txc, aux_clock);
if (ret)
return ret;
add_device_randomness(txc, sizeof(*txc));
- if (txc->modes & ADJ_SETOFFSET) {
- struct timespec64 delta;
+ if (!aux_clock)
+ ktime_get_real_ts64(&ts);
+ else
+ tk_get_aux_ts64(tkd->timekeeper.id, &ts);
+
+ add_device_randomness(&ts, sizeof(ts));
- delta.tv_sec = txc->time.tv_sec;
- delta.tv_nsec = txc->time.tv_usec;
+ guard(raw_spinlock_irqsave)(&tkd->lock);
+
+ if (!tks->clock_valid)
+ return -ENODEV;
+
+ if (txc->modes & ADJ_SETOFFSET) {
+ result->delta.tv_sec = txc->time.tv_sec;
+ result->delta.tv_nsec = txc->time.tv_usec;
if (!(txc->modes & ADJ_NANO))
- delta.tv_nsec *= 1000;
- ret = timekeeping_inject_offset(&delta);
+ result->delta.tv_nsec *= 1000;
+ ret = __timekeeping_inject_offset(tkd, &result->delta);
if (ret)
return ret;
-
- offset_set = delta.tv_sec != 0;
- audit_tk_injoffset(delta);
+ result->clock_set = true;
}
- audit_ntp_init(&ad);
+ orig_tai = tai = tks->tai_offset;
+ ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad);
- ktime_get_real_ts64(&ts);
- add_device_randomness(&ts, sizeof(ts));
+ if (tai != orig_tai) {
+ __timekeeping_set_tai_offset(tks, tai);
+ timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET);
+ result->clock_set = true;
+ } else {
+ tk_update_leap_state_all(tkd);
+ }
- scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
- struct timekeeper *tks = &tk_core.shadow_timekeeper;
- s32 orig_tai, tai;
+ /* Update the multiplier immediately if frequency was set directly */
+ if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
+ result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ);
- orig_tai = tai = tks->tai_offset;
- ret = __do_adjtimex(txc, &ts, &tai, &ad);
+ return ret;
+}
- if (tai != orig_tai) {
- __timekeeping_set_tai_offset(tks, tai);
- timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
- clock_set = true;
- } else {
- tk_update_leap_state_all(&tk_core);
- }
- }
+/**
+ * do_adjtimex() - Accessor function to NTP ntp_adjtimex function
+ * @txc: Pointer to kernel_timex structure containing NTP parameters
+ */
+int do_adjtimex(struct __kernel_timex *txc)
+{
+ struct adjtimex_result result = { };
+ int ret;
+
+ ret = __do_adjtimex(&tk_core, txc, &result);
+ if (ret < 0)
+ return ret;
- audit_ntp_log(&ad);
+ if (txc->modes & ADJ_SETOFFSET)
+ audit_tk_injoffset(result.delta);
- /* Update the multiplier immediately if frequency was set directly */
- if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
- clock_set |= timekeeping_advance(TK_ADV_FREQ);
+ audit_ntp_log(&result.ad);
- if (clock_set)
+ if (result.clock_set)
clock_was_set(CLOCK_SET_WALL);
- ntp_notify_cmos_timer(offset_set);
+ ntp_notify_cmos_timer(result.delta.tv_sec != 0);
return ret;
}
+/*
+ * Invoked from NTP with the time keeper lock held, so lockless access is
+ * fine.
+ */
+long ktime_get_ntp_seconds(unsigned int id)
+{
+ return timekeeper_data[id].timekeeper.xtime_sec;
+}
+
#ifdef CONFIG_NTP_PPS
/**
* hardpps() - Accessor function to NTP __hardpps function
@@ -2573,3 +2950,321 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */
+
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+#include "posix-timers.h"
+
+/*
+ * Bitmap for the activated auxiliary timekeepers to allow lockless quick
+ * checks in the hot paths without touching extra cache lines. If set, then
+ * the state of the corresponding timekeeper has to be re-checked under
+ * tk_data::lock.
+ */
+static unsigned long aux_timekeepers;
+
+static inline unsigned int clockid_to_tkid(unsigned int id)
+{
+ return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX;
+}
+
+static inline struct tk_data *aux_get_tk_data(clockid_t id)
+{
+ if (!clockid_aux_valid(id))
+ return NULL;
+ return &timekeeper_data[clockid_to_tkid(id)];
+}
+
+/* Invoked from timekeeping after a clocksource change */
+static void tk_aux_update_clocksource(void)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ unsigned int id;
+
+ for_each_set_bit(id, &active, BITS_PER_LONG) {
+ struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];
+ struct timekeeper *tks = &tkd->shadow_timekeeper;
+
+ guard(raw_spinlock_irqsave)(&tkd->lock);
+ if (!tks->clock_valid)
+ continue;
+
+ timekeeping_forward_now(tks);
+ tk_setup_internals(tks, tk_core.timekeeper.tkr_raw.clock);
+ timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL);
+ }
+}
+
+static void tk_aux_advance(void)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ unsigned int id;
+
+ /* Lockless quick check to avoid extra cache lines */
+ for_each_set_bit(id, &active, BITS_PER_LONG) {
+ struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST];
+
+ guard(raw_spinlock)(&aux_tkd->lock);
+ if (aux_tkd->shadow_timekeeper.clock_valid)
+ __timekeeping_advance(aux_tkd, TK_ADV_TICK);
+ }
+}
+
+/**
+ * ktime_get_aux - Get time for an AUX clock
+ * @id: ID of the clock to read (CLOCK_AUX...)
+ * @kt: Pointer to ktime_t to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux(clockid_t id, ktime_t *kt)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tk;
+ unsigned int seq;
+ ktime_t base;
+ u64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ if (!aux_tkd)
+ return false;
+
+ aux_tk = &aux_tkd->timekeeper;
+ do {
+ seq = read_seqcount_begin(&aux_tkd->seq);
+ if (!aux_tk->clock_valid)
+ return false;
+
+ base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux);
+ nsecs = timekeeping_get_ns(&aux_tk->tkr_mono);
+ } while (read_seqcount_retry(&aux_tkd->seq, seq));
+
+ *kt = ktime_add_ns(base, nsecs);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux);
+
+/**
+ * ktime_get_aux_ts64 - Get time for an AUX clock
+ * @id: ID of the clock to read (CLOCK_AUX...)
+ * @ts: Pointer to timespec64 to store the time stamp
+ *
+ * Returns: True if the timestamp is valid, false otherwise
+ */
+bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts)
+{
+ ktime_t now;
+
+ if (!ktime_get_aux(id, &now))
+ return false;
+ *ts = ktime_to_timespec64(now);
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_get_aux_ts64);
+
+static int aux_get_res(clockid_t id, struct timespec64 *tp)
+{
+ if (!clockid_aux_valid(id))
+ return -ENODEV;
+
+ tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC;
+ tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC;
+ return 0;
+}
+
+static int aux_get_timespec(clockid_t id, struct timespec64 *tp)
+{
+ return ktime_get_aux_ts64(id, tp) ? 0 : -ENODEV;
+}
+
+static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tks;
+ ktime_t tnow, nsecs;
+
+ if (!timespec64_valid_settod(tnew))
+ return -EINVAL;
+ if (!aux_tkd)
+ return -ENODEV;
+
+ aux_tks = &aux_tkd->shadow_timekeeper;
+
+ guard(raw_spinlock_irq)(&aux_tkd->lock);
+ if (!aux_tks->clock_valid)
+ return -ENODEV;
+
+ /* Forward the timekeeper base time */
+ timekeeping_forward_now(aux_tks);
+ /*
+ * Get the updated base time. tkr_mono.base has not been
+ * updated yet, so do that first. That makes the update
+ * in timekeeping_update_from_shadow() redundant, but
+ * that's harmless. After that @tnow can be calculated
+ * by using tkr_mono::cycle_last, which has been set
+ * by timekeeping_forward_now().
+ */
+ tk_update_ktime_data(aux_tks);
+ nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last);
+ tnow = ktime_add(aux_tks->tkr_mono.base, nsecs);
+
+ /*
+ * Calculate the new AUX offset as delta to @tnow ("monotonic").
+ * That avoids all the tk::xtime back and forth conversions as
+ * xtime ("realtime") is not applicable for auxiliary clocks and
+ * kept in sync with "monotonic".
+ */
+ tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow));
+
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+ return 0;
+}
+
+static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct adjtimex_result result = { };
+
+ if (!aux_tkd)
+ return -ENODEV;
+
+ /*
+ * @result is ignored for now as there are neither hrtimers nor an
+ * RTC related to auxiliary clocks.
+ */
+ return __do_adjtimex(aux_tkd, txc, &result);
+}
+
+const struct k_clock clock_aux = {
+ .clock_getres = aux_get_res,
+ .clock_get_timespec = aux_get_timespec,
+ .clock_set = aux_clock_set,
+ .clock_adj = aux_clock_adj,
+};
+
+static void aux_clock_enable(clockid_t id)
+{
+ struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw;
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+ struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper;
+
+ /* Prevent the core timekeeper from changing. */
+ guard(raw_spinlock_irq)(&tk_core.lock);
+
+ /*
+ * Setup the auxiliary clock assuming that the raw core timekeeper
+ * clock frequency conversion is close enough. Userspace has to
+ * adjust for the deviation via clock_adjtime(2).
+ */
+ guard(raw_spinlock_nested)(&aux_tkd->lock);
+
+ /* Remove leftovers of a previous registration */
+ memset(aux_tks, 0, sizeof(*aux_tks));
+ /* Restore the timekeeper id */
+ aux_tks->id = aux_tkd->timekeeper.id;
+ /* Setup the timekeeper based on the current system clocksource */
+ tk_setup_internals(aux_tks, tkr_raw->clock);
+
+ /* Mark it valid and set it live */
+ aux_tks->clock_valid = true;
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+}
+
+static void aux_clock_disable(clockid_t id)
+{
+ struct tk_data *aux_tkd = aux_get_tk_data(id);
+
+ guard(raw_spinlock_irq)(&aux_tkd->lock);
+ aux_tkd->shadow_timekeeper.clock_valid = false;
+ timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL);
+}
+
+static DEFINE_MUTEX(aux_clock_mutex);
+
+static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ /* Lazy atoi() as name is "0..7" */
+ int id = kobj->name[0] & 0x7;
+ bool enable;
+
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+
+ if (kstrtobool(buf, &enable) < 0)
+ return -EINVAL;
+
+ guard(mutex)(&aux_clock_mutex);
+ if (enable == test_bit(id, &aux_timekeepers))
+ return count;
+
+ if (enable) {
+ aux_clock_enable(CLOCK_AUX + id);
+ set_bit(id, &aux_timekeepers);
+ } else {
+ aux_clock_disable(CLOCK_AUX + id);
+ clear_bit(id, &aux_timekeepers);
+ }
+ return count;
+}
+
+static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned long active = READ_ONCE(aux_timekeepers);
+ /* Lazy atoi() as name is "0..7" */
+ int id = kobj->name[0] & 0x7;
+
+ return sysfs_emit(buf, "%d\n", test_bit(id, &active));
+}
+
+static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable);
+
+static struct attribute *aux_clock_enable_attrs[] = {
+ &aux_clock_enable_attr.attr,
+ NULL
+};
+
+static const struct attribute_group aux_clock_enable_attr_group = {
+ .attrs = aux_clock_enable_attrs,
+};
+
+static int __init tk_aux_sysfs_init(void)
+{
+ struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj);
+ int ret = -ENOMEM;
+
+ if (!tko)
+ return ret;
+
+ auxo = kobject_create_and_add("aux_clocks", tko);
+ if (!auxo)
+ goto err_clean;
+
+ for (int i = 0; i < MAX_AUX_CLOCKS; i++) {
+ char id[2] = { [0] = '0' + i, };
+ struct kobject *clk = kobject_create_and_add(id, auxo);
+
+ if (!clk) {
+ ret = -ENOMEM;
+ goto err_clean;
+ }
+
+ ret = sysfs_create_group(clk, &aux_clock_enable_attr_group);
+ if (ret)
+ goto err_clean;
+ }
+ return 0;
+
+err_clean:
+ kobject_put(auxo);
+ kobject_put(tko);
+ return ret;
+}
+late_initcall(tk_aux_sysfs_init);
+
+static __init void tk_aux_setup(void)
+{
+ for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++)
+ tkd_basic_setup(&timekeeper_data[i], i, false);
+}
+#endif /* CONFIG_POSIX_AUX_CLOCKS */
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 543beba096c7..198d0608db74 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -9,6 +9,8 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
ktime_t *offs_boot,
ktime_t *offs_tai);
+bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles);
+
extern int timekeeping_valid_for_hres(void);
extern u64 timekeeping_max_deferment(void);
extern void timekeeping_warp_clock(void);
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index 8c9079108ffb..973ede670a36 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -45,4 +45,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta)
unsigned long timekeeper_lock_irqsave(void);
void timekeeper_unlock_irqrestore(unsigned long flags);
+/* NTP specific interface to access the current seconds value */
+long ktime_get_ntp_seconds(unsigned int id);
+
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index c8f776dc6ee0..04d928c21aba 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -281,7 +281,7 @@ DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
static void timers_update_migration(void)
{
- if (sysctl_timer_migration && tick_nohz_active)
+ if (sysctl_timer_migration && tick_nohz_is_active())
static_branch_enable(&timers_migration_enabled);
else
static_branch_disable(&timers_migration_enabled);
@@ -386,32 +386,6 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
}
/**
- * __round_jiffies - function to round jiffies to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * __round_jiffies() rounds an absolute time in the future (in jiffies)
- * up or down to (approximately) full seconds. This is useful for timers
- * for which the exact time they fire does not matter too much, as long as
- * they fire approximately every X seconds.
- *
- * By rounding these timers to whole seconds, all such timers will fire
- * at the same time, rather than at various times spread out. The goal
- * of this is to have the CPU wake up less, which saves power.
- *
- * The exact rounding is skewed for each processor to avoid all
- * processors firing at the exact same time, which could lead
- * to lock contention or spurious cache line bouncing.
- *
- * The return value is the rounded version of the @j parameter.
- */
-unsigned long __round_jiffies(unsigned long j, int cpu)
-{
- return round_jiffies_common(j, cpu, false);
-}
-EXPORT_SYMBOL_GPL(__round_jiffies);
-
-/**
* __round_jiffies_relative - function to round jiffies to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
@@ -483,22 +457,6 @@ unsigned long round_jiffies_relative(unsigned long j)
EXPORT_SYMBOL_GPL(round_jiffies_relative);
/**
- * __round_jiffies_up - function to round jiffies up to a full second
- * @j: the time in (absolute) jiffies that should be rounded
- * @cpu: the processor number on which the timeout will happen
- *
- * This is the same as __round_jiffies() except that it will never
- * round down. This is useful for timeouts for which the exact time
- * of firing does not matter too much, as long as they don't fire too
- * early.
- */
-unsigned long __round_jiffies_up(unsigned long j, int cpu)
-{
- return round_jiffies_common(j, cpu, true);
-}
-EXPORT_SYMBOL_GPL(__round_jiffies_up);
-
-/**
* __round_jiffies_up_relative - function to round jiffies up to a full second
* @j: the time in (relative) jiffies that should be rounded
* @cpu: the processor number on which the timeout will happen
@@ -744,7 +702,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_ACTIVE:
- del_timer_sync(timer);
+ timer_delete_sync(timer);
debug_object_init(timer, &timer_debug_descr);
return true;
default:
@@ -790,7 +748,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state)
switch (state) {
case ODEBUG_STATE_ACTIVE:
- del_timer_sync(timer);
+ timer_delete_sync(timer);
debug_object_free(timer, &timer_debug_descr);
return true;
default:
@@ -850,7 +808,7 @@ static void do_init_timer(struct timer_list *timer,
unsigned int flags,
const char *name, struct lock_class_key *key);
-void init_timer_on_stack_key(struct timer_list *timer,
+void timer_init_key_on_stack(struct timer_list *timer,
void (*func)(struct timer_list *),
unsigned int flags,
const char *name, struct lock_class_key *key)
@@ -858,13 +816,13 @@ void init_timer_on_stack_key(struct timer_list *timer,
debug_object_init_on_stack(timer, &timer_debug_descr);
do_init_timer(timer, func, flags, name, key);
}
-EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
+EXPORT_SYMBOL_GPL(timer_init_key_on_stack);
-void destroy_timer_on_stack(struct timer_list *timer)
+void timer_destroy_on_stack(struct timer_list *timer)
{
debug_object_free(timer, &timer_debug_descr);
}
-EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
+EXPORT_SYMBOL_GPL(timer_destroy_on_stack);
#else
static inline void debug_timer_init(struct timer_list *timer) { }
@@ -904,7 +862,7 @@ static void do_init_timer(struct timer_list *timer,
}
/**
- * init_timer_key - initialize a timer
+ * timer_init_key - initialize a timer
* @timer: the timer to be initialized
* @func: timer callback function
* @flags: timer flags
@@ -912,17 +870,17 @@ static void do_init_timer(struct timer_list *timer,
* @key: lockdep class key of the fake lock used for tracking timer
* sync lock dependencies
*
- * init_timer_key() must be done to a timer prior to calling *any* of the
+ * timer_init_key() must be done to a timer prior to calling *any* of the
* other timer functions.
*/
-void init_timer_key(struct timer_list *timer,
+void timer_init_key(struct timer_list *timer,
void (*func)(struct timer_list *), unsigned int flags,
const char *name, struct lock_class_key *key)
{
debug_init(timer);
do_init_timer(timer, func, flags, name, key);
}
-EXPORT_SYMBOL(init_timer_key);
+EXPORT_SYMBOL(timer_init_key);
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
@@ -1212,10 +1170,10 @@ EXPORT_SYMBOL(mod_timer_pending);
*
* mod_timer(timer, expires) is equivalent to:
*
- * del_timer(timer); timer->expires = expires; add_timer(timer);
+ * timer_delete(timer); timer->expires = expires; add_timer(timer);
*
* mod_timer() is more efficient than the above open coded sequence. In
- * case that the timer is inactive, the del_timer() part is a NOP. The
+ * case that the timer is inactive, the timer_delete() part is a NOP. The
* timer is in any case activated with the new expiry time @expires.
*
* Note that if there are multiple unserialized concurrent users of the
@@ -1500,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
base = lock_timer_base(timer, &flags);
- if (base->running_timer != timer)
+ if (base->running_timer != timer) {
ret = detach_if_pending(timer, base, true);
- if (shutdown)
- timer->function = NULL;
+ if (shutdown)
+ timer->function = NULL;
+ }
raw_spin_unlock_irqrestore(&base->lock, flags);
@@ -1511,7 +1470,7 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
}
/**
- * try_to_del_timer_sync - Try to deactivate a timer
+ * timer_delete_sync_try - Try to deactivate a timer
* @timer: Timer to deactivate
*
* This function tries to deactivate a timer. On success the timer is not
@@ -1526,11 +1485,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
* * %1 - The timer was pending and deactivated
* * %-1 - The timer callback function is running on a different CPU
*/
-int try_to_del_timer_sync(struct timer_list *timer)
+int timer_delete_sync_try(struct timer_list *timer)
{
return __try_to_del_timer_sync(timer, false);
}
-EXPORT_SYMBOL(try_to_del_timer_sync);
+EXPORT_SYMBOL(timer_delete_sync_try);
#ifdef CONFIG_PREEMPT_RT
static __init void timer_base_init_expiry_lock(struct timer_base *base)
@@ -1900,7 +1859,7 @@ static void timer_recalc_next_expiry(struct timer_base *base)
unsigned long clk, next, adj;
unsigned lvl, offset = 0;
- next = base->clk + NEXT_TIMER_MAX_DELTA;
+ next = base->clk + TIMER_NEXT_MAX_DELTA;
clk = base->clk;
for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
@@ -1963,7 +1922,7 @@ static void timer_recalc_next_expiry(struct timer_base *base)
WRITE_ONCE(base->next_expiry, next);
base->next_expiry_recalc = false;
- base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
+ base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -2015,7 +1974,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base,
* easy comparable to find out which base holds the first pending timer.
*/
if (!base->timers_pending)
- WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA);
+ WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA);
return base->next_expiry;
}
@@ -2360,6 +2319,7 @@ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
*/
void timer_clear_idle(void)
{
+ int this_cpu = smp_processor_id();
/*
* We do this unlocked. The worst outcome is a remote pinned timer
* enqueue sending a pointless IPI, but taking the lock would just
@@ -2368,9 +2328,9 @@ void timer_clear_idle(void)
* path. Required for BASE_LOCAL only.
*/
__this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
- if (tick_nohz_full_cpu(smp_processor_id()))
+ if (tick_nohz_full_cpu(this_cpu))
__this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
- trace_timer_base_idle(false, smp_processor_id());
+ trace_timer_base_idle(false, this_cpu);
/* Activate without holding the timer_base->lock */
tmigr_cpu_activate();
@@ -2399,7 +2359,7 @@ static inline void __run_timers(struct timer_base *base)
* timer at this clk are that all matching timers have been
* dequeued or no timer has been queued since
* base::next_expiry was set to base::clk +
- * NEXT_TIMER_MAX_DELTA.
+ * TIMER_NEXT_MAX_DELTA.
*/
WARN_ON_ONCE(!levels && !base->next_expiry_recalc
&& base->timers_pending);
@@ -2514,7 +2474,7 @@ void update_process_times(int user_tick)
run_local_timers();
rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
- if (in_irq())
+ if (in_hardirq())
irq_work_tick();
#endif
sched_tick();
@@ -2544,7 +2504,7 @@ int timers_prepare_cpu(unsigned int cpu)
for (b = 0; b < NR_BASES; b++) {
base = per_cpu_ptr(&timer_bases[b], cpu);
base->clk = jiffies;
- base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
base->next_expiry_recalc = false;
base->timers_pending = false;
base->is_idle = false;
@@ -2599,7 +2559,7 @@ static void __init init_timer_cpu(int cpu)
base->cpu = cpu;
raw_spin_lock_init(&base->lock);
base->clk = jiffies;
- base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA;
timer_base_init_expiry_lock(base);
}
}
@@ -2612,7 +2572,7 @@ static void __init init_timer_cpus(void)
init_timer_cpu(cpu);
}
-void __init init_timers(void)
+void __init timers_init(void)
{
init_timer_cpus();
posix_cputimers_init_work();
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1c311c46da50..427d7ddea3af 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -46,8 +46,8 @@ static void
print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
int idx, u64 now)
{
- SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function);
- SEQ_printf(m, ", S:%02x", timer->state);
+ SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function));
+ SEQ_printf(m, ", S:%02x", timer->is_queued);
SEQ_printf(m, "\n");
SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
(unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
@@ -56,13 +56,11 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
(long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
}
-static void
-print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
- u64 now)
+static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
+ struct timerqueue_linked_node *curr;
struct hrtimer *timer, tmp;
unsigned long next = 0, i;
- struct timerqueue_node *curr;
unsigned long flags;
next_one:
@@ -72,13 +70,13 @@ next_one:
raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
- curr = timerqueue_getnext(&base->active);
+ curr = timerqueue_linked_first(&base->active);
/*
* Crude but we have to do this O(N*N) thing, because
* we have to unlock the base when printing:
*/
while (curr && i < next) {
- curr = timerqueue_iterate_next(curr);
+ curr = timerqueue_linked_next(curr);
i++;
}
@@ -98,15 +96,13 @@ next_one:
static void
print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
{
- SEQ_printf(m, " .base: %pK\n", base);
+ SEQ_printf(m, " .base: %p\n", base);
SEQ_printf(m, " .index: %d\n", base->index);
SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution);
-
- SEQ_printf(m, " .get_time: %ps\n", base->get_time);
#ifdef CONFIG_HIGH_RES_TIMERS
- SEQ_printf(m, " .offset: %Lu nsecs\n",
- (unsigned long long) ktime_to_ns(base->offset));
+ SEQ_printf(m, " .offset: %Ld nsecs\n",
+ (long long) base->offset);
#endif
SEQ_printf(m, "active timers:\n");
print_active_timers(m, base, now + ktime_to_ns(base->offset));
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 2f6330831f08..52c15affdbff 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -10,6 +10,7 @@
#include <linux/spinlock.h>
#include <linux/timerqueue.h>
#include <trace/events/ipi.h>
+#include <linux/sched/isolation.h>
#include "timer_migration.h"
#include "tick-internal.h"
@@ -420,14 +421,53 @@ static struct list_head *tmigr_level_list __read_mostly;
static unsigned int tmigr_hierarchy_levels __read_mostly;
static unsigned int tmigr_crossnode_level __read_mostly;
+static struct tmigr_group *tmigr_root;
+
static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
+/*
+ * CPUs available for timer migration.
+ * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
+ * Additionally tmigr_available_mutex serializes set/clear operations with each other.
+ */
+static cpumask_var_t tmigr_available_cpumask;
+static DEFINE_MUTEX(tmigr_available_mutex);
+
+/* Enabled during late initcall */
+static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated);
+
#define TMIGR_NONE 0xFF
#define BIT_CNT 8
static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc)
{
- return !(tmc->tmgroup && tmc->online);
+ return !(tmc->tmgroup && tmc->available);
+}
+
+/*
+ * Returns true if @cpu should be excluded from the hierarchy as isolated.
+ * Domain isolated CPUs don't participate in timer migration, nohz_full CPUs
+ * are still part of the hierarchy but become idle (from a tick and timer
+ * migration perspective) when they stop their tick. This lets the timekeeping
+ * CPU handle their global timers. Marking also isolated CPUs as idle would be
+ * too costly, hence they are completely excluded from the hierarchy.
+ * This check is necessary, for instance, to prevent offline isolated CPUs from
+ * being incorrectly marked as available once getting back online.
+ *
+ * This function returns false during early boot and the isolation logic is
+ * enabled only after isolated CPUs are marked as unavailable at late boot.
+ * The tick CPU can be isolated at boot, however we cannot mark it as
+ * unavailable to avoid having no global migrator for the nohz_full CPUs. This
+ * should be ensured by the callers of this function: implicitly from hotplug
+ * callbacks and explicitly in tmigr_init_isolation() and
+ * tmigr_isolated_exclude_cpumask().
+ */
+static inline bool tmigr_is_isolated(int cpu)
+{
+ if (!static_branch_unlikely(&tmigr_exclude_isolated))
+ return false;
+ return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) &&
+ housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE));
}
/*
@@ -502,11 +542,6 @@ static bool tmigr_check_lonely(struct tmigr_group *group)
* @now: timer base monotonic
* @check: is set if there is the need to handle remote timers;
* required in tmigr_requires_handle_remote() only
- * @tmc_active: this flag indicates, whether the CPU which triggers
- * the hierarchy walk is !idle in the timer migration
- * hierarchy. When the CPU is idle and the whole hierarchy is
- * idle, only the first event of the top level has to be
- * considered.
*/
struct tmigr_walk {
u64 nextexp;
@@ -517,16 +552,13 @@ struct tmigr_walk {
unsigned long basej;
u64 now;
bool check;
- bool tmc_active;
};
typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *);
-static void __walk_groups(up_f up, struct tmigr_walk *data,
- struct tmigr_cpu *tmc)
+static void __walk_groups_from(up_f up, struct tmigr_walk *data,
+ struct tmigr_group *child, struct tmigr_group *group)
{
- struct tmigr_group *child = NULL, *group = tmc->tmgroup;
-
do {
WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);
@@ -544,6 +576,12 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
} while (group);
}
+static void __walk_groups(up_f up, struct tmigr_walk *data,
+ struct tmigr_cpu *tmc)
+{
+ __walk_groups_from(up, data, NULL, tmc->tmgroup);
+}
+
static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc)
{
lockdep_assert_held(&tmc->lock);
@@ -708,7 +746,7 @@ void tmigr_cpu_activate(void)
/*
* Returns true, if there is nothing to be propagated to the next level
*
- * @data->firstexp is set to expiry of first gobal event of the (top level of
+ * @data->firstexp is set to expiry of first global event of the (top level of
* the) hierarchy, but only when hierarchy is completely idle.
*
* The child and group states need to be read under the lock, to prevent a race
@@ -926,7 +964,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
* updated the event takes care when hierarchy is completely
* idle. Otherwise the migrator does it as the event is enqueued.
*/
- if (!tmc->online || tmc->remote || tmc->cpuevt.ignore ||
+ if (!tmc->available || tmc->remote || tmc->cpuevt.ignore ||
now < tmc->cpuevt.nextevt.expires) {
raw_spin_unlock_irq(&tmc->lock);
return;
@@ -940,8 +978,12 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
/* Drop the lock to allow the remote CPU to exit idle */
raw_spin_unlock_irq(&tmc->lock);
- if (cpu != smp_processor_id())
- timer_expire_remote(cpu);
+ /*
+ * This can't exclude the local CPU because jiffies might have advanced
+ * after the timer softirq invoked run_timer_base(BASE_GLOBAL) and the
+ * point where the jiffies snapshot @jif was taken in tmigr_handle_remote().
+ */
+ timer_expire_remote(cpu);
/*
* Lock ordering needs to be preserved - timer_base locks before tmigr
@@ -973,7 +1015,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now,
* (See also section "Required event and timerqueue update after a
* remote expiry" in the documentation at the top)
*/
- if (!tmc->online || !tmc->idle) {
+ if (!tmc->available || !tmc->idle) {
timer_unlock_remote_bases(cpu);
goto unlock;
}
@@ -1113,15 +1155,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group,
*/
if (!tmigr_check_migrator(group, childmask))
return true;
-
- /*
- * When there is a parent group and the CPU which triggered the
- * hierarchy walk is not active, proceed the walk to reach the top level
- * group before reading the next_expiry value.
- */
- if (group->parent && !data->tmc_active)
- return false;
-
/*
* The lock is required on 32bit architectures to read the variable
* consistently with a concurrent writer. On 64bit the lock is not
@@ -1166,7 +1199,6 @@ bool tmigr_requires_handle_remote(void)
data.now = get_jiffies_update(&jif);
data.childmask = tmc->groupmask;
data.firstexp = KTIME_MAX;
- data.tmc_active = !tmc->idle;
data.check = false;
/*
@@ -1405,23 +1437,20 @@ u64 tmigr_quick_check(u64 nextevt)
return KTIME_MAX;
do {
- if (!tmigr_check_lonely(group)) {
+ if (!tmigr_check_lonely(group))
return KTIME_MAX;
- } else {
- /*
- * Since current CPU is active, events may not be sorted
- * from bottom to the top because the CPU's event is ignored
- * up to the top and its sibling's events not propagated upwards.
- * Thus keep track of the lowest observed expiry.
- */
- nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));
- if (!group->parent)
- return nextevt;
- }
+
+ /*
+ * Since current CPU is active, events may not be sorted
+ * from bottom to the top because the CPU's event is ignored
+ * up to the top and its sibling's events not propagated upwards.
+ * Thus keep track of the lowest observed expiry.
+ */
+ nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry));
group = group->parent;
} while (group);
- return KTIME_MAX;
+ return nextevt;
}
/*
@@ -1435,38 +1464,43 @@ static long tmigr_trigger_active(void *unused)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
- WARN_ON_ONCE(!tmc->online || tmc->idle);
+ WARN_ON_ONCE(!tmc->available || tmc->idle);
return 0;
}
-static int tmigr_cpu_offline(unsigned int cpu)
+static int tmigr_clear_cpu_available(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
int migrator;
u64 firstexp;
- raw_spin_lock_irq(&tmc->lock);
- tmc->online = false;
- WRITE_ONCE(tmc->wakeup, KTIME_MAX);
+ guard(mutex)(&tmigr_available_mutex);
- /*
- * CPU has to handle the local events on his own, when on the way to
- * offline; Therefore nextevt value is set to KTIME_MAX
- */
- firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
- trace_tmigr_cpu_offline(tmc);
- raw_spin_unlock_irq(&tmc->lock);
+ cpumask_clear_cpu(cpu, tmigr_available_cpumask);
+ scoped_guard(raw_spinlock_irq, &tmc->lock) {
+ if (!tmc->available)
+ return 0;
+ tmc->available = false;
+ WRITE_ONCE(tmc->wakeup, KTIME_MAX);
+
+ /*
+ * CPU has to handle the local events on its own, when on the way to
+ * offline; therefore nextevt value is set to KTIME_MAX
+ */
+ firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX);
+ trace_tmigr_cpu_unavailable(tmc);
+ }
if (firstexp != KTIME_MAX) {
- migrator = cpumask_any_but(cpu_online_mask, cpu);
+ migrator = cpumask_any(tmigr_available_cpumask);
work_on_cpu(migrator, tmigr_trigger_active, NULL);
}
return 0;
}
-static int tmigr_cpu_online(unsigned int cpu)
+static int __tmigr_set_cpu_available(unsigned int cpu)
{
struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu);
@@ -1474,16 +1508,131 @@ static int tmigr_cpu_online(unsigned int cpu)
if (WARN_ON_ONCE(!tmc->tmgroup))
return -EINVAL;
- raw_spin_lock_irq(&tmc->lock);
- trace_tmigr_cpu_online(tmc);
- tmc->idle = timer_base_is_idle();
- if (!tmc->idle)
- __tmigr_cpu_activate(tmc);
- tmc->online = true;
- raw_spin_unlock_irq(&tmc->lock);
+ guard(mutex)(&tmigr_available_mutex);
+
+ cpumask_set_cpu(cpu, tmigr_available_cpumask);
+ scoped_guard(raw_spinlock_irq, &tmc->lock) {
+ if (tmc->available)
+ return 0;
+ trace_tmigr_cpu_available(tmc);
+ tmc->idle = timer_base_is_idle();
+ if (!tmc->idle)
+ __tmigr_cpu_activate(tmc);
+ tmc->available = true;
+ }
+ return 0;
+}
+
+static int tmigr_set_cpu_available(unsigned int cpu)
+{
+ if (tmigr_is_isolated(cpu))
+ return 0;
+
+ return __tmigr_set_cpu_available(cpu);
+}
+
+static void tmigr_cpu_isolate(struct work_struct *ignored)
+{
+ tmigr_clear_cpu_available(smp_processor_id());
+}
+
+static void tmigr_cpu_unisolate(struct work_struct *ignored)
+{
+ /*
+ * Don't call tmigr_is_isolated() ->housekeeping_cpu() directly because
+ * the cpuset mutex is correctly held by the workqueue caller but lockdep
+ * doesn't know that.
+ */
+ __tmigr_set_cpu_available(smp_processor_id());
+}
+
+/**
+ * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy
+ * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy
+ *
+ * This function can be called from cpuset code to provide the new set of
+ * isolated CPUs that should be excluded from the hierarchy.
+ * Online CPUs not present in exclude_cpumask but already excluded are brought
+ * back to the hierarchy.
+ * Functions to isolate/unisolate need to be called locally and can sleep.
+ */
+int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask)
+{
+ struct work_struct __percpu *works __free(free_percpu) =
+ alloc_percpu(struct work_struct);
+ cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+ int cpu;
+
+ if (!works)
+ return -ENOMEM;
+ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ /*
+ * First set previously isolated CPUs as available (unisolate).
+ * This cpumask contains only CPUs that switched to available now.
+ */
+ guard(cpus_read_lock)();
+ cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask);
+ cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask);
+
+ for_each_cpu(cpu, cpumask) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, tmigr_cpu_unisolate);
+ schedule_work_on(cpu, work);
+ }
+ for_each_cpu(cpu, cpumask)
+ flush_work(per_cpu_ptr(works, cpu));
+
+ /*
+ * Then clear previously available CPUs (isolate).
+ * This cpumask contains only CPUs that switched to not available now.
+ * There cannot be overlap with the newly available ones.
+ */
+ cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask);
+ cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE));
+ /*
+ * Handle this here and not in the cpuset code because exclude_cpumask
+ * might include also the tick CPU if included in isolcpus.
+ */
+ for_each_cpu(cpu, cpumask) {
+ if (!tick_nohz_cpu_hotpluggable(cpu)) {
+ cpumask_clear_cpu(cpu, cpumask);
+ break;
+ }
+ }
+
+ for_each_cpu(cpu, cpumask) {
+ struct work_struct *work = per_cpu_ptr(works, cpu);
+
+ INIT_WORK(work, tmigr_cpu_isolate);
+ schedule_work_on(cpu, work);
+ }
+ for_each_cpu(cpu, cpumask)
+ flush_work(per_cpu_ptr(works, cpu));
+
return 0;
}
+static int __init tmigr_init_isolation(void)
+{
+ cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL;
+
+ static_branch_enable(&tmigr_exclude_isolated);
+
+ if (!housekeeping_enabled(HK_TYPE_DOMAIN))
+ return 0;
+ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+
+ /* Protect against RCU torture hotplug testing */
+ return tmigr_isolated_exclude_cpumask(cpumask);
+}
+late_initcall(tmigr_init_isolation);
+
static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
int node)
{
@@ -1501,21 +1650,6 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
s.seq = 0;
atomic_set(&group->migr_state, s.state);
- /*
- * If this is a new top-level, prepare its groupmask in advance.
- * This avoids accidents where yet another new top-level is
- * created in the future and made visible before the current groupmask.
- */
- if (list_empty(&tmigr_level_list[lvl])) {
- group->groupmask = BIT(0);
- /*
- * The previous top level has prepared its groupmask already,
- * simply account it as the first child.
- */
- if (lvl > 0)
- group->num_children = 1;
- }
-
timerqueue_init_head(&group->events);
timerqueue_init(&group->groupevt.nextevt);
group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1523,8 +1657,7 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
group->groupevt.ignore = true;
}
-static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,
- unsigned int lvl)
+static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl)
{
struct tmigr_group *tmp, *group = NULL;
@@ -1570,25 +1703,51 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,
return group;
}
+static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+{
+ if (!group->parent && group != tmigr_root) {
+ /*
+ * This is the new top-level, prepare its groupmask in advance
+ * to avoid accidents where yet another new top-level is
+ * created in the future and made visible before this groupmask.
+ */
+ group->groupmask = BIT(0);
+ WARN_ON_ONCE(activate);
+
+ return true;
+ }
+
+ return false;
+
+}
+
static void tmigr_connect_child_parent(struct tmigr_group *child,
struct tmigr_group *parent,
bool activate)
{
- struct tmigr_walk data;
-
- raw_spin_lock_irq(&child->lock);
- raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
+ if (tmigr_init_root(parent, activate)) {
+ /*
+ * The previous top level had prepared its groupmask already,
+ * simply account it in advance as the first child. If some groups
+ * have been created between the old and new root due to node
+ * mismatch, the new root's child will be initialized accordingly.
+ */
+ parent->num_children = 1;
+ }
- if (activate) {
+ /* Connecting old root to new root ? */
+ if (!parent->parent && activate) {
/*
- * @child is the old top and @parent the new one. In this
- * case groupmask is pre-initialized and @child already
- * accounted, along with its new sibling corresponding to the
- * CPU going up.
+ * @child is the old top, or in case of node mismatch, some
+ * intermediate group between the old top and the new one in
+ * @parent. In this case the @child must be pre-accounted above
+ * as the first child. Its new inactive sibling corresponding
+ * to the CPU going up has been accounted as the second child.
*/
- WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+ WARN_ON_ONCE(parent->num_children != 2);
+ child->groupmask = BIT(0);
} else {
- /* Adding @child for the CPU going up to @parent. */
+ /* Common case adding @child for the CPU going up to @parent. */
child->groupmask = BIT(parent->num_children++);
}
@@ -1599,87 +1758,61 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
*/
smp_store_release(&child->parent, parent);
- raw_spin_unlock(&parent->lock);
- raw_spin_unlock_irq(&child->lock);
-
trace_tmigr_connect_child_parent(child);
-
- if (!activate)
- return;
-
- /*
- * To prevent inconsistent states, active children need to be active in
- * the new parent as well. Inactive children are already marked inactive
- * in the parent group:
- *
- * * When new groups were created by tmigr_setup_groups() starting from
- * the lowest level (and not higher then one level below the current
- * top level), then they are not active. They will be set active when
- * the new online CPU comes active.
- *
- * * But if a new group above the current top level is required, it is
- * mandatory to propagate the active state of the already existing
- * child to the new parent. So tmigr_connect_child_parent() is
- * executed with the formerly top level group (child) and the newly
- * created group (parent).
- *
- * * It is ensured that the child is active, as this setup path is
- * executed in hotplug prepare callback. This is exectued by an
- * already connected and !idle CPU. Even if all other CPUs go idle,
- * the CPU executing the setup will be responsible up to current top
- * level group. And the next time it goes inactive, it will release
- * the new childmask and parent to subsequent walkers through this
- * @child. Therefore propagate active state unconditionally.
- */
- data.childmask = child->groupmask;
-
- /*
- * There is only one new level per time (which is protected by
- * tmigr_mutex). When connecting the child and the parent and set the
- * child active when the parent is inactive, the parent needs to be the
- * uppermost level. Otherwise there went something wrong!
- */
- WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
}
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
+static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
+ struct tmigr_group *start, bool activate)
{
struct tmigr_group *group, *child, **stack;
- int top = 0, err = 0, i = 0;
- struct list_head *lvllist;
+ int i, top = 0, err = 0, start_lvl = 0;
+ bool root_mismatch = false;
- stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL);
+ stack = kzalloc_objs(*stack, tmigr_hierarchy_levels);
if (!stack)
return -ENOMEM;
- do {
- group = tmigr_get_group(cpu, node, i);
+ if (start) {
+ stack[start->level] = start;
+ start_lvl = start->level + 1;
+ }
+
+ if (tmigr_root)
+ root_mismatch = tmigr_root->numa_node != node;
+
+ for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
+ group = tmigr_get_group(node, i);
if (IS_ERR(group)) {
err = PTR_ERR(group);
+ i--;
break;
}
top = i;
- stack[i++] = group;
+ stack[i] = group;
/*
* When booting only less CPUs of a system than CPUs are
- * available, not all calculated hierarchy levels are required.
+ * available, not all calculated hierarchy levels are required,
+ * unless a node mismatch is detected.
*
* The loop is aborted as soon as the highest level, which might
* be different from tmigr_hierarchy_levels, contains only a
- * single group.
+ * single group, unless the nodes mismatch below tmigr_crossnode_level.
*/
- if (group->parent || list_is_singular(&tmigr_level_list[i - 1]))
+ if (group->parent)
break;
+ if ((!root_mismatch || i >= tmigr_crossnode_level) &&
+ list_is_singular(&tmigr_level_list[i]))
+ break;
+ }
- } while (i < tmigr_hierarchy_levels);
-
- /* Assert single root */
- WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top]));
+ /* Assert single root without parent */
+ if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels))
+ return -EINVAL;
- while (i > 0) {
- group = stack[--i];
+ for (; i >= start_lvl; i--) {
+ group = stack[i];
if (err < 0) {
list_del(&group->list);
@@ -1695,12 +1828,10 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
if (i == 0) {
struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu);
- raw_spin_lock_irq(&group->lock);
-
tmc->tmgroup = group;
tmc->groupmask = BIT(group->num_children++);
- raw_spin_unlock_irq(&group->lock);
+ tmigr_init_root(group, activate);
trace_tmigr_connect_cpu_parent(tmc);
@@ -1708,42 +1839,76 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
continue;
} else {
child = stack[i - 1];
- /* Will be activated at online time */
- tmigr_connect_child_parent(child, group, false);
+ tmigr_connect_child_parent(child, group, activate);
}
+ }
- /* check if uppermost level was newly created */
- if (top != i)
- continue;
-
- WARN_ON_ONCE(top == 0);
+ if (err < 0)
+ goto out;
- lvllist = &tmigr_level_list[top];
+ if (activate) {
+ struct tmigr_walk data;
+ union tmigr_state state;
/*
- * Newly created root level should have accounted the upcoming
- * CPU's child group and pre-accounted the old root.
+ * To prevent inconsistent states, active children need to be active in
+ * the new parent as well. Inactive children are already marked inactive
+ * in the parent group:
+ *
+ * * When new groups were created by tmigr_setup_groups() starting from
+ * the lowest level, then they are not active. They will be set active
+ * when the new online CPU comes active.
+ *
+ * * But if new groups above the current top level are required, it is
+ * mandatory to propagate the active state of the already existing
+ * child to the new parents. So tmigr_active_up() activates the
+ * new parents while walking up from the old root to the new.
+ *
+ * * It is ensured that @start is active (or on the way to be activated
+ * by another CPU that woke up before the current one), as this setup path
+ * is executed in hotplug prepare callback. This is executed by an already
+ * connected and !idle CPU in the hierarchy.
+ *
+ * * The below RmW atomic operation ensures that:
+ *
+ * 1) If the old root has been completely activated, the latest state is
+ * acquired (the below implicit acquire pairs with the implicit release
+ * from cmpxchg() in tmigr_active_up()).
+ *
+ * 2) If the old root is still on the way to be activated, the lagging behind
+ * CPU performing the activation will acquire the links up to the new root.
+ * (The below implicit release pairs with the implicit acquire from cmpxchg()
+ * in tmigr_active_up()).
+ *
+ * 3) Every subsequent CPU below the old root will acquire the new links while
+ * walking through the old root (The below implicit release pairs with the
+ * implicit acquire from cmpxchg() in either tmigr_active_up() or
+ * tmigr_inactive_up()).
+ */
+ state.state = atomic_fetch_or(0, &start->migr_state);
+ WARN_ON_ONCE(!start->parent);
+ /*
+ * If the state of the old root is inactive, another CPU is on its way to activate
+ * it and propagate to the new root.
*/
- if (group->num_children == 2 && list_is_singular(lvllist)) {
- /*
- * The target CPU must never do the prepare work, except
- * on early boot when the boot CPU is the target. Otherwise
- * it may spuriously activate the old top level group inside
- * the new one (nevertheless whether old top level group is
- * active or not) and/or release an uninitialized childmask.
- */
- WARN_ON_ONCE(cpu == raw_smp_processor_id());
-
- lvllist = &tmigr_level_list[top - 1];
- list_for_each_entry(child, lvllist, list) {
- if (child->parent)
- continue;
-
- tmigr_connect_child_parent(child, group, true);
- }
+ if (state.active) {
+ data.childmask = start->groupmask;
+ __walk_groups_from(tmigr_active_up, &data, start, start->parent);
}
}
+ /* Root update */
+ if (list_is_singular(&tmigr_level_list[top])) {
+ group = list_first_entry(&tmigr_level_list[top],
+ typeof(*group), list);
+ WARN_ON_ONCE(group->parent);
+ if (tmigr_root) {
+ /* Old root should be the same or below */
+ WARN_ON_ONCE(tmigr_root->level > top);
+ }
+ tmigr_root = group;
+ }
+out:
kfree(stack);
return err;
@@ -1751,12 +1916,31 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
static int tmigr_add_cpu(unsigned int cpu)
{
+ struct tmigr_group *old_root = tmigr_root;
int node = cpu_to_node(cpu);
int ret;
- mutex_lock(&tmigr_mutex);
- ret = tmigr_setup_groups(cpu, node);
- mutex_unlock(&tmigr_mutex);
+ guard(mutex)(&tmigr_mutex);
+
+ ret = tmigr_setup_groups(cpu, node, NULL, false);
+
+ /* Root has changed? Connect the old one to the new */
+ if (ret >= 0 && old_root && old_root != tmigr_root) {
+ /*
+ * The target CPU must never do the prepare work, except
+ * on early boot when the boot CPU is the target. Otherwise
+ * it may spuriously activate the old top level group inside
+ * the new one (nevertheless whether old top level group is
+ * active or not) and/or release an uninitialized childmask.
+ */
+ WARN_ON_ONCE(cpu == raw_smp_processor_id());
+ /*
+ * The (likely) current CPU is expected to be online in the hierarchy,
+ * otherwise the old root may not be active as expected.
+ */
+ WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available);
+ ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+ }
return ret;
}
@@ -1801,6 +1985,11 @@ static int __init tmigr_init(void)
if (ncpus == 1)
return 0;
+ if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
/*
* Calculate the required hierarchy levels. Unfortunately there is no
* reliable information available, unless all possible CPUs have been
@@ -1832,7 +2021,8 @@ static int __init tmigr_init(void)
*/
tmigr_crossnode_level = cpulvl;
- tmigr_level_list = kcalloc(tmigr_hierarchy_levels, sizeof(struct list_head), GFP_KERNEL);
+ tmigr_level_list = kzalloc_objs(struct list_head,
+ tmigr_hierarchy_levels);
if (!tmigr_level_list)
goto err;
@@ -1850,7 +2040,7 @@ static int __init tmigr_init(void)
goto err;
ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online",
- tmigr_cpu_online, tmigr_cpu_offline);
+ tmigr_set_cpu_available, tmigr_clear_cpu_available);
if (ret)
goto err;
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index ae19f70f8170..70879cde6fdd 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -97,7 +97,7 @@ struct tmigr_group {
*/
struct tmigr_cpu {
raw_spinlock_t lock;
- bool online;
+ bool available;
bool idle;
bool remote;
struct tmigr_group *tmgroup;
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c
index 05d383143165..aa59919b8f2c 100644
--- a/kernel/time/vsyscall.c
+++ b/kernel/time/vsyscall.c
@@ -15,29 +15,28 @@
#include "timekeeping_internal.h"
-static inline void update_vdso_data(struct vdso_data *vdata,
- struct timekeeper *tk)
+static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base)
{
+ vc->cycle_last = base->cycle_last;
+#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
+ vc->max_cycles = base->clock->max_cycles;
+#endif
+ vc->mask = base->mask;
+ vc->mult = base->mult;
+ vc->shift = base->shift;
+}
+
+static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk)
+{
+ struct vdso_clock *vc = vdata->clock_data;
struct vdso_timestamp *vdso_ts;
u64 nsec, sec;
- vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last;
-#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
- vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles;
-#endif
- vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask;
- vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult;
- vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift;
- vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last;
-#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT
- vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles;
-#endif
- vdata[CS_RAW].mask = tk->tkr_raw.mask;
- vdata[CS_RAW].mult = tk->tkr_raw.mult;
- vdata[CS_RAW].shift = tk->tkr_raw.shift;
+ fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono);
+ fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw);
/* CLOCK_MONOTONIC */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
nsec = tk->tkr_mono.xtime_nsec;
@@ -55,7 +54,7 @@ static inline void update_vdso_data(struct vdso_data *vdata,
nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift;
/* CLOCK_BOOTTIME */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME];
vdso_ts->sec = sec;
while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
@@ -65,19 +64,20 @@ static inline void update_vdso_data(struct vdso_data *vdata,
vdso_ts->nsec = nsec;
/* CLOCK_MONOTONIC_RAW */
- vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
+ vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW];
vdso_ts->sec = tk->raw_sec;
vdso_ts->nsec = tk->tkr_raw.xtime_nsec;
/* CLOCK_TAI */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI];
vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
}
void update_vsyscall(struct timekeeper *tk)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
+ struct vdso_clock *vc = vdata->clock_data;
struct vdso_timestamp *vdso_ts;
s32 clock_mode;
u64 nsec;
@@ -86,55 +86,95 @@ void update_vsyscall(struct timekeeper *tk)
vdso_write_begin(vdata);
clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
- vdata[CS_HRES_COARSE].clock_mode = clock_mode;
- vdata[CS_RAW].clock_mode = clock_mode;
+ vc[CS_HRES_COARSE].clock_mode = clock_mode;
+ vc[CS_RAW].clock_mode = clock_mode;
/* CLOCK_REALTIME also required for time() */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME];
vdso_ts->sec = tk->xtime_sec;
vdso_ts->nsec = tk->tkr_mono.xtime_nsec;
/* CLOCK_REALTIME_COARSE */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE];
vdso_ts->sec = tk->xtime_sec;
- vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ vdso_ts->nsec = tk->coarse_nsec;
/* CLOCK_MONOTONIC_COARSE */
- vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
+ vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE];
vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
- nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec = tk->coarse_nsec;
nsec = nsec + tk->wall_to_monotonic.tv_nsec;
vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec);
/*
* Read without the seqlock held by clock_getres().
- * Note: No need to have a second copy.
*/
- WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution);
+ WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution);
/*
* If the current clocksource is not VDSO capable, then spare the
* update of the high resolution parts.
*/
if (clock_mode != VDSO_CLOCKMODE_NONE)
- update_vdso_data(vdata, tk);
+ update_vdso_time_data(vdata, tk);
- __arch_update_vsyscall(vdata);
+ __arch_update_vdso_clock(&vc[CS_HRES_COARSE]);
+ __arch_update_vdso_clock(&vc[CS_RAW]);
vdso_write_end(vdata);
- __arch_sync_vdso_data(vdata);
+ __arch_sync_vdso_time_data(vdata);
}
void update_vsyscall_tz(void)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
+
+ vdata->tz_minuteswest = sys_tz.tz_minuteswest;
+ vdata->tz_dsttime = sys_tz.tz_dsttime;
+
+ __arch_sync_vdso_time_data(vdata);
+}
+
+#ifdef CONFIG_POSIX_AUX_CLOCKS
+void vdso_time_update_aux(struct timekeeper *tk)
+{
+ struct vdso_time_data *vdata = vdso_k_time_data;
+ struct vdso_timestamp *vdso_ts;
+ struct vdso_clock *vc;
+ s32 clock_mode;
+ u64 nsec;
+
+ vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST];
+ vdso_ts = &vc->basetime[VDSO_BASE_AUX];
+ clock_mode = tk->tkr_mono.clock->vdso_clock_mode;
+ if (!tk->clock_valid)
+ clock_mode = VDSO_CLOCKMODE_NONE;
+
+ /* copy vsyscall data */
+ vdso_write_begin_clock(vc);
- vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest;
- vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime;
+ vc->clock_mode = clock_mode;
- __arch_sync_vdso_data(vdata);
+ if (clock_mode != VDSO_CLOCKMODE_NONE) {
+ fill_clock_configuration(vc, &tk->tkr_mono);
+
+ vdso_ts->sec = tk->xtime_sec + tk->monotonic_to_aux.tv_sec;
+
+ nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
+ nsec += tk->monotonic_to_aux.tv_nsec;
+ vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec);
+ nsec = nsec << tk->tkr_mono.shift;
+ vdso_ts->nsec = nsec;
+ }
+
+ __arch_update_vdso_clock(vc);
+
+ vdso_write_end_clock(vc);
+
+ __arch_sync_vdso_time_data(vdata);
}
+#endif
/**
* vdso_update_begin - Start of a VDSO update section
@@ -150,7 +190,7 @@ void update_vsyscall_tz(void)
*/
unsigned long vdso_update_begin(void)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
unsigned long flags = timekeeper_lock_irqsave();
vdso_write_begin(vdata);
@@ -167,9 +207,9 @@ unsigned long vdso_update_begin(void)
*/
void vdso_update_end(unsigned long flags)
{
- struct vdso_data *vdata = __arch_get_k_vdso_data();
+ struct vdso_time_data *vdata = vdso_k_time_data;
vdso_write_end(vdata);
- __arch_sync_vdso_data(vdata);
+ __arch_sync_vdso_time_data(vdata);
timekeeper_unlock_irqrestore(flags);
}