diff options
Diffstat (limited to 'kernel/time')
38 files changed, 3712 insertions, 2227 deletions
diff --git a/kernel/time/.kunitconfig b/kernel/time/.kunitconfig new file mode 100644 index 000000000000..d60a611b2853 --- /dev/null +++ b/kernel/time/.kunitconfig @@ -0,0 +1,2 @@ +CONFIG_KUNIT=y +CONFIG_TIME_KUNIT_TEST=y diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b0b97a60aaa6..02aac7c5aa76 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -9,14 +9,13 @@ config CLOCKSOURCE_WATCHDOG bool -# Architecture has extra clocksource data -config ARCH_CLOCKSOURCE_DATA - bool - # Architecture has extra clocksource init called from registration config ARCH_CLOCKSOURCE_INIT bool +config ARCH_WANTS_CLOCKSOURCE_READ_INLINE + bool + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool @@ -44,10 +43,23 @@ config GENERIC_CLOCKEVENTS_BROADCAST_IDLE config GENERIC_CLOCKEVENTS_MIN_ADJUST bool +config GENERIC_CLOCKEVENTS_COUPLED + bool + +config GENERIC_CLOCKEVENTS_COUPLED_INLINE + select GENERIC_CLOCKEVENTS_COUPLED + bool + # Generic update of CMOS clock config GENERIC_CMOS_UPDATE bool +# Deferred rearming of the hrtimer interrupt +config HRTIMER_REARM_DEFERRED + def_bool y + depends on GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS + depends on HIGH_RES_TIMERS && SCHED_HRTICK + # Select to handle posix CPU timers from task_work # and not from the timer interrupt context config HAVE_POSIX_CPU_TIMERS_TASK_WORK @@ -82,9 +94,9 @@ config CONTEXT_TRACKING_IDLE help Tracks idle state on behalf of RCU. -if GENERIC_CLOCKEVENTS menu "Timers subsystem" +if GENERIC_CLOCKEVENTS # Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independent of this. @@ -196,18 +208,17 @@ config HIGH_RES_TIMERS hardware is not capable then this option only increases the size of the kernel image. 
-config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US - int "Clocksource watchdog maximum allowable skew (in microseconds)" - depends on CLOCKSOURCE_WATCHDOG - range 50 1000 - default 125 +endif + +config POSIX_AUX_CLOCKS + bool "Enable auxiliary POSIX clocks" + depends on POSIX_TIMERS help - Specify the maximum amount of allowable watchdog skew in - microseconds before reporting the clocksource to be unstable. - The default is based on a half-second clocksource watchdog - interval and NTP's maximum frequency drift of 500 parts - per million. If the clocksource is good enough for NTP, - it is good enough for the clocksource watchdog! + Auxiliary POSIX clocks are clocks which can be steered + independently of the core timekeeper, which controls the + MONOTONIC, REALTIME, BOOTTIME and TAI clocks. They are useful to + provide e.g. lockless time accessors to independent PTP clocks + and other clock domains, which are not correlated to the TAI/NTP + notion of time. endmenu -endif diff --git a/kernel/time/Makefile b/kernel/time/Makefile index fe0ae82124fe..eaf290c972f9 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 + +# Branch profiling isn't noinstr-safe +ifdef CONFIG_TRACE_BRANCH_PROFILING +CFLAGS_sched_clock.o += -DDISABLE_BRANCH_PROFILING +endif + obj-y += time.o timer.o hrtimer.o sleep_timeout.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o timecounter.o alarmtimer.o @@ -20,9 +26,10 @@ obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o ifeq ($(CONFIG_SMP),y) obj-$(CONFIG_NO_HZ_COMMON) += timer_migration.o endif -obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o +obj-$(CONFIG_GENERIC_GETTIMEOFDAY) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o obj-$(CONFIG_TIME_NS) += namespace.o +obj-$(CONFIG_TIME_NS_VDSO) += namespace_vdso.o obj-$(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) += clocksource-wdtest.o obj-$(CONFIG_TIME_KUNIT_TEST) += 
time_test.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0ddccdff119a..6e173d70d825 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -35,7 +35,7 @@ /** * struct alarm_base - Alarm timer bases - * @lock: Lock for syncrhonized access to the base + * @lock: Lock for synchronized access to the base * @timerqueue: Timerqueue head managing the list of events * @get_ktime: Function to read the time correlating to the base * @get_timespec: Function to read the namespace time correlating to the base @@ -70,12 +70,10 @@ static DEFINE_SPINLOCK(rtcdev_lock); */ struct rtc_device *alarmtimer_get_rtcdev(void) { - unsigned long flags; struct rtc_device *ret; - spin_lock_irqsave(&rtcdev_lock, flags); + guard(spinlock_irqsave)(&rtcdev_lock); ret = rtcdev; - spin_unlock_irqrestore(&rtcdev_lock, flags); return ret; } @@ -83,7 +81,6 @@ EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev); static int alarmtimer_rtc_add_device(struct device *dev) { - unsigned long flags; struct rtc_device *rtc = to_rtc_device(dev); struct platform_device *pdev; int ret = 0; @@ -101,25 +98,18 @@ static int alarmtimer_rtc_add_device(struct device *dev) if (!IS_ERR(pdev)) device_init_wakeup(&pdev->dev, true); - spin_lock_irqsave(&rtcdev_lock, flags); - if (!IS_ERR(pdev) && !rtcdev) { - if (!try_module_get(rtc->owner)) { + scoped_guard(spinlock_irqsave, &rtcdev_lock) { + if (!IS_ERR(pdev) && !rtcdev && try_module_get(rtc->owner)) { + rtcdev = rtc; + /* hold a reference so it doesn't go away */ + get_device(dev); + pdev = NULL; + } else { ret = -1; - goto unlock; } - - rtcdev = rtc; - /* hold a reference so it doesn't go away */ - get_device(dev); - pdev = NULL; - } else { - ret = -1; } -unlock: - spin_unlock_irqrestore(&rtcdev_lock, flags); platform_device_unregister(pdev); - return ret; } @@ -198,7 +188,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) struct alarm *alarm = container_of(timer, struct alarm, timer); struct alarm_base *base = 
&alarm_bases[alarm->type]; - scoped_guard (spinlock_irqsave, &base->lock) + scoped_guard(spinlock_irqsave, &base->lock) alarmtimer_dequeue(base, alarm); if (alarm->function) @@ -228,37 +218,39 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); static int alarmtimer_suspend(struct device *dev) { ktime_t min, now, expires; - int i, ret, type; struct rtc_device *rtc; - unsigned long flags; struct rtc_time tm; + int i, ret, type; - spin_lock_irqsave(&freezer_delta_lock, flags); - min = freezer_delta; - expires = freezer_expires; - type = freezer_alarmtype; - freezer_delta = 0; - spin_unlock_irqrestore(&freezer_delta_lock, flags); + scoped_guard(spinlock_irqsave, &freezer_delta_lock) { + min = freezer_delta; + expires = freezer_expires; + type = freezer_alarmtype; + freezer_delta = 0; + } rtc = alarmtimer_get_rtcdev(); /* If we have no rtcdev, just return */ if (!rtc) return 0; - /* Find the soonest timer to expire*/ + /* Find the soonest timer to expire */ for (i = 0; i < ALARM_NUMTYPE; i++) { struct alarm_base *base = &alarm_bases[i]; struct timerqueue_node *next; + ktime_t next_expires; ktime_t delta; - spin_lock_irqsave(&base->lock, flags); - next = timerqueue_getnext(&base->timerqueue); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + next = timerqueue_getnext(&base->timerqueue); + if (next) + next_expires = next->expires; + } if (!next) continue; - delta = ktime_sub(next->expires, base->get_ktime()); + delta = ktime_sub(next_expires, base->get_ktime()); if (!min || (delta < min)) { - expires = next->expires; + expires = next_expires; min = delta; type = i; } @@ -352,13 +344,12 @@ EXPORT_SYMBOL_GPL(alarm_init); void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - spin_lock_irqsave(&base->lock, flags); - alarm->node.expires = start; - alarmtimer_enqueue(base, alarm); - hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); - 
spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + alarm->node.expires = start; + alarmtimer_enqueue(base, alarm); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); + } trace_alarmtimer_start(alarm, base->get_ktime()); } @@ -381,13 +372,11 @@ EXPORT_SYMBOL_GPL(alarm_start_relative); void alarm_restart(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; - spin_lock_irqsave(&base->lock, flags); + guard(spinlock_irqsave)(&base->lock); hrtimer_set_expires(&alarm->timer, alarm->node.expires); hrtimer_restart(&alarm->timer); alarmtimer_enqueue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(alarm_restart); @@ -401,14 +390,13 @@ EXPORT_SYMBOL_GPL(alarm_restart); int alarm_try_to_cancel(struct alarm *alarm) { struct alarm_base *base = &alarm_bases[alarm->type]; - unsigned long flags; int ret; - spin_lock_irqsave(&base->lock, flags); - ret = hrtimer_try_to_cancel(&alarm->timer); - if (ret >= 0) - alarmtimer_dequeue(base, alarm); - spin_unlock_irqrestore(&base->lock, flags); + scoped_guard(spinlock_irqsave, &base->lock) { + ret = hrtimer_try_to_cancel(&alarm->timer); + if (ret >= 0) + alarmtimer_dequeue(base, alarm); + } trace_alarmtimer_cancel(alarm, base->get_ktime()); return ret; @@ -479,7 +467,6 @@ EXPORT_SYMBOL_GPL(alarm_forward_now); static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) { struct alarm_base *base; - unsigned long flags; ktime_t delta; switch(type) { @@ -498,13 +485,12 @@ static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) delta = ktime_sub(absexp, base->get_ktime()); - spin_lock_irqsave(&freezer_delta_lock, flags); + guard(spinlock_irqsave)(&freezer_delta_lock); if (!freezer_delta || (delta < freezer_delta)) { freezer_delta = delta; freezer_expires = absexp; freezer_alarmtype = type; } - spin_unlock_irqrestore(&freezer_delta_lock, flags); } /** @@ -515,9 +501,9 @@ 
static enum alarmtimer_type clock2alarm(clockid_t clockid) { if (clockid == CLOCK_REALTIME_ALARM) return ALARM_REALTIME; - if (clockid == CLOCK_BOOTTIME_ALARM) - return ALARM_BOOTTIME; - return -1; + + WARN_ON_ONCE(clockid != CLOCK_BOOTTIME_ALARM); + return ALARM_BOOTTIME; } /** @@ -558,7 +544,7 @@ static s64 alarm_timer_forward(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = &timr->it.alarm.alarmtimer; - return alarm_forward(alarm, timr->it_interval, now); + return alarm_forward(alarm, now, timr->it_interval); } /** diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index f3e831f62906..0014d163f989 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -2,7 +2,7 @@ /* * This file contains functions which manage clock event devices. * - * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ @@ -94,6 +94,9 @@ static int __clockevents_switch_state(struct clock_event_device *dev, if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; + /* On state transitions clear the forced flag unconditionally */ + dev->next_event_forced = 0; + /* Transition with new state-specific callbacks */ switch (state) { case CLOCK_EVT_STATE_DETACHED: @@ -172,6 +175,7 @@ void clockevents_shutdown(struct clock_event_device *dev) { clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event = KTIME_MAX; + dev->next_event_forced = 0; } /** @@ -292,6 +296,38 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED +#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE +#include <asm/clock_inlined.h> +#else +static __always_inline void +arch_inlined_clockevent_set_next_coupled(u64 cycles, struct clock_event_device *dev) { } +#endif + 
+static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) +{ + u64 cycles; + + if (unlikely(!(dev->features & CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED))) + return false; + + if (unlikely(!ktime_expiry_to_cycles(dev->cs_id, expires, &cycles))) + return false; + + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE)) + arch_inlined_clockevent_set_next_coupled(cycles, dev); + else + dev->set_next_coupled(cycles, dev); + return true; +} + +#else +static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) +{ + return false; +} +#endif + /** * clockevents_program_event - Reprogram the clock event device. * @dev: device to program @@ -300,12 +336,10 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) * * Returns 0 on success, -ETIME when the event is in the past. */ -int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, - bool force) +int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force) { - unsigned long long clc; int64_t delta; - int rc; + u64 cycles; if (WARN_ON_ONCE(expires < 0)) return -ETIME; @@ -319,21 +353,37 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev)); - /* Shortcut for clockevent devices that can deal with ktime. */ - if (dev->features & CLOCK_EVT_FEAT_KTIME) + /* ktime_t based reprogramming for the broadcast hrtimer device */ + if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER)) return dev->set_next_ktime(expires, dev); + if (likely(clockevent_set_next_coupled(dev, expires))) + return 0; + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); - if (delta <= 0) - return force ? 
clockevents_program_min_delta(dev) : -ETIME; - delta = min(delta, (int64_t) dev->max_delta_ns); - delta = max(delta, (int64_t) dev->min_delta_ns); + /* Required for tick_periodic() during early boot */ + if (delta <= 0 && !force) + return -ETIME; - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; - rc = dev->set_next_event((unsigned long) clc, dev); + if (delta > (int64_t)dev->min_delta_ns) { + delta = min(delta, (int64_t) dev->max_delta_ns); + cycles = ((u64)delta * dev->mult) >> dev->shift; + if (!dev->set_next_event((unsigned long) cycles, dev)) { + dev->next_event_forced = 0; + return 0; + } + } + + if (dev->next_event_forced) + return 0; - return (rc && force) ? clockevents_program_min_delta(dev) : rc; + if (dev->set_next_event(dev->min_delta_ticks, dev)) { + if (!force || clockevents_program_min_delta(dev)) + return -ETIME; + } + dev->next_event_forced = 1; + return 0; } /* @@ -633,7 +683,7 @@ void tick_offline_cpu(unsigned int cpu) raw_spin_lock(&clockevents_lock); tick_broadcast_offline(cpu); - tick_shutdown(cpu); + tick_shutdown(); /* * Unregister the clock event devices which were diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 38dae590b29f..b4cf17b4aeed 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -3,202 +3,196 @@ * Unit test for the clocksource watchdog. * * Copyright (C) 2021 Facebook, Inc. + * Copyright (C) 2026 Intel Corp. * * Author: Paul E. 
McKenney <paulmck@kernel.org> + * Author: Thomas Gleixner <tglx@kernel.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/device.h> #include <linux/clocksource.h> -#include <linux/init.h> +#include <linux/delay.h> #include <linux/module.h> -#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ -#include <linux/tick.h> #include <linux/kthread.h> -#include <linux/delay.h> -#include <linux/prandom.h> -#include <linux/cpu.h> #include "tick-internal.h" +#include "timekeeping_internal.h" MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Clocksource watchdog unit test"); MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>"); +MODULE_AUTHOR("Thomas Gleixner <tglx@kernel.org>"); + +enum wdtest_states { + WDTEST_INJECT_NONE, + WDTEST_INJECT_DELAY, + WDTEST_INJECT_POSITIVE, + WDTEST_INJECT_NEGATIVE, + WDTEST_INJECT_PERCPU = 0x100, +}; -static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0; -module_param(holdoff, int, 0444); -MODULE_PARM_DESC(holdoff, "Time to wait to start test (s)."); +static enum wdtest_states wdtest_state; +static unsigned long wdtest_test_count; +static ktime_t wdtest_last_ts, wdtest_offset; -/* Watchdog kthread's task_struct pointer for debug purposes. */ -static struct task_struct *wdtest_task; +#define SHIFT_4000PPM 8 -static u64 wdtest_jiffies_read(struct clocksource *cs) +static ktime_t wdtest_get_offset(struct clocksource *cs) { - return (u64)jiffies; -} - -static struct clocksource clocksource_wdtest_jiffies = { - .name = "wdtest-jiffies", - .rating = 1, /* lowest valid rating*/ - .uncertainty_margin = TICK_NSEC, - .read = wdtest_jiffies_read, - .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_MUST_VERIFY, - .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ - .shift = JIFFIES_SHIFT, - .max_cycles = 10, -}; + if (wdtest_state < WDTEST_INJECT_PERCPU) + return wdtest_test_count & 0x1 ? 
0 : wdtest_offset >> SHIFT_4000PPM; -static int wdtest_ktime_read_ndelays; -static bool wdtest_ktime_read_fuzz; + /* Only affect the readout of the "remote" CPU */ + return cs->wd_cpu == smp_processor_id() ? 0 : NSEC_PER_MSEC; +} static u64 wdtest_ktime_read(struct clocksource *cs) { - int wkrn = READ_ONCE(wdtest_ktime_read_ndelays); - static int sign = 1; - u64 ret; + ktime_t now = ktime_get_raw_fast_ns(); + ktime_t intv = now - wdtest_last_ts; - if (wkrn) { - udelay(cs->uncertainty_margin / 250); - WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1); - } - ret = ktime_get_real_fast_ns(); - if (READ_ONCE(wdtest_ktime_read_fuzz)) { - sign = -sign; - ret = ret + sign * 100 * NSEC_PER_MSEC; + /* + * Only increment the test counter once per watchdog interval and + * store the interval for the offset calculation of this step. This + * guarantees a consistent behaviour even if the other side needs + * to repeat due to a watchdog read timeout. + */ + if (intv > (NSEC_PER_SEC / 4)) { + WRITE_ONCE(wdtest_test_count, wdtest_test_count + 1); + wdtest_last_ts = now; + wdtest_offset = intv; } - return ret; -} -static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs) -{ - pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name); + switch (wdtest_state & ~WDTEST_INJECT_PERCPU) { + case WDTEST_INJECT_POSITIVE: + return now + wdtest_get_offset(cs); + case WDTEST_INJECT_NEGATIVE: + return now - wdtest_get_offset(cs); + case WDTEST_INJECT_DELAY: + udelay(500); + return now; + default: + return now; + } } -#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \ - CLOCK_SOURCE_VALID_FOR_HRES | \ - CLOCK_SOURCE_MUST_VERIFY | \ - CLOCK_SOURCE_VERIFY_PERCPU) +#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \ + CLOCK_SOURCE_CALIBRATED | \ + CLOCK_SOURCE_MUST_VERIFY | \ + CLOCK_SOURCE_WDTEST) static struct clocksource clocksource_wdtest_ktime = { .name = "wdtest-ktime", - .rating = 300, + .rating = 10, .read = wdtest_ktime_read, .mask = CLOCKSOURCE_MASK(64), .flags = 
KTIME_FLAGS, - .mark_unstable = wdtest_ktime_cs_mark_unstable, .list = LIST_HEAD_INIT(clocksource_wdtest_ktime.list), }; -/* Reset the clocksource if needed. */ -static void wdtest_ktime_clocksource_reset(void) +static void wdtest_clocksource_reset(enum wdtest_states which, bool percpu) +{ + clocksource_unregister(&clocksource_wdtest_ktime); + + pr_info("Test: State %d percpu %d\n", which, percpu); + + wdtest_state = which; + if (percpu) + wdtest_state |= WDTEST_INJECT_PERCPU; + wdtest_test_count = 0; + wdtest_last_ts = 0; + + clocksource_wdtest_ktime.rating = 10; + clocksource_wdtest_ktime.flags = KTIME_FLAGS; + if (percpu) + clocksource_wdtest_ktime.flags |= CLOCK_SOURCE_WDTEST_PERCPU; + clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); +} + +static bool wdtest_execute(enum wdtest_states which, bool percpu, unsigned int expect, + unsigned long calls) { - if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) { - clocksource_unregister(&clocksource_wdtest_ktime); - clocksource_wdtest_ktime.flags = KTIME_FLAGS; - schedule_timeout_uninterruptible(HZ / 10); - clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); + wdtest_clocksource_reset(which, percpu); + + for (; READ_ONCE(wdtest_test_count) < calls; msleep(100)) { + unsigned int flags = READ_ONCE(clocksource_wdtest_ktime.flags); + + if (kthread_should_stop()) + return false; + + if (flags & CLOCK_SOURCE_UNSTABLE) { + if (expect & CLOCK_SOURCE_UNSTABLE) + return true; + pr_warn("Fail: Unexpected unstable\n"); + return false; + } + if (flags & CLOCK_SOURCE_VALID_FOR_HRES) { + if (expect & CLOCK_SOURCE_VALID_FOR_HRES) + return true; + pr_warn("Fail: Unexpected valid for highres\n"); + return false; + } } + + if (!expect) + return true; + + pr_warn("Fail: Timed out\n"); + return false; } -/* Run the specified series of watchdog tests. 
*/ -static int wdtest_func(void *arg) +static bool wdtest_run(bool percpu) { - unsigned long j1, j2; - int i, max_retries; - char *s; + if (!wdtest_execute(WDTEST_INJECT_NONE, percpu, CLOCK_SOURCE_VALID_FOR_HRES, 8)) + return false; - schedule_timeout_uninterruptible(holdoff * HZ); + if (!wdtest_execute(WDTEST_INJECT_DELAY, percpu, 0, 4)) + return false; - /* - * Verify that jiffies-like clocksources get the manually - * specified uncertainty margin. - */ - pr_info("--- Verify jiffies-like uncertainty margin.\n"); - __clocksource_register(&clocksource_wdtest_jiffies); - WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC); + if (!wdtest_execute(WDTEST_INJECT_POSITIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8)) + return false; - j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies); - schedule_timeout_uninterruptible(HZ); - j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies); - WARN_ON_ONCE(j1 == j2); + if (!wdtest_execute(WDTEST_INJECT_NEGATIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8)) + return false; - clocksource_unregister(&clocksource_wdtest_jiffies); + return true; +} - /* - * Verify that tsc-like clocksources are assigned a reasonable - * uncertainty margin. - */ - pr_info("--- Verify tsc-like uncertainty margin.\n"); +static int wdtest_func(void *arg) +{ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); - WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC); - - j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); - udelay(1); - j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); - pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1); - WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC), - "Expected at least 1000ns, got %lu.\n", j2 - j1); - - /* Verify tsc-like stability with various numbers of errors injected. 
*/ - max_retries = clocksource_get_max_watchdog_retry(); - for (i = 0; i <= max_retries + 1; i++) { - if (i <= 1 && i < max_retries) - s = ""; - else if (i <= max_retries) - s = ", expect message"; - else - s = ", expect clock skew"; - pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s); - WRITE_ONCE(wdtest_ktime_read_ndelays, i); - schedule_timeout_uninterruptible(2 * HZ); - WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays)); - WARN_ON_ONCE((i <= max_retries) != - !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); - wdtest_ktime_clocksource_reset(); + if (wdtest_run(false)) { + if (wdtest_run(true)) + pr_info("Success: All tests passed\n"); } - - /* Verify tsc-like stability with clock-value-fuzz error injection. */ - pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n"); - WRITE_ONCE(wdtest_ktime_read_fuzz, true); - schedule_timeout_uninterruptible(2 * HZ); - WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); - clocksource_verify_percpu(&clocksource_wdtest_ktime); - WRITE_ONCE(wdtest_ktime_read_fuzz, false); - clocksource_unregister(&clocksource_wdtest_ktime); - pr_info("--- Done with test.\n"); - return 0; -} + if (!IS_MODULE(CONFIG_TEST_CLOCKSOURCE_WATCHDOG)) + return 0; -static void wdtest_print_module_parms(void) -{ - pr_alert("--- holdoff=%d\n", holdoff); + while (!kthread_should_stop()) + schedule_timeout_interruptible(3600 * HZ); + return 0; } -/* Cleanup function. */ -static void clocksource_wdtest_cleanup(void) -{ -} +static struct task_struct *wdtest_thread; static int __init clocksource_wdtest_init(void) { - int ret = 0; - - wdtest_print_module_parms(); + struct task_struct *t = kthread_run(wdtest_func, NULL, "wdtest"); - /* Create watchdog-test task. 
*/ - wdtest_task = kthread_run(wdtest_func, NULL, "wdtest"); - if (IS_ERR(wdtest_task)) { - ret = PTR_ERR(wdtest_task); - pr_warn("%s: Failed to create wdtest kthread.\n", __func__); - wdtest_task = NULL; - return ret; + if (IS_ERR(t)) { + pr_warn("Failed to create wdtest kthread.\n"); + return PTR_ERR(t); } - + wdtest_thread = t; return 0; } - module_init(clocksource_wdtest_init); + +static void clocksource_wdtest_cleanup(void) +{ + if (wdtest_thread) + kthread_stop(wdtest_thread); +} module_exit(clocksource_wdtest_cleanup); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 2a7802ec480c..baee13a1f87f 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -7,15 +7,17 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/device.h> #include <linux/clocksource.h> +#include <linux/cpu.h> +#include <linux/delay.h> +#include <linux/device.h> #include <linux/init.h> -#include <linux/module.h> -#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */ -#include <linux/tick.h> #include <linux/kthread.h> +#include <linux/module.h> #include <linux/prandom.h> -#include <linux/cpu.h> +#include <linux/sched.h> +#include <linux/tick.h> +#include <linux/topology.h> #include "tick-internal.h" #include "timekeeping_internal.h" @@ -107,48 +109,6 @@ static char override_name[CS_NAME_LEN]; static int finished_booting; static u64 suspend_start; -/* - * Interval: 0.5sec. - */ -#define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ)) - -/* - * Threshold: 0.0312s, when doubled: 0.0625s. - */ -#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5) - -/* - * Maximum permissible delay between two readouts of the watchdog - * clocksource surrounding a read of the clocksource being validated. - * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as - * a lower bound for cs->uncertainty_margin values when registering clocks. 
- * - * The default of 500 parts per million is based on NTP's limits. - * If a clocksource is good enough for NTP, it is good enough for us! - * - * In other words, by default, even if a clocksource is extremely - * precise (for example, with a sub-nanosecond period), the maximum - * permissible skew between the clocksource watchdog and the clocksource - * under test is not permitted to go below the 500ppm minimum defined - * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the - * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option. - */ -#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US -#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US -#else -#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ) -#endif - -/* - * Default for maximum permissible skew when cs->uncertainty_margin is - * not specified, and the lower bound even when cs->uncertainty_margin - * is specified. This is also the default that is used when registering - * clocks with unspecifed cs->uncertainty_margin, so this macro is used - * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. - */ -#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) - #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); static void clocksource_select(void); @@ -160,7 +120,42 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; -static int64_t watchdog_max_interval; + +/* Watchdog interval: 0.5sec. */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_INTERVAL_NS (WATCHDOG_INTERVAL * (NSEC_PER_SEC / HZ)) + +/* Maximum time between two reference watchdog readouts */ +#define WATCHDOG_READOUT_MAX_NS (50U * NSEC_PER_USEC) + +/* + * Maximum time between two remote readouts for NUMA=n. On NUMA enabled systems + * the timeout is calculated from the numa distance. 
+ */ +#define WATCHDOG_DEFAULT_TIMEOUT_NS (50U * NSEC_PER_USEC) + +/* + * Remote timeout NUMA distance multiplier. The local distance is 10. The + * default remote distance is 20. ACPI tables provide more accurate numbers + * which are guaranteed to be greater than the local distance. + * + * This results in a 5us base value, which is equivalent to the above !NUMA + * default. + */ +#define WATCHDOG_NUMA_MULTIPLIER_NS ((u64)(WATCHDOG_DEFAULT_TIMEOUT_NS / LOCAL_DISTANCE)) + +/* Limit the NUMA timeout in case the distance values are insanely big */ +#define WATCHDOG_NUMA_MAX_TIMEOUT_NS ((u64)(500U * NSEC_PER_USEC)) + +/* Shift values to calculate the approximate $N ppm of a given delta. */ +#define SHIFT_500PPM 11 +#define SHIFT_4000PPM 8 + +/* Number of attempts to read the watchdog */ +#define WATCHDOG_FREQ_RETRIES 3 + +/* Five reads local and remote for inter CPU skew detection */ +#define WATCHDOG_REMOTE_MAX_SEQ 10 static inline void clocksource_watchdog_lock(unsigned long *flags) { @@ -241,210 +236,422 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -static int verify_n_cpus = 8; -module_param(verify_n_cpus, int, 0644); +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; -enum wd_read_status { - WD_READ_SUCCESS, - WD_READ_UNSTABLE, - WD_READ_SKIP + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + +enum wd_result { + WD_SUCCESS, + WD_FREQ_NO_WATCHDOG, + WD_FREQ_TIMEOUT, + WD_FREQ_RESET, + WD_FREQ_SKEWED, + WD_CPU_TIMEOUT, + WD_CPU_SKEWED, +}; + +struct watchdog_cpu_data { + /* Keep first as it is 32 byte aligned */ + call_single_data_t csd; + atomic_t remote_inprogress; + enum wd_result result; + u64 cpu_ts[2]; + struct clocksource *cs; + /* Ensure that the sequence is in a separate cache line */ + atomic_t seq ____cacheline_aligned; + /* Set by the control CPU according to NUMA distance */ + u64 timeout_ns; }; -static enum 
wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) -{ - int64_t md = 2 * watchdog->uncertainty_margin; - unsigned int nretries, max_retries; - int64_t wd_delay, wd_seq_delay; - u64 wd_end, wd_end2; - - max_retries = clocksource_get_max_watchdog_retry(); - for (nretries = 0; nretries <= max_retries; nretries++) { - local_irq_disable(); - *wdnow = watchdog->read(watchdog); - *csnow = cs->read(cs); - wd_end = watchdog->read(watchdog); - wd_end2 = watchdog->read(watchdog); - local_irq_enable(); - - wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); - if (wd_delay <= md + cs->uncertainty_margin) { - if (nretries > 1 && nretries >= max_retries) { - pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", - smp_processor_id(), watchdog->name, nretries); +struct watchdog_data { + raw_spinlock_t lock; + enum wd_result result; + + u64 wd_seq; + u64 wd_delta; + u64 cs_delta; + u64 cpu_ts[2]; + + unsigned int curr_cpu; +} ____cacheline_aligned_in_smp; + +static void watchdog_check_skew_remote(void *unused); + +static DEFINE_PER_CPU_ALIGNED(struct watchdog_cpu_data, watchdog_cpu_data) = { + .csd = CSD_INIT(watchdog_check_skew_remote, NULL), +}; + +static struct watchdog_data watchdog_data = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(watchdog_data.lock), +}; + +static inline void watchdog_set_result(struct watchdog_cpu_data *wd, enum wd_result result) +{ + guard(raw_spinlock)(&watchdog_data.lock); + if (!wd->result) { + atomic_set(&wd->seq, WATCHDOG_REMOTE_MAX_SEQ); + WRITE_ONCE(wd->result, result); + } +} + +/* Wait for the sequence number to hand over control. */ +static bool watchdog_wait_seq(struct watchdog_cpu_data *wd, u64 start, int seq) +{ + for(int cnt = 0; atomic_read(&wd->seq) < seq; cnt++) { + /* Bail if the other side set an error result */ + if (READ_ONCE(wd->result) != WD_SUCCESS) + return false; + + /* Prevent endless loops if the other CPU does not react. 
*/ + if (cnt == 5000) { + u64 nsecs = ktime_get_raw_fast_ns(); + + if (nsecs - start >=wd->timeout_ns) { + watchdog_set_result(wd, WD_CPU_TIMEOUT); + return false; } - return WD_READ_SUCCESS; + cnt = 0; } + cpu_relax(); + } + return seq < WATCHDOG_REMOTE_MAX_SEQ; +} - /* - * Now compute delay in consecutive watchdog read to see if - * there is too much external interferences that cause - * significant delay in reading both clocksource and watchdog. - * - * If consecutive WD read-back delay > md, report - * system busy, reinit the watchdog and skip the current - * watchdog test. - */ - wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2); - if (wd_seq_delay > md) - goto skip_test; +static void watchdog_check_skew(struct watchdog_cpu_data *wd, int index) +{ + u64 prev, now, delta, start = ktime_get_raw_fast_ns(); + int local = index, remote = (index + 1) & 0x1; + struct clocksource *cs = wd->cs; + + /* Set the local timestamp so that the first iteration works correctly */ + wd->cpu_ts[local] = cs->read(cs); + + /* Signal arrival */ + atomic_inc(&wd->seq); + + for (int seq = local + 2; seq < WATCHDOG_REMOTE_MAX_SEQ; seq += 2) { + if (!watchdog_wait_seq(wd, start, seq)) + return; + + /* Capture local timestamp before possible non-local coherency overhead */ + now = cs->read(cs); + + /* Store local timestamp before reading remote to limit coherency stalls */ + wd->cpu_ts[local] = now; + + prev = wd->cpu_ts[remote]; + delta = (now - prev) & cs->mask; + + if (delta > cs->max_raw_delta) { + watchdog_set_result(wd, WD_CPU_SKEWED); + return; + } + + /* Hand over to the remote CPU */ + atomic_inc(&wd->seq); } +} - pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. 
limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n", - smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name); - return WD_READ_UNSTABLE; +static void watchdog_check_skew_remote(void *unused) +{ + struct watchdog_cpu_data *wd = this_cpu_ptr(&watchdog_cpu_data); -skip_test: - pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n", - smp_processor_id(), watchdog->name, wd_seq_delay); - pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n", - cs->name, wd_delay); - return WD_READ_SKIP; + atomic_inc(&wd->remote_inprogress); + watchdog_check_skew(wd, 1); + atomic_dec(&wd->remote_inprogress); } -static u64 csnow_mid; -static cpumask_t cpus_ahead; -static cpumask_t cpus_behind; -static cpumask_t cpus_chosen; +static inline bool wd_csd_locked(struct watchdog_cpu_data *wd) +{ + return READ_ONCE(wd->csd.node.u_flags) & CSD_FLAG_LOCK; +} + +/* + * This is only invoked for remote CPUs. See watchdog_check_cpu_skew(). + */ +static inline u64 wd_get_remote_timeout(unsigned int remote_cpu) +{ + unsigned int n1, n2; + u64 ns; + + if (nr_node_ids == 1) + return WATCHDOG_DEFAULT_TIMEOUT_NS; + + n1 = cpu_to_node(smp_processor_id()); + n2 = cpu_to_node(remote_cpu); + ns = WATCHDOG_NUMA_MULTIPLIER_NS * node_distance(n1, n2); + return min(ns, WATCHDOG_NUMA_MAX_TIMEOUT_NS); +} -static void clocksource_verify_choose_cpus(void) +static void __watchdog_check_cpu_skew(struct clocksource *cs, unsigned int cpu) { - int cpu, i, n = verify_n_cpus; + struct watchdog_cpu_data *wd; - if (n < 0) { - /* Check all of the CPUs. */ - cpumask_copy(&cpus_chosen, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); + wd = per_cpu_ptr(&watchdog_cpu_data, cpu); + if (atomic_read(&wd->remote_inprogress) || wd_csd_locked(wd)) { + watchdog_data.result = WD_CPU_TIMEOUT; return; } - /* If no checking desired, or no other CPU to check, leave. 
*/ - cpumask_clear(&cpus_chosen); - if (n == 0 || num_online_cpus() <= 1) + atomic_set(&wd->seq, 0); + wd->result = WD_SUCCESS; + wd->cs = cs; + /* Store the current CPU ID for the watchdog test unit */ + cs->wd_cpu = smp_processor_id(); + + wd->timeout_ns = wd_get_remote_timeout(cpu); + + /* Kick the remote CPU into the watchdog function */ + if (WARN_ON_ONCE(smp_call_function_single_async(cpu, &wd->csd))) { + watchdog_data.result = WD_CPU_TIMEOUT; return; + } + + scoped_guard(irq) + watchdog_check_skew(wd, 0); + + scoped_guard(raw_spinlock_irq, &watchdog_data.lock) { + watchdog_data.result = wd->result; + memcpy(watchdog_data.cpu_ts, wd->cpu_ts, sizeof(wd->cpu_ts)); + } +} + +static void watchdog_check_cpu_skew(struct clocksource *cs) +{ + unsigned int cpu = watchdog_data.curr_cpu; + + cpu = cpumask_next_wrap(cpu, cpu_online_mask); + watchdog_data.curr_cpu = cpu; - /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_first(cpu_online_mask); + /* Skip the current CPU. Handles num_online_cpus() == 1 as well */ if (cpu == smp_processor_id()) - cpu = cpumask_next(cpu, cpu_online_mask); - if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) return; - cpumask_set_cpu(cpu, &cpus_chosen); - /* Force a sane value for the boot parameter. */ - if (n > nr_cpu_ids) - n = nr_cpu_ids; + /* Don't interfere with the test mechanics */ + if ((cs->flags & CLOCK_SOURCE_WDTEST) && !(cs->flags & CLOCK_SOURCE_WDTEST_PERCPU)) + return; + + __watchdog_check_cpu_skew(cs, cpu); +} + +static bool watchdog_check_freq(struct clocksource *cs, bool reset_pending) +{ + unsigned int ppm_shift = SHIFT_4000PPM; + u64 wd_ts0, wd_ts1, cs_ts; + + watchdog_data.result = WD_SUCCESS; + if (!watchdog) { + watchdog_data.result = WD_FREQ_NO_WATCHDOG; + return false; + } + + if (cs->flags & CLOCK_SOURCE_WDTEST_PERCPU) + return true; /* - * Randomly select the specified number of CPUs. 
If the same - * CPU is selected multiple times, that CPU is checked only once, - * and no replacement CPU is selected. This gracefully handles - * situations where verify_n_cpus is greater than the number of - * CPUs that are currently online. + * If both the clocksource and the watchdog claim they are + * calibrated use 500ppm limit. Uncalibrated clocksources need a + * larger allowance because the firmware supplied frequencies can be + * way off. */ - for (i = 1; i < n; i++) { - cpu = get_random_u32_below(nr_cpu_ids); - cpu = cpumask_next(cpu - 1, cpu_online_mask); - if (cpu >= nr_cpu_ids) - cpu = cpumask_first(cpu_online_mask); - if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) - cpumask_set_cpu(cpu, &cpus_chosen); + if (watchdog->flags & CLOCK_SOURCE_CALIBRATED && cs->flags & CLOCK_SOURCE_CALIBRATED) + ppm_shift = SHIFT_500PPM; + + for (int retries = 0; retries < WATCHDOG_FREQ_RETRIES; retries++) { + s64 wd_last, cs_last, wd_seq, wd_delta, cs_delta, max_delta; + + scoped_guard(irq) { + wd_ts0 = watchdog->read(watchdog); + cs_ts = cs->read(cs); + wd_ts1 = watchdog->read(watchdog); + } + + wd_last = cs->wd_last; + cs_last = cs->cs_last; + + /* Validate the watchdog readout window */ + wd_seq = cycles_to_nsec_safe(watchdog, wd_ts0, wd_ts1); + if (wd_seq > WATCHDOG_READOUT_MAX_NS) { + /* Store for printout in case all retries fail */ + watchdog_data.wd_seq = wd_seq; + continue; + } + + /* Store for subsequent processing */ + cs->wd_last = wd_ts0; + cs->cs_last = cs_ts; + + /* First round or reset pending? */ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || reset_pending) + goto reset; + + /* Calculate the nanosecond deltas from the last invocation */ + wd_delta = cycles_to_nsec_safe(watchdog, wd_last, wd_ts0); + cs_delta = cycles_to_nsec_safe(cs, cs_last, cs_ts); + + watchdog_data.wd_delta = wd_delta; + watchdog_data.cs_delta = cs_delta; + + /* + * Ensure that the deltas are within the readout limits of + * the clocksource and the watchdog. 
Long delays can cause + * clocksources to overflow. + */ + max_delta = max(wd_delta, cs_delta); + if (max_delta > cs->max_idle_ns || max_delta > watchdog->max_idle_ns) + goto reset; + + /* + * Calculate and validate the skew against the allowed PPM + * value of the maximum delta plus the watchdog readout + * time. + */ + if (abs(wd_delta - cs_delta) < (max_delta >> ppm_shift) + wd_seq) + return true; + + watchdog_data.result = WD_FREQ_SKEWED; + return false; } - /* Don't verify ourselves. */ - cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); + watchdog_data.result = WD_FREQ_TIMEOUT; + return false; + +reset: + cs->flags |= CLOCK_SOURCE_WATCHDOG; + watchdog_data.result = WD_FREQ_RESET; + return false; } -static void clocksource_verify_one_cpu(void *csin) +/* Synchronization for sched clock */ +static void clocksource_tick_stable(struct clocksource *cs) { - struct clocksource *cs = (struct clocksource *)csin; - - csnow_mid = cs->read(cs); + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); } -void clocksource_verify_percpu(struct clocksource *cs) +/* Conditionally enable high resolution mode */ +static void clocksource_enable_highres(struct clocksource *cs) { - int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX; - u64 csnow_begin, csnow_end; - int cpu, testcpu; - s64 delta; + if ((cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) || + !(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) || + !watchdog || !(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) + return; + + /* Mark it valid for high-res. */ + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - if (verify_n_cpus == 0) + /* + * Can't schedule work before finished_booting is + * true. clocksource_done_booting will take care of it. 
+ */ + if (!finished_booting) return; - cpumask_clear(&cpus_ahead); - cpumask_clear(&cpus_behind); - cpus_read_lock(); - migrate_disable(); - clocksource_verify_choose_cpus(); - if (cpumask_empty(&cpus_chosen)) { - migrate_enable(); - cpus_read_unlock(); - pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); + + if (cs->flags & CLOCK_SOURCE_WDTEST) return; + + /* + * If this is not the current clocksource let the watchdog thread + * reselect it. Due to the change to high res this clocksource + * might be preferred now. If it is the current clocksource let the + * tick code know about that change. + */ + if (cs != curr_clocksource) { + cs->flags |= CLOCK_SOURCE_RESELECT; + schedule_work(&watchdog_work); + } else { + tick_clock_notify(); } - testcpu = smp_processor_id(); - pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", - cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); - preempt_disable(); - for_each_cpu(cpu, &cpus_chosen) { - if (cpu == testcpu) - continue; - csnow_begin = cs->read(cs); - smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1); - csnow_end = cs->read(cs); - delta = (s64)((csnow_mid - csnow_begin) & cs->mask); - if (delta < 0) - cpumask_set_cpu(cpu, &cpus_behind); - delta = (csnow_end - csnow_mid) & cs->mask; - if (delta < 0) - cpumask_set_cpu(cpu, &cpus_ahead); - cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end); - if (cs_nsec > cs_nsec_max) - cs_nsec_max = cs_nsec; - if (cs_nsec < cs_nsec_min) - cs_nsec_min = cs_nsec; +} + +static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 2); + +static void watchdog_print_freq_timeout(struct clocksource *cs) +{ + if (!__ratelimit(&ratelimit_state)) + return; + pr_info("Watchdog %s read timed out. 
Readout sequence took: %lluns\n", + watchdog->name, watchdog_data.wd_seq); +} + +static void watchdog_print_freq_skew(struct clocksource *cs) +{ + pr_warn("Marking clocksource %s unstable due to frequency skew\n", cs->name); + pr_warn("Watchdog %20s interval: %16lluns\n", watchdog->name, watchdog_data.wd_delta); + pr_warn("Clocksource %20s interval: %16lluns\n", cs->name, watchdog_data.cs_delta); +} + +static void watchdog_handle_remote_timeout(struct clocksource *cs) +{ + pr_info_once("Watchdog remote CPU %u read timed out\n", watchdog_data.curr_cpu); +} + +static void watchdog_print_remote_skew(struct clocksource *cs) +{ + pr_warn("Marking clocksource %s unstable due to inter CPU skew\n", cs->name); + if (watchdog_data.cpu_ts[0] < watchdog_data.cpu_ts[1]) { + pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", smp_processor_id(), + watchdog_data.cpu_ts[0], watchdog_data.curr_cpu, watchdog_data.cpu_ts[1]); + } else { + pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", watchdog_data.curr_cpu, + watchdog_data.cpu_ts[1], smp_processor_id(), watchdog_data.cpu_ts[0]); } - preempt_enable(); - migrate_enable(); - cpus_read_unlock(); - if (!cpumask_empty(&cpus_ahead)) - pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", - cpumask_pr_args(&cpus_ahead), testcpu, cs->name); - if (!cpumask_empty(&cpus_behind)) - pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", - cpumask_pr_args(&cpus_behind), testcpu, cs->name); - if (!cpumask_empty(&cpus_ahead) || !cpumask_empty(&cpus_behind)) - pr_warn(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", - testcpu, cs_nsec_min, cs_nsec_max, cs->name); -} -EXPORT_SYMBOL_GPL(clocksource_verify_percpu); +} -static inline void clocksource_reset_watchdog(void) +static void watchdog_check_result(struct clocksource *cs) { - struct clocksource *cs; + switch (watchdog_data.result) { + case WD_SUCCESS: + clocksource_tick_stable(cs); + clocksource_enable_highres(cs); + return; - list_for_each_entry(cs, &watchdog_list, 
wd_list) + case WD_FREQ_TIMEOUT: + watchdog_print_freq_timeout(cs); + /* Try again later and invalidate the reference timestamps. */ cs->flags &= ~CLOCK_SOURCE_WATCHDOG; -} + return; + case WD_FREQ_NO_WATCHDOG: + case WD_FREQ_RESET: + /* + * Nothing to do when the reference timestamps were reset + * or no watchdog clocksource registered. + */ + return; + + case WD_FREQ_SKEWED: + watchdog_print_freq_skew(cs); + break; + + case WD_CPU_TIMEOUT: + /* Remote check timed out. Try again next cycle. */ + watchdog_handle_remote_timeout(cs); + return; + + case WD_CPU_SKEWED: + watchdog_print_remote_skew(cs); + break; + } + __clocksource_unstable(cs); +} static void clocksource_watchdog(struct timer_list *unused) { - int64_t wd_nsec, cs_nsec, interval; - u64 csnow, wdnow, cslast, wdlast; - int next_cpu, reset_pending; struct clocksource *cs; - enum wd_read_status read_ret; - unsigned long extra_wait = 0; - u32 md; + bool reset_pending; - spin_lock(&watchdog_lock); + guard(spinlock)(&watchdog_lock); if (!watchdog_running) - goto out; + return; reset_pending = atomic_read(&watchdog_reset_pending); list_for_each_entry(cs, &watchdog_list, wd_list) { - /* Clocksource already marked unstable? */ if (cs->flags & CLOCK_SOURCE_UNSTABLE) { if (finished_booting) @@ -452,174 +659,42 @@ static void clocksource_watchdog(struct timer_list *unused) continue; } - read_ret = cs_watchdog_read(cs, &csnow, &wdnow); - - if (read_ret == WD_READ_UNSTABLE) { - /* Clock readout unreliable, so give it up. */ - __clocksource_unstable(cs); - continue; - } - - /* - * When WD_READ_SKIP is returned, it means the system is likely - * under very heavy load, where the latency of reading - * watchdog/clocksource is very big, and affect the accuracy of - * watchdog check. So give system some space and suspend the - * watchdog check for 5 minutes. - */ - if (read_ret == WD_READ_SKIP) { - /* - * As the watchdog timer will be suspended, and - * cs->last could keep unchanged for 5 minutes, reset - * the counters. 
- */ - clocksource_reset_watchdog(); - extra_wait = HZ * 300; - break; - } - - /* Clocksource initialized ? */ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || - atomic_read(&watchdog_reset_pending)) { - cs->flags |= CLOCK_SOURCE_WATCHDOG; - cs->wd_last = wdnow; - cs->cs_last = csnow; - continue; + /* Compare against watchdog clocksource if available */ + if (watchdog_check_freq(cs, reset_pending)) { + /* Check for inter CPU skew */ + watchdog_check_cpu_skew(cs); } - wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow); - cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow); - wdlast = cs->wd_last; /* save these in case we print them */ - cslast = cs->cs_last; - cs->cs_last = csnow; - cs->wd_last = wdnow; - - if (atomic_read(&watchdog_reset_pending)) - continue; - - /* - * The processing of timer softirqs can get delayed (usually - * on account of ksoftirqd not getting to run in a timely - * manner), which causes the watchdog interval to stretch. - * Skew detection may fail for longer watchdog intervals - * on account of fixed margins being used. - * Some clocksources, e.g. acpi_pm, cannot tolerate - * watchdog intervals longer than a few seconds. - */ - interval = max(cs_nsec, wd_nsec); - if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) { - if (system_state > SYSTEM_SCHEDULING && - interval > 2 * watchdog_max_interval) { - watchdog_max_interval = interval; - pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n", - cs_nsec, wd_nsec); - } - watchdog_timer.expires = jiffies; - continue; - } - - /* Check the deviation from the watchdog clocksource. 
*/ - md = cs->uncertainty_margin + watchdog->uncertainty_margin; - if (abs(cs_nsec - wd_nsec) > md) { - s64 cs_wd_msec; - s64 wd_msec; - u32 wd_rem; - - pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", - smp_processor_id(), cs->name); - pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n", - watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask); - pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n", - cs->name, cs_nsec, csnow, cslast, cs->mask); - cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem); - wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem); - pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n", - cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec); - if (curr_clocksource == cs) - pr_warn(" '%s' is current clocksource.\n", cs->name); - else if (curr_clocksource) - pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name); - else - pr_warn(" No current clocksource.\n"); - __clocksource_unstable(cs); - continue; - } - - if (cs == curr_clocksource && cs->tick_stable) - cs->tick_stable(cs); - - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && - (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { - /* Mark it valid for high-res. */ - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - - /* - * clocksource_done_booting() will sort it if - * finished_booting is not set yet. - */ - if (!finished_booting) - continue; - - /* - * If this is not the current clocksource let - * the watchdog thread reselect it. Due to the - * change to high res this clocksource might - * be preferred now. If it is the current - * clocksource let the tick code know about - * that change. 
- */ - if (cs != curr_clocksource) { - cs->flags |= CLOCK_SOURCE_RESELECT; - schedule_work(&watchdog_work); - } else { - tick_clock_notify(); - } - } + watchdog_check_result(cs); } - /* - * We only clear the watchdog_reset_pending, when we did a - * full cycle through all clocksources. - */ + /* Clear after the full clocksource walk */ if (reset_pending) atomic_dec(&watchdog_reset_pending); - /* - * Cycle through CPUs to check if the CPUs stay synchronized - * to each other. - */ - next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); - if (next_cpu >= nr_cpu_ids) - next_cpu = cpumask_first(cpu_online_mask); - - /* - * Arm timer if not already pending: could race with concurrent - * pair clocksource_stop_watchdog() clocksource_start_watchdog(). - */ + /* Could have been rearmed by a stop/start cycle */ if (!timer_pending(&watchdog_timer)) { - watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait; - add_timer_on(&watchdog_timer, next_cpu); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_local(&watchdog_timer); } -out: - spin_unlock(&watchdog_lock); } static inline void clocksource_start_watchdog(void) { - if (watchdog_running || !watchdog || list_empty(&watchdog_list)) + if (watchdog_running || list_empty(&watchdog_list)) return; - timer_setup(&watchdog_timer, clocksource_watchdog, 0); + timer_setup(&watchdog_timer, clocksource_watchdog, TIMER_PINNED); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); + + add_timer_on(&watchdog_timer, get_boot_cpu_id()); watchdog_running = 1; } static inline void clocksource_stop_watchdog(void) { - if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) + if (!watchdog_running || !list_empty(&watchdog_list)) return; - del_timer(&watchdog_timer); + timer_delete(&watchdog_timer); watchdog_running = 0; } @@ -659,6 +734,13 @@ static void clocksource_select_watchdog(bool fallback) if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) continue; 
+ /* + * If it's not continuous, don't put the fox in charge of + * the henhouse. + */ + if (!(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)) + continue; + /* Skip current if we were requested for a fallback. */ if (fallback && cs == old_wd) continue; @@ -698,12 +780,6 @@ static int __clocksource_watchdog_kthread(void) unsigned long flags; int select = 0; - /* Do any required per-CPU skew verification. */ - if (curr_clocksource && - curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE && - curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU) - clocksource_verify_percpu(curr_clocksource); - spin_lock_irqsave(&watchdog_lock, flags); list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { @@ -1024,6 +1100,8 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) continue; if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) continue; + if (cs->flags & CLOCK_SOURCE_WDTEST) + continue; return cs; } return NULL; @@ -1048,6 +1126,8 @@ static void __clocksource_select(bool skipcur) continue; if (strcmp(cs->name, override_name) != 0) continue; + if (cs->flags & CLOCK_SOURCE_WDTEST) + continue; /* * Check to make sure we don't switch to a non-highres * capable clocksource if the tick code is in oneshot @@ -1177,31 +1257,10 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, NSEC_PER_SEC / scale, sec * scale); - } - /* - * If the uncertainty margin is not specified, calculate it. If - * both scale and freq are non-zero, calculate the clock period, but - * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default. - * However, if either of scale or freq is zero, be very conservative - * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value - * for the uncertainty margin. 
Allow stupidly small uncertainty - * margins to be specified by the caller for testing purposes, - * but warn to discourage production use of this capability. - * - * Bottom line: The sum of the uncertainty margins of the - * watchdog clocksource and the clocksource under test will be at - * least 500ppm by default. For more information, please see the - * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above. - */ - if (scale && freq && !cs->uncertainty_margin) { - cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq); - if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW) - cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW; - } else if (!cs->uncertainty_margin) { - cs->uncertainty_margin = WATCHDOG_THRESHOLD; + /* Update cs::freq_khz */ + cs->freq_khz = div_u64((u64)freq * scale, 1000); } - WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW); /* * Ensure clocksources that have large 'mult' values don't overflow @@ -1249,6 +1308,10 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX)) cs->id = CSID_GENERIC; + + if (WARN_ON_ONCE(!freq && cs->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT)) + cs->flags &= ~CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT; + if (cs->vdso_clock_mode < 0 || cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { pr_warn("clocksource %s registered with invalid VDSO mode %d. 
Disabling VDSO support.\n", @@ -1510,7 +1573,7 @@ static int __init boot_override_clocksource(char* str) { mutex_lock(&clocksource_mutex); if (str) - strscpy(override_name, str, sizeof(override_name)); + strscpy(override_name, str); mutex_unlock(&clocksource_mutex); return 1; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index deb1aa32814e..5bd6efe598f0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * @@ -50,6 +50,36 @@ #include "tick-internal.h" /* + * Constants to set the queued state of the timer (INACTIVE, ENQUEUED) + * + * The callback state is kept separate in the CPU base because having it in + * the timer would require touching the timer after the callback, which + * makes it impossible to free the timer from the callback function. + * + * Therefore we track the callback state in: + * + * timer->base->cpu_base->running == timer + * + * On SMP it is possible to have a "callback function running and enqueued" + * status. It happens for example when a posix timer expired and the callback + * queued a signal. Between dropping the lock which protects the posix timer + * and reacquiring the base lock of the hrtimer, another CPU can deliver the + * signal and rearm the timer. + * + * All state transitions are protected by cpu_base->lock. + */ +#define HRTIMER_STATE_INACTIVE false +#define HRTIMER_STATE_ENQUEUED true + +/* + * The resolution of the clocks. The resolution value is returned in + * the clock_getres() system call to give application programmers an + * idea of the (in)accuracy of timers. Timer values are rounded up to + * this resolution value. 
+ */ +#define HIGH_RES_NSEC 1 + +/* * Masks for selecting the soft and hard context timers from * cpu_base->active */ @@ -59,6 +89,7 @@ #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) static void retrigger_next_event(void *arg); +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id); /* * The timer bases: @@ -68,65 +99,26 @@ static void retrigger_next_event(void *arg); * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ + +#define BASE_INIT(idx, cid) \ + [idx] = { .index = idx, .clockid = cid } + DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), - .clock_base = - { - { - .index = HRTIMER_BASE_MONOTONIC, - .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, - }, - { - .index = HRTIMER_BASE_REALTIME, - .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, - }, - { - .index = HRTIMER_BASE_BOOTTIME, - .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, - }, - { - .index = HRTIMER_BASE_TAI, - .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, - }, - { - .index = HRTIMER_BASE_MONOTONIC_SOFT, - .clockid = CLOCK_MONOTONIC, - .get_time = &ktime_get, - }, - { - .index = HRTIMER_BASE_REALTIME_SOFT, - .clockid = CLOCK_REALTIME, - .get_time = &ktime_get_real, - }, - { - .index = HRTIMER_BASE_BOOTTIME_SOFT, - .clockid = CLOCK_BOOTTIME, - .get_time = &ktime_get_boottime, - }, - { - .index = HRTIMER_BASE_TAI_SOFT, - .clockid = CLOCK_TAI, - .get_time = &ktime_get_clocktai, - }, + .clock_base = { + BASE_INIT(HRTIMER_BASE_MONOTONIC, CLOCK_MONOTONIC), + BASE_INIT(HRTIMER_BASE_REALTIME, CLOCK_REALTIME), + BASE_INIT(HRTIMER_BASE_BOOTTIME, CLOCK_BOOTTIME), + BASE_INIT(HRTIMER_BASE_TAI, CLOCK_TAI), + BASE_INIT(HRTIMER_BASE_MONOTONIC_SOFT, CLOCK_MONOTONIC), + BASE_INIT(HRTIMER_BASE_REALTIME_SOFT, CLOCK_REALTIME), + BASE_INIT(HRTIMER_BASE_BOOTTIME_SOFT, CLOCK_BOOTTIME), + BASE_INIT(HRTIMER_BASE_TAI_SOFT, 
CLOCK_TAI), }, .csd = CSD_INIT(retrigger_next_event, NULL) }; -static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { - /* Make sure we catch unsupported clockids */ - [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, - - [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, - [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, - [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, - [CLOCK_TAI] = HRTIMER_BASE_TAI, -}; - static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) { if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) @@ -135,23 +127,43 @@ static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) return likely(base->online); } +#ifdef CONFIG_HIGH_RES_TIMERS +DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key); + +static void hrtimer_hres_workfn(struct work_struct *work) +{ + static_branch_enable(&hrtimer_highres_enabled_key); +} + +static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn); + +static inline void hrtimer_schedule_hres_work(void) +{ + if (!hrtimer_highres_enabled()) + schedule_work(&hrtimer_hres_work); +} +#else +static inline void hrtimer_schedule_hres_work(void) { } +#endif + /* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ #ifdef CONFIG_SMP - /* * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() * such that hrtimer_callback_running() can unconditionally dereference * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .clock_base = { { - .cpu_base = &migration_cpu_base, - .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, - &migration_cpu_base.lock), - }, }, + .clock_base = { + [0] = { + .cpu_base = &migration_cpu_base, + .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, + &migration_cpu_base.lock), + }, + }, }; #define migration_base migration_cpu_base.clock_base[0] @@ -168,15 +180,13 @@ static struct hrtimer_cpu_base migration_cpu_base = { * possible to set timer->base = &migration_base and drop the lock: the timer * remains 
locked. */ -static -struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, - unsigned long *flags) +static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) __acquires(&timer->base->lock) { - struct hrtimer_clock_base *base; - for (;;) { - base = READ_ONCE(timer->base); + struct hrtimer_clock_base *base = READ_ONCE(timer->base); + if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) @@ -218,7 +228,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_ /* * The offline local CPU can't be the default target if the * next remote target event is after this timer. Keep the - * elected new base. An IPI will we issued to reprogram + * elected new base. An IPI will be issued to reprogram * it as a last resort. */ if (!hrtimer_base_is_online(this_cpu_base)) @@ -229,7 +239,7 @@ static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_ return expires >= new_base->cpu_base->expires_next; } -static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, bool pinned) { if (!hrtimer_base_is_online(base)) { int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); @@ -257,8 +267,7 @@ static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base * * the timer callback is currently running. */ static inline struct hrtimer_clock_base * -switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, - int pinned) +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, bool pinned) { struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; @@ -271,13 +280,12 @@ again: if (base != new_base) { /* - * We are trying to move timer to new_base. 
- * However we can't change timer's base while it is running, - * so we keep it on the same CPU. No hassle vs. reprogramming - * the event source in the high resolution case. The softirq - * code will take care of this when the timer function has - * completed. There is no conflict as we hold the lock until - * the timer is enqueued. + * We are trying to move timer to new_base. However we can't + * change timer's base while it is running, so we keep it on + * the same CPU. No hassle vs. reprogramming the event source + * in the high resolution case. The remote CPU will take care + * of this when the timer function has completed. There is no + * conflict as we hold the lock until the timer is enqueued. */ if (unlikely(hrtimer_callback_running(timer))) return base; @@ -287,8 +295,7 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, - this_cpu_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -307,14 +314,13 @@ again: #else /* CONFIG_SMP */ -static inline struct hrtimer_clock_base * -lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +static inline struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) __acquires(&timer->base->cpu_base->lock) { struct hrtimer_clock_base *base = timer->base; raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); - return base; } @@ -349,7 +355,7 @@ s64 __ktime_divns(const ktime_t kt, s64 div) return dclc < 0 ? 
-tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); -#endif /* BITS_PER_LONG >= 64 */ +#endif /* BITS_PER_LONG < 64 */ /* * Add two ktime values and do a safety check for overflow: @@ -376,7 +382,7 @@ static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { - return ((struct hrtimer *) addr)->function; + return ACCESS_PRIVATE((struct hrtimer *)addr, function); } /* @@ -431,12 +437,37 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) } } +/* Stub timer callback for improperly used timers. */ +static enum hrtimer_restart stub_timer(struct hrtimer *unused) +{ + WARN_ON_ONCE(1); + return HRTIMER_NORESTART; +} + +/* + * hrtimer_fixup_assert_init is called when: + * - an untracked/uninit-ed object is found + */ +static bool hrtimer_fixup_assert_init(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_NOTAVAILABLE: + hrtimer_setup(timer, stub_timer, CLOCK_MONOTONIC, 0); + return true; + default: + return false; + } +} + static const struct debug_obj_descr hrtimer_debug_descr = { - .name = "hrtimer", - .debug_hint = hrtimer_debug_hint, - .fixup_init = hrtimer_fixup_init, - .fixup_activate = hrtimer_fixup_activate, - .fixup_free = hrtimer_fixup_free, + .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, + .fixup_init = hrtimer_fixup_init, + .fixup_activate = hrtimer_fixup_activate, + .fixup_free = hrtimer_fixup_free, + .fixup_assert_init = hrtimer_fixup_assert_init, }; static inline void debug_hrtimer_init(struct hrtimer *timer) @@ -449,8 +480,7 @@ static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) debug_object_init_on_stack(timer, &hrtimer_debug_descr); } -static inline void debug_hrtimer_activate(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } @@ -460,6 +490,11 @@ static inline 
void debug_hrtimer_deactivate(struct hrtimer *timer) debug_object_deactivate(timer, &hrtimer_debug_descr); } +static inline void debug_hrtimer_assert_init(struct hrtimer *timer) +{ + debug_object_assert_init(timer, &hrtimer_debug_descr); +} + void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); @@ -470,100 +505,98 @@ EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); static inline void debug_hrtimer_init(struct hrtimer *timer) { } static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { } -static inline void debug_hrtimer_activate(struct hrtimer *timer, - enum hrtimer_mode mode) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } +static inline void debug_hrtimer_assert_init(struct hrtimer *timer) { } #endif -static inline void -debug_init(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); } -static inline void debug_init_on_stack(struct hrtimer *timer, clockid_t clockid, - enum hrtimer_mode mode) +static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) { debug_hrtimer_init_on_stack(timer); - trace_hrtimer_init(timer, clockid, mode); + trace_hrtimer_setup(timer, clockid, mode); } -static inline void debug_activate(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode, bool was_armed) { debug_hrtimer_activate(timer, mode); - trace_hrtimer_start(timer, mode); + trace_hrtimer_start(timer, mode, was_armed); } -static inline void debug_deactivate(struct hrtimer *timer) -{ - debug_hrtimer_deactivate(timer); - trace_hrtimer_cancel(timer); -} 
+#define for_each_active_base(base, cpu_base, active) \ + for (unsigned int idx = ffs(active); idx--; idx = ffs((active))) \ + for (bool done = false; !done; active &= ~(1U << idx)) \ + for (base = &cpu_base->clock_base[idx]; !done; done = true) + +#define hrtimer_from_timerqueue_node(_n) container_of_const(_n, struct hrtimer, node) -static struct hrtimer_clock_base * -__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) +#if defined(CONFIG_NO_HZ_COMMON) +/* + * Same as hrtimer_bases_next_event() below, but skips the excluded timer and + * does not update cpu_base->next_timer/expires. + */ +static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_base, + const struct hrtimer *exclude, + unsigned int active, ktime_t expires_next) { - unsigned int idx; + struct hrtimer_clock_base *base; + ktime_t expires; - if (!*active) - return NULL; + lockdep_assert_held(&cpu_base->lock); - idx = __ffs(*active); - *active &= ~(1U << idx); + for_each_active_base(base, cpu_base, active) { + expires = ktime_sub(base->expires_next, base->offset); + if (expires >= expires_next) + continue; - return &cpu_base->clock_base[idx]; + /* + * If the excluded timer is the first on this base evaluate the + * next timer. 
+ */ + struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active); + + if (unlikely(&exclude->node == node)) { + node = timerqueue_linked_next(node); + if (!node) + continue; + expires = ktime_sub(node->expires, base->offset); + if (expires >= expires_next) + continue; + } + expires_next = expires; + } + /* If base->offset changed, the result might be negative */ + return max(expires_next, 0); } +#endif -#define for_each_active_base(base, cpu_base, active) \ - while ((base = __next_base((cpu_base), &(active)))) +static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base) +{ + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); -static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, - const struct hrtimer *exclude, - unsigned int active, - ktime_t expires_next) + return hrtimer_from_timerqueue_node(next); +} + +/* Find the base with the earliest expiry */ +static void hrtimer_bases_first(struct hrtimer_cpu_base *cpu_base,unsigned int active, + ktime_t *expires_next, struct hrtimer **next_timer) { struct hrtimer_clock_base *base; ktime_t expires; for_each_active_base(base, cpu_base, active) { - struct timerqueue_node *next; - struct hrtimer *timer; - - next = timerqueue_getnext(&base->active); - timer = container_of(next, struct hrtimer, node); - if (timer == exclude) { - /* Get to the next timer in the queue. */ - next = timerqueue_iterate_next(next); - if (!next) - continue; - - timer = container_of(next, struct hrtimer, node); - } - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (expires < expires_next) { - expires_next = expires; - - /* Skip cpu_base update if a timer is being excluded. 
*/ - if (exclude) - continue; - - if (timer->is_soft) - cpu_base->softirq_next_timer = timer; - else - cpu_base->next_timer = timer; + expires = ktime_sub(base->expires_next, base->offset); + if (expires < *expires_next) { + *expires_next = expires; + *next_timer = clock_base_next_timer(base); } } - /* - * clock_was_set() might have changed base->offset of any of - * the clock bases so the result might be negative. Fix it up - * to prevent a false positive in clockevents_program_event(). - */ - if (expires_next < 0) - expires_next = 0; - return expires_next; } /* @@ -586,30 +619,28 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. */ -static ktime_t -__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) +static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { - unsigned int active; struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; + unsigned int active; + + lockdep_assert_held(&cpu_base->lock); if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; - cpu_base->softirq_next_timer = NULL; - expires_next = __hrtimer_next_event_base(cpu_base, NULL, - active, KTIME_MAX); - - next_timer = cpu_base->softirq_next_timer; + if (active) + hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer); + cpu_base->softirq_next_timer = next_timer; } if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + if (active) + hrtimer_bases_first(cpu_base, active, &expires_next, &next_timer); cpu_base->next_timer = next_timer; - expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, - expires_next); } - - return expires_next; + return max(expires_next, 0); } static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) @@ -649,8 +680,8 @@ static inline ktime_t 
hrtimer_update_base(struct hrtimer_cpu_base *base) ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, - offs_real, offs_boot, offs_tai); + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, + offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; @@ -660,7 +691,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) } /* - * Is the high resolution mode active ? + * Is the high resolution mode active in the CPU base. This cannot use the + * static key as the CPUs are switched to high resolution mode + * asynchronously. */ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { @@ -668,8 +701,13 @@ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) cpu_base->hres_active : 0; } -static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, - struct hrtimer *next_timer, +static inline void hrtimer_rearm_event(ktime_t expires_next, bool deferred) +{ + trace_hrtimer_rearm(expires_next, deferred); + tick_program_event(expires_next, 1); +} + +static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) { cpu_base->expires_next = expires_next; @@ -694,20 +732,13 @@ static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, if (!hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; - tick_program_event(expires_next, 1); + hrtimer_rearm_event(expires_next, false); } -/* - * Reprogram the event source with checking both queues for the - * next event - * Called with interrupts disabled and base->lock held - */ -static void -hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +/* Reprogram the event source with a evaluation of all clock 
bases */ +static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, bool skip_equal) { - ktime_t expires_next; - - expires_next = hrtimer_update_next_event(cpu_base); + ktime_t expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; @@ -718,57 +749,49 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS -/* - * High resolution timer enabled ? - */ +/* High resolution timer enabled ? */ static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); -/* - * Enable / Disable high resolution mode - */ +/* Enable / Disable high resolution mode */ static int __init setup_hrtimer_hres(char *str) { return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } - __setup("highres=", setup_hrtimer_hres); -/* - * hrtimer_high_res_enabled - query, if the highres mode is enabled - */ -static inline int hrtimer_is_hres_enabled(void) +/* hrtimer_high_res_enabled - query, if the highres mode is enabled */ +static inline bool hrtimer_is_hres_enabled(void) { return hrtimer_hres_enabled; } -/* - * Switch to high resolution mode - */ +/* Switch to high resolution mode */ static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { - pr_warn("Could not switch to high resolution mode on CPU %u\n", - base->cpu); + pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } - base->hres_active = 1; + base->hres_active = true; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); + hrtimer_schedule_hres_work(); } #else -static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline bool hrtimer_is_hres_enabled(void) { return 0; } 
static inline void hrtimer_switch_to_hres(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ + /* * Retrigger next event is called after clock was set with interrupts * disabled through an SMP function call or directly from low level @@ -799,17 +822,16 @@ static void retrigger_next_event(void *arg) * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. + * + * In periodic low resolution mode, the next softirq expiration + * must also be updated. */ - if (!hrtimer_hres_active(base) && !tick_nohz_active) - return; - - raw_spin_lock(&base->lock); + guard(raw_spinlock)(&base->lock); hrtimer_update_base(base); if (hrtimer_hres_active(base)) - hrtimer_force_reprogram(base, 0); + hrtimer_force_reprogram(base, /* skip_equal */ false); else hrtimer_update_next_event(base); - raw_spin_unlock(&base->lock); } /* @@ -823,10 +845,11 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + ktime_t expires = hrtimer_get_expires(timer); - WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); + WARN_ON_ONCE(expires < 0); + expires = ktime_sub(expires, base->offset); /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. 
@@ -853,8 +876,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) timer_cpu_base->softirq_next_timer = timer; timer_cpu_base->softirq_expires_next = expires; - if (!ktime_before(expires, timer_cpu_base->expires_next) || - !reprogram) + if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) return; } @@ -868,11 +890,8 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (expires >= cpu_base->expires_next) return; - /* - * If the hrtimer interrupt is running, then it will reevaluate the - * clock bases and reprogram the clock event device. - */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending skip reprogramming the device */ + if (cpu_base->deferred_rearm) return; cpu_base->next_timer = timer; @@ -880,8 +899,7 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) __hrtimer_reprogram(cpu_base, timer, expires); } -static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, - unsigned int active) +static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) { struct hrtimer_clock_base *base; unsigned int seq; @@ -907,13 +925,11 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, if (seq == cpu_base->clock_was_set_seq) return false; - /* - * If the remote CPU is currently handling an hrtimer interrupt, it - * will reevaluate the first expiring timer of all clock bases - * before reprogramming. Nothing to do here. 
- */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending the remote CPU will take care of it */ + if (cpu_base->deferred_rearm) { + cpu_base->deferred_needs_update = true; return false; + } /* * Walk the affected clock bases and check whether the first expiring @@ -924,15 +940,15 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { - struct timerqueue_node *next; + struct timerqueue_linked_node *next; - next = timerqueue_getnext(&base->active); + next = timerqueue_linked_first(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; /* Extra check for softirq clock bases */ - if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) + if (base->index < HRTIMER_BASE_MONOTONIC_SOFT) continue; if (cpu_base->softirq_activated) continue; @@ -958,11 +974,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, */ void clock_was_set(unsigned int bases) { - struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; - int cpu; - if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active) + if (!hrtimer_highres_enabled() && !tick_nohz_is_active()) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -971,23 +985,19 @@ void clock_was_set(unsigned int bases) } /* Avoid interrupting CPUs if possible */ - cpus_read_lock(); - for_each_online_cpu(cpu) { - unsigned long flags; - - cpu_base = &per_cpu(hrtimer_bases, cpu); - raw_spin_lock_irqsave(&cpu_base->lock, flags); + scoped_guard(cpus_read_lock) { + int cpu; - if (update_needs_ipi(cpu_base, bases)) - cpumask_set_cpu(cpu, mask); + for_each_online_cpu(cpu) { + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + guard(raw_spinlock_irqsave)(&cpu_base->lock); + if (update_needs_ipi(cpu_base, bases)) + cpumask_set_cpu(cpu, mask); + } + scoped_guard(preempt) + 
smp_call_function_many(mask, retrigger_next_event, NULL, 1); } - - preempt_disable(); - smp_call_function_many(mask, retrigger_next_event, NULL, 1); - preempt_enable(); - cpus_read_unlock(); free_cpumask_var(mask); out_timerfd: @@ -1022,11 +1032,8 @@ void hrtimers_resume_local(void) retrigger_next_event(NULL); } -/* - * Counterpart to lock_hrtimer_base above: - */ -static inline -void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +/* Counterpart to lock_hrtimer_base above */ +static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __releases(&timer->base->cpu_base->lock) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); @@ -1043,7 +1050,7 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * .. note:: * This only updates the timer expiry value and does not requeue the timer. * - * There is also a variant of the function hrtimer_forward_now(). + * There is also a variant of this function: hrtimer_forward_now(). * * Context: Can be safely called from the callback function of @timer. 
If called * from other contexts @timer must neither be enqueued nor running the @@ -1053,15 +1060,15 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { - u64 orun = 1; ktime_t delta; + u64 orun = 1; delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta < 0) return 0; - if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + if (WARN_ON(timer->is_queued)) return 0; if (interval < hrtimer_resolution) @@ -1072,7 +1079,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) orun = ktime_divns(delta, incr); hrtimer_add_expires_ns(timer, incr * orun); - if (hrtimer_get_expires_tv64(timer) > now) + if (hrtimer_get_expires(timer) > now) return orun; /* * This (and the ktime_add() below) is the @@ -1090,73 +1097,98 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * enqueue_hrtimer - internal function to (re)start a timer * * The timer is inserted in expiry order. Insertion into the - * red black tree is O(log(n)). Must hold the base lock. + * red black tree is O(log(n)). * * Returns true when the new timer is the leftmost timer in the tree. 
*/ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - enum hrtimer_mode mode) + enum hrtimer_mode mode, bool was_armed) { - debug_activate(timer, mode); + lockdep_assert_held(&base->cpu_base->lock); + + debug_activate(timer, mode, was_armed); WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); + WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); + + if (!timerqueue_linked_add(&base->active, &timer->node)) + return false; + + base->expires_next = hrtimer_get_expires(timer); + return true; +} + +static inline void base_update_next_timer(struct hrtimer_clock_base *base) +{ + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); - return timerqueue_add(&base->active, &timer->node); + base->expires_next = next ? next->expires : KTIME_MAX; } /* * __remove_hrtimer - internal function to remove a timer * - * Caller must hold the base lock. - * * High resolution timer mode reprograms the clock event device when the * timer is the one which expires next. The caller can disable this by setting * reprogram to zero. This is useful, when the context does a reprogramming * anyway (e.g. 
timer interrupt) */ -static void __remove_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, - u8 newstate, int reprogram) +static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool newstate, bool reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; + bool was_first; - /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, newstate); - if (!(state & HRTIMER_STATE_ENQUEUED)) + lockdep_assert_held(&cpu_base->lock); + + if (!timer->is_queued) return; - if (!timerqueue_del(&base->active, &timer->node)) + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->is_queued, newstate); + + was_first = !timerqueue_linked_prev(&timer->node); + + if (!timerqueue_linked_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); + /* Nothing to update if this was not the first timer in the base */ + if (!was_first) + return; + + base_update_next_timer(base); + /* - * Note: If reprogram is false we do not update - * cpu_base->next_timer. This happens when we remove the first - * timer on a remote cpu. No harm as we never dereference - * cpu_base->next_timer. So the worst thing what can happen is - * an superfluous call to hrtimer_force_reprogram() on the - * remote cpu later on if the same timer gets enqueued again. + * If reprogram is false don't update cpu_base->next_timer and do not + * touch the clock event device. + * + * This happens when removing the first timer on a remote CPU, which + * will be handled by the remote CPU's interrupt. It also happens when + * a local timer is removed to be immediately restarted. That's handled + * at the call site. 
*/ - if (reprogram && timer == cpu_base->next_timer) - hrtimer_force_reprogram(cpu_base, 1); + if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy) + return; + + if (cpu_base->deferred_rearm) + cpu_base->deferred_needs_update = true; + else + hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); } -/* - * remove hrtimer, called with base lock held - */ -static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - bool restart, bool keep_local) +static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + bool newstate) { - u8 state = timer->state; + lockdep_assert_held(&base->cpu_base->lock); - if (state & HRTIMER_STATE_ENQUEUED) { + if (timer->is_queued) { bool reprogram; + debug_hrtimer_deactivate(timer); + /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current @@ -1165,24 +1197,81 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ - debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - /* - * If the timer is not restarted then reprogramming is - * required if the timer is local. If it is local and about - * to be restarted, avoid programming it twice (on removal - * and a moment later when it's requeued). - */ - if (!restart) - state = HRTIMER_STATE_INACTIVE; - else - reprogram &= !keep_local; + __remove_hrtimer(timer, base, newstate, reprogram); + return true; + } + return false; +} + +/* + * Update in place has to retrieve the expiry times of the neighbour nodes + * if they exist. That is cache line neutral because the dequeue/enqueue + * operation is going to need the same cache lines. But there is a big win + * when the dequeue/enqueue can be avoided because the RB tree does not + * have to be rebalanced twice. 
+ */ +static inline bool +hrtimer_can_update_in_place(struct hrtimer *timer, struct hrtimer_clock_base *base, ktime_t expires) +{ + struct timerqueue_linked_node *next = timerqueue_linked_next(&timer->node); + struct timerqueue_linked_node *prev = timerqueue_linked_prev(&timer->node); - __remove_hrtimer(timer, base, state, reprogram); - return 1; + /* If the new expiry goes behind the next timer, requeue is required */ + if (next && expires > next->expires) + return false; + + /* If this is the first timer, update in place */ + if (!prev) + return true; + + /* Update in place when it does not go ahead of the previous one */ + return expires >= prev->expires; +} + +static inline bool +remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base, + const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns) +{ + bool was_first = false; + + /* Remove it from the timer queue if active */ + if (timer->is_queued) { + was_first = !timerqueue_linked_prev(&timer->node); + + /* Try to update in place to avoid the de/enqueue dance */ + if (hrtimer_can_update_in_place(timer, base, expires)) { + hrtimer_set_expires_range_ns(timer, expires, delta_ns); + trace_hrtimer_start(timer, mode, true); + if (was_first) + base->expires_next = expires; + return was_first; + } + + debug_hrtimer_deactivate(timer); + timerqueue_linked_del(&base->active, &timer->node); } - return 0; + + /* Set the new expiry time */ + hrtimer_set_expires_range_ns(timer, expires, delta_ns); + + debug_activate(timer, mode, timer->is_queued); + base->cpu_base->active_bases |= 1 << base->index; + + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); + + /* If it's the first expiring timer now or again, update base */ + if (timerqueue_linked_add(&base->active, &timer->node)) { + base->expires_next = expires; + return true; + } + + if (was_first) + base_update_next_timer(base); + + return false; } static inline ktime_t 
hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, @@ -1201,55 +1290,93 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, return tim; } -static void -hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) +static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { - ktime_t expires; + ktime_t expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* - * Find the next SOFT expiration. - */ - expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); - - /* - * reprogramming needs to be triggered, even if the next soft - * hrtimer expires at the same time than the next hard + * Reprogramming needs to be triggered, even if the next soft + * hrtimer expires at the same time as the next hard * hrtimer. cpu_base->softirq_expires_next needs to be updated! */ if (expires == KTIME_MAX) return; /* - * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() - * cpu_base->*expires_next is only set by hrtimer_reprogram() + * cpu_base->next_timer is recomputed by __hrtimer_get_next_event() + * cpu_base->expires_next is only set by hrtimer_reprogram() */ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); } -static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - u64 delta_ns, const enum hrtimer_mode mode, - struct hrtimer_clock_base *base) +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned) +{ + if (static_branch_likely(&timers_migration_enabled)) { + /* + * If it is local and the first expiring timer keep it on the local + * CPU to optimize reprogramming of the clockevent device. Also + * avoid switch_hrtimer_base() overhead when local and pinned. 
+ */ + if (!is_local) + return false; + if (is_first || is_pinned) + return true; + + /* Honour the NOHZ full restrictions */ + if (!housekeeping_cpu(smp_processor_id(), HK_TYPE_KERNEL_NOISE)) + return false; + + /* + * If the tick is not stopped or need_resched() is set, then + * there is no point in moving the timer somewhere else. + */ + return !tick_nohz_tick_stopped() || need_resched(); + } + return is_local; +} +#else +static __always_inline bool hrtimer_prefer_local(bool is_local, bool is_first, bool is_pinned) +{ + return is_local; +} +#endif + +static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool is_first, + bool is_pinned) +{ + /* If the timer is running the callback it has to stay on its CPU base. */ + if (unlikely(timer->base->running == timer)) + return true; + + return hrtimer_prefer_local(is_local, is_first, is_pinned); +} + +static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, + const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *new_base; - bool force_local, first; + bool is_pinned, first, was_first, keep_base = false; + struct hrtimer_cpu_base *cpu_base = base->cpu_base; - /* - * If the timer is on the local cpu base and is the first expiring - * timer then this might end up reprogramming the hardware twice - * (on removal and on enqueue). To avoid that by prevent the - * reprogram on removal, keep the timer local to the current CPU - * and enforce reprogramming after it is queued no matter whether - * it is the new first expiring timer again or not. - */ - force_local = base->cpu_base == this_cpu_base; - force_local &= base->cpu_base->next_timer == timer; + was_first = cpu_base->next_timer == timer; + is_pinned = !!(mode & HRTIMER_MODE_PINNED); /* - * Don't force local queuing if this enqueue happens on a unplugged - * CPU after hrtimer_cpu_dying() has been invoked. 
+ * Don't keep it local if this enqueue happens on a unplugged CPU + * after hrtimer_cpu_dying() has been invoked. */ - force_local &= this_cpu_base->online; + if (likely(this_cpu_base->online)) { + bool is_local = cpu_base == this_cpu_base; + + keep_base = hrtimer_keep_base(timer, is_local, was_first, is_pinned); + } + + /* Calculate absolute expiry time for relative timers */ + if (mode & HRTIMER_MODE_REL) + tim = ktime_add_safe(tim, __hrtimer_cb_get_time(base->clockid)); + /* Compensate for low resolution granularity */ + tim = hrtimer_update_lowres(timer, tim, mode); /* * Remove an active timer from the queue. In case it is not queued @@ -1261,32 +1388,41 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * reprogramming later if it was the first expiring timer. This * avoids programming the underlying clock event twice (once at * removal and once after enqueue). + * + * @keep_base is also true if the timer callback is running on a + * remote CPU and for local pinned timers. 
*/ - remove_hrtimer(timer, base, true, force_local); + if (likely(keep_base)) { + first = remove_and_enqueue_same_base(timer, base, mode, tim, delta_ns); + } else { + /* Keep the ENQUEUED state in case it is queued */ + bool was_armed = remove_hrtimer(timer, base, HRTIMER_STATE_ENQUEUED); - if (mode & HRTIMER_MODE_REL) - tim = ktime_add_safe(tim, base->get_time()); + hrtimer_set_expires_range_ns(timer, tim, delta_ns); - tim = hrtimer_update_lowres(timer, tim, mode); + /* Switch the timer base, if necessary: */ + base = switch_hrtimer_base(timer, base, is_pinned); + cpu_base = base->cpu_base; - hrtimer_set_expires_range_ns(timer, tim, delta_ns); + first = enqueue_hrtimer(timer, base, mode, was_armed); + } - /* Switch the timer base, if necessary: */ - if (!force_local) { - new_base = switch_hrtimer_base(timer, base, - mode & HRTIMER_MODE_PINNED); - } else { - new_base = base; + /* If a deferred rearm is pending skip reprogramming the device */ + if (cpu_base->deferred_rearm) { + cpu_base->deferred_needs_update = true; + return false; } - first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) { + if (!was_first || cpu_base != this_cpu_base) { /* - * If the current CPU base is online, then the timer is - * never queued on a remote CPU if it would be the first - * expiring timer there. + * If the current CPU base is online, then the timer is never + * queued on a remote CPU if it would be the first expiring + * timer there unless the timer callback is currently executed + * on the remote CPU. In the latter case the remote CPU will + * re-evaluate the first expiring timer after completing the + * callbacks. */ - if (hrtimer_base_is_online(this_cpu_base)) + if (likely(hrtimer_base_is_online(this_cpu_base))) return first; /* @@ -1294,21 +1430,33 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * already offline. If the timer is the first to expire, * kick the remote CPU to reprogram the clock event. 
*/ - if (first) { - struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + if (first) + smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd); + return false; + } - smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); - } - return 0; + /* + * Special case for the HRTICK timer. It is frequently rearmed and most + * of the time moves the expiry into the future. That's expensive in + * virtual machines and it's better to take the pointless already armed + * interrupt than reprogramming the hardware on every context switch. + * + * If the new expiry is before the armed time, then reprogramming is + * required. + */ + if (timer->is_lazy) { + if (cpu_base->expires_next <= hrtimer_get_expires(timer)) + return false; } /* - * Timer was forced to stay on the current CPU to avoid - * reprogramming on removal and enqueue. Force reprogram the - * hardware by evaluating the new first expiring timer. + * Timer was the first expiring timer and forced to stay on the + * current CPU to avoid reprogramming on removal and enqueue. Force + * reprogram the hardware by evaluating the new first expiring + * timer. */ - hrtimer_force_reprogram(new_base->cpu_base, 1); - return 0; + hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); + return false; } /** @@ -1320,14 +1468,14 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ -void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - u64 delta_ns, const enum hrtimer_mode mode) +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, + const enum hrtimer_mode mode) { struct hrtimer_clock_base *base; unsigned long flags; - if (WARN_ON_ONCE(!timer->function)) - return; + debug_hrtimer_assert_init(timer); + /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. 
With PREEMPT_RT check the hard @@ -1375,8 +1523,11 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) base = lock_hrtimer_base(timer, &flags); - if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base, false, false); + if (!hrtimer_callback_running(timer)) { + ret = remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE); + if (ret) + trace_hrtimer_cancel(timer); + } unlock_hrtimer_base(timer, &flags); @@ -1410,8 +1561,7 @@ static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. */ -static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, - unsigned long flags) +static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) { if (atomic_read(&cpu_base->timer_waiters)) { raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1439,7 +1589,7 @@ static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) * running. 
* * This prevents priority inversion: if the soft irq thread is preempted - * in the middle of a timer callback, then calling del_timer_sync() can + * in the middle of a timer callback, then calling hrtimer_cancel() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer @@ -1476,14 +1626,10 @@ void hrtimer_cancel_wait_running(const struct hrtimer *timer) spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); } #else -static inline void -hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } -static inline void -hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } -static inline void -hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } -static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, - unsigned long flags) { } +static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } +static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long fl) { } #endif /** @@ -1539,15 +1685,11 @@ u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; - unsigned long flags; - - raw_spin_lock_irqsave(&cpu_base->lock, flags); + guard(raw_spinlock_irqsave)(&cpu_base->lock); if (!hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - return expires; } @@ -1562,48 +1704,65 @@ u64 hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; - unsigned long flags; - - raw_spin_lock_irqsave(&cpu_base->lock, flags); - - if (hrtimer_hres_active(cpu_base)) { - unsigned int active; + unsigned int active; - if 
(!cpu_base->softirq_activated) { - active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; - expires = __hrtimer_next_event_base(cpu_base, exclude, - active, KTIME_MAX); - } - active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; - expires = __hrtimer_next_event_base(cpu_base, exclude, active, - expires); - } + guard(raw_spinlock_irqsave)(&cpu_base->lock); + if (!hrtimer_hres_active(cpu_base)) + return expires; - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + if (active && !cpu_base->softirq_activated) + expires = hrtimer_bases_next_event_without(cpu_base, exclude, active, KTIME_MAX); - return expires; + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + if (!active) + return expires; + return hrtimer_bases_next_event_without(cpu_base, exclude, active, expires); } #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) { - if (likely(clock_id < MAX_CLOCKS)) { - int base = hrtimer_clock_to_base_table[clock_id]; + switch (clock_id) { + case CLOCK_MONOTONIC: + return HRTIMER_BASE_MONOTONIC; + case CLOCK_REALTIME: + return HRTIMER_BASE_REALTIME; + case CLOCK_BOOTTIME: + return HRTIMER_BASE_BOOTTIME; + case CLOCK_TAI: + return HRTIMER_BASE_TAI; + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return HRTIMER_BASE_MONOTONIC; + } +} - if (likely(base != HRTIMER_MAX_CLOCK_BASES)) - return base; +static ktime_t __hrtimer_cb_get_time(clockid_t clock_id) +{ + switch (clock_id) { + case CLOCK_MONOTONIC: + return ktime_get(); + case CLOCK_REALTIME: + return ktime_get_real(); + case CLOCK_BOOTTIME: + return ktime_get_boottime(); + case CLOCK_TAI: + return ktime_get_clocktai(); + default: + WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); + return ktime_get(); } - WARN(1, "Invalid clockid %d. 
Using MONOTONIC\n", clock_id); - return HRTIMER_BASE_MONOTONIC; } -static enum hrtimer_restart hrtimer_dummy_timeout(struct hrtimer *unused) +ktime_t hrtimer_cb_get_time(const struct hrtimer *timer) { - return HRTIMER_NORESTART; + return __hrtimer_cb_get_time(timer->base->clockid); } +EXPORT_SYMBOL_GPL(hrtimer_cb_get_time); -static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) +static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(struct hrtimer *), + clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; @@ -1634,41 +1793,15 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); + timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM); timer->base = &cpu_base->clock_base[base]; - timerqueue_init(&timer->node); -} - -static void __hrtimer_setup(struct hrtimer *timer, - enum hrtimer_restart (*function)(struct hrtimer *), - clockid_t clock_id, enum hrtimer_mode mode) -{ - __hrtimer_init(timer, clock_id, mode); + timerqueue_linked_init(&timer->node); - if (WARN_ON_ONCE(!function)) - timer->function = hrtimer_dummy_timeout; + if (WARN_ON_ONCE(!fn)) + ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; else - timer->function = function; -} - -/** - * hrtimer_init - initialize a timer to the given clock - * @timer: the timer to be initialized - * @clock_id: the clock to be used - * @mode: The modes which are relevant for initialization: - * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, - * HRTIMER_MODE_REL_SOFT - * - * The PINNED variants of the above can be handed in, - * but the PINNED bit is ignored as pinning happens - * when the hrtimer is started - */ -void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, - enum hrtimer_mode mode) -{ - debug_init(timer, clock_id, mode); - 
__hrtimer_init(timer, clock_id, mode); + ACCESS_PRIVATE(timer, function) = fn; } -EXPORT_SYMBOL_GPL(hrtimer_init); /** * hrtimer_setup - initialize a timer to the given clock @@ -1686,7 +1819,7 @@ EXPORT_SYMBOL_GPL(hrtimer_init); void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { - debug_init(timer, clock_id, mode); + debug_setup(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup); @@ -1705,7 +1838,7 @@ void hrtimer_setup_on_stack(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *), clockid_t clock_id, enum hrtimer_mode mode) { - debug_init_on_stack(timer, clock_id, mode); + debug_setup_on_stack(timer, clock_id, mode); __hrtimer_setup(timer, function, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack); @@ -1726,12 +1859,10 @@ bool hrtimer_active(const struct hrtimer *timer) base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); - if (timer->state != HRTIMER_STATE_INACTIVE || - base->running == timer) + if (timer->is_queued || base->running == timer) return true; - } while (read_seqcount_retry(&base->seq, seq) || - base != READ_ONCE(timer->base)); + } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); return false; } @@ -1745,7 +1876,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * - * On the read side we ensure we observe timer->state and cpu_base->running + * On the read side we ensure we observe timer->is_queued and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. @@ -1754,11 +1885,9 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. 
*/ - -static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, - struct hrtimer_clock_base *base, - struct hrtimer *timer, ktime_t *now, - unsigned long flags) __must_hold(&cpu_base->lock) +static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, + struct hrtimer *timer, ktime_t now, unsigned long flags) + __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); bool expires_in_hardirq; @@ -1766,20 +1895,20 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, lockdep_assert_held(&cpu_base->lock); - debug_deactivate(timer); + debug_hrtimer_deactivate(timer); base->running = timer; /* - * Separate the ->running assignment from the ->state assignment. + * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && - * timer->state == INACTIVE. + * timer->is_queued == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); - __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); - fn = timer->function; + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, false); + fn = ACCESS_PRIVATE(timer, function); /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the @@ -1813,16 +1942,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ - if (restart != HRTIMER_NORESTART && - !(timer->state & HRTIMER_STATE_ENQUEUED)) - enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); + if (restart == HRTIMER_RESTART && !timer->is_queued) + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false); /* - * Separate the ->running assignment from the ->state assignment. + * Separate the ->running assignment from the ->is_queued assignment. 
* * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && - * timer->state == INACTIVE. + * timer->is_queued == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); @@ -1830,23 +1958,24 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, base->running = NULL; } +static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base) +{ + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); + + return next ? hrtimer_from_timerqueue_node(next) : NULL; +} + static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, unsigned long flags, unsigned int active_mask) { - struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases & active_mask; + struct hrtimer_clock_base *base; for_each_active_base(base, cpu_base, active) { - struct timerqueue_node *node; - ktime_t basenow; - - basenow = ktime_add(now, base->offset); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); + ktime_t basenow = ktime_add(now, base->offset); + struct hrtimer *timer; + while ((timer = clock_base_next_timer(base))) { /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the @@ -1859,10 +1988,10 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. 
*/ - if (basenow < hrtimer_get_softexpires_tv64(timer)) + if (basenow < hrtimer_get_softexpires(timer)) break; - __run_hrtimer(cpu_base, base, timer, &basenow, flags); + __run_hrtimer(cpu_base, base, timer, basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } @@ -1881,7 +2010,7 @@ static __latent_entropy void hrtimer_run_softirq(void) now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); - cpu_base->softirq_activated = 0; + cpu_base->softirq_activated = false; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1891,6 +2020,63 @@ static __latent_entropy void hrtimer_run_softirq(void) #ifdef CONFIG_HIGH_RES_TIMERS /* + * Very similar to hrtimer_force_reprogram(), except it deals with + * deferred_rearm and hang_detected. + */ +static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred) +{ + cpu_base->expires_next = expires_next; + cpu_base->deferred_rearm = false; + + if (unlikely(cpu_base->hang_detected)) { + /* + * Give the system a chance to do something else than looping + * on hrtimer interrupts. + */ + expires_next = ktime_add_ns(ktime_get(), + min(100 * NSEC_PER_MSEC, cpu_base->max_hang_time)); + } + hrtimer_rearm_event(expires_next, deferred); +} + +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +void __hrtimer_rearm_deferred(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires_next; + + if (!cpu_base->deferred_rearm) + return; + + guard(raw_spinlock)(&cpu_base->lock); + if (cpu_base->deferred_needs_update) { + hrtimer_update_base(cpu_base); + expires_next = hrtimer_update_next_event(cpu_base); + } else { + /* No timer added/removed. 
Use the cached value */ + expires_next = cpu_base->deferred_expires_next; + } + hrtimer_rearm(cpu_base, expires_next, true); +} + +static __always_inline void +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) +{ + /* hrtimer_interrupt() just re-evaluated the first expiring timer */ + cpu_base->deferred_needs_update = false; + /* Cache the expiry time */ + cpu_base->deferred_expires_next = expires_next; + set_thread_flag(TIF_HRTIMER_REARM); +} +#else /* CONFIG_HRTIMER_REARM_DEFERRED */ +static __always_inline void +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) +{ + hrtimer_rearm(cpu_base, expires_next, false); +} +#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ + +/* * High resolution timer interrupt * Called with interrupts disabled */ @@ -1904,86 +2090,55 @@ void hrtimer_interrupt(struct clock_event_device *dev) BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; + dev->next_event_forced = 0; raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: - cpu_base->in_hrtirq = 1; + cpu_base->deferred_rearm = true; /* - * We set expires_next to KTIME_MAX here with cpu_base->lock - * held to prevent that a timer is enqueued in our queue via - * the migration code. This does not affect enqueueing of - * timers which run their callback and need to be requeued on - * this CPU. + * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue + * timers while __hrtimer_run_queues() is expiring the clock bases. + * Timers which are re/enqueued on the local CPU are not affected by + * this. 
*/ cpu_base->expires_next = KTIME_MAX; if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->softirq_activated = 1; + cpu_base->softirq_activated = true; raise_timer_softirq(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); - /* Reevaluate the clock bases for the [soft] next expiry */ - expires_next = hrtimer_update_next_event(cpu_base); - /* - * Store the new expiry value so the migration code can verify - * against it. - */ - cpu_base->expires_next = expires_next; - cpu_base->in_hrtirq = 0; - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - - /* Reprogramming necessary ? */ - if (!tick_program_event(expires_next, 0)) { - cpu_base->hang_detected = 0; - return; - } - /* * The next timer was already expired due to: * - tracing * - long lasting callbacks * - being scheduled away when running in a VM * - * We need to prevent that we loop forever in the hrtimer - * interrupt routine. We give it 3 attempts to avoid - * overreacting on some spurious event. - * - * Acquire base lock for updating the offsets and retrieving - * the current time. + * We need to prevent that we loop forever in the hrtimer interrupt + * routine. We give it 3 attempts to avoid overreacting on some + * spurious event. */ - raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); - cpu_base->nr_retries++; - if (++retries < 3) - goto retry; - /* - * Give the system a chance to do something else than looping - * here. We stored the entry time, so we know exactly how long - * we spent here. We schedule the next event this amount of - * time away. 
- */ - cpu_base->nr_hangs++; - cpu_base->hang_detected = 1; - raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + expires_next = hrtimer_update_next_event(cpu_base); + cpu_base->hang_detected = false; + if (expires_next < now) { + if (++retries < 3) + goto retry; + + delta = ktime_sub(now, entry_time); + cpu_base->max_hang_time = max_t(unsigned int, cpu_base->max_hang_time, delta); + cpu_base->nr_hangs++; + cpu_base->hang_detected = true; + } - delta = ktime_sub(now, entry_time); - if ((unsigned int)delta > cpu_base->max_hang_time) - cpu_base->max_hang_time = (unsigned int) delta; - /* - * Limit it to a sensible value as we enforce a longer - * delay. Give the CPU at least 100ms to catch up. - */ - if (delta > 100 * NSEC_PER_MSEC) - expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); - else - expires_next = ktime_add(now, delta); - tick_program_event(expires_next, 1); - pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); + hrtimer_interrupt_rearm(cpu_base, expires_next); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } + #endif /* !CONFIG_HIGH_RES_TIMERS */ /* @@ -2015,7 +2170,7 @@ void hrtimer_run_queues(void) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->softirq_activated = 1; + cpu_base->softirq_activated = true; raise_timer_softirq(HRTIMER_SOFTIRQ); } @@ -2028,8 +2183,7 @@ void hrtimer_run_queues(void) */ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { - struct hrtimer_sleeper *t = - container_of(timer, struct hrtimer_sleeper, timer); + struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); struct task_struct *task = t->task; t->task = NULL; @@ -2047,14 +2201,13 @@ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) */ -void hrtimer_sleeper_start_expires(struct 
hrtimer_sleeper *sl, - enum hrtimer_mode mode) +void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { /* * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because - * __hrtimer_init_sleeper() determines the delivery mode on RT so the + * __hrtimer_setup_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) @@ -2064,8 +2217,8 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); -static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) +static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, + enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly @@ -2091,8 +2244,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, mode |= HRTIMER_MODE_HARD; } - __hrtimer_init(&sl->timer, clock_id, mode); - sl->timer.function = hrtimer_wakeup; + __hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode); sl->task = current; } @@ -2102,11 +2254,11 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, * @clock_id: the clock to be used * @mode: timer mode abs/rel */ -void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, - clockid_t clock_id, enum hrtimer_mode mode) +void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, + enum hrtimer_mode mode) { - debug_init_on_stack(&sl->timer, clock_id, mode); - __hrtimer_init_sleeper(sl, clock_id, mode); + debug_setup_on_stack(&sl->timer, clock_id, mode); + __hrtimer_setup_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack); @@ -2170,18 +2322,17 @@ static long __sched 
hrtimer_nanosleep_restart(struct restart_block *restart) int ret; hrtimer_setup_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); - hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + hrtimer_set_expires(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } -long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, - const clockid_t clockid) +long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; - int ret = 0; + int ret; hrtimer_setup_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, current->timer_slack_ns); @@ -2197,7 +2348,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, restart = &current->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; - restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + restart->nanosleep.expires = hrtimer_get_expires(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); @@ -2220,8 +2371,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? 
TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; - return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, - CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif @@ -2229,7 +2379,7 @@ SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, - struct old_timespec32 __user *, rmtp) + struct old_timespec32 __user *, rmtp) { struct timespec64 tu; @@ -2242,8 +2392,7 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; - return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, - CLOCK_MONOTONIC); + return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif @@ -2253,14 +2402,13 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); - int i; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); - timerqueue_init_head(&clock_b->active); + timerqueue_linked_init_head(&clock_b->active); } cpu_base->cpu = cpu; @@ -2274,13 +2422,14 @@ int hrtimers_cpu_starting(unsigned int cpu) /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; - cpu_base->hres_active = 0; - cpu_base->hang_detected = 0; + cpu_base->hres_active = false; + cpu_base->hang_detected = false; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; 
cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->online = 1; + cpu_base->softirq_activated = false; + cpu_base->online = true; return 0; } @@ -2289,20 +2438,20 @@ int hrtimers_cpu_starting(unsigned int cpu) static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { + struct timerqueue_linked_node *node; struct hrtimer *timer; - struct timerqueue_node *node; - while ((node = timerqueue_getnext(&old_base->active))) { - timer = container_of(node, struct hrtimer, node); + while ((node = timerqueue_linked_first(&old_base->active))) { + timer = hrtimer_from_timerqueue_node(node); BUG_ON(hrtimer_callback_running(timer)); - debug_deactivate(timer); + debug_hrtimer_deactivate(timer); /* * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ - __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, false); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not @@ -2312,13 +2461,13 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * sort out already expired timers and reprogram the * event device. 
*/ - enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS, true); } } int hrtimers_cpu_dying(unsigned int dying_cpu) { - int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + int ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER)); struct hrtimer_cpu_base *old_base, *new_base; old_base = this_cpu_ptr(&hrtimer_bases); @@ -2331,21 +2480,14 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) raw_spin_lock(&old_base->lock); raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - migrate_hrtimer_list(&old_base->clock_base[i], - &new_base->clock_base[i]); - } + for (int i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); - /* - * The migration might have changed the first expiring softirq - * timer on this CPU. Update it. - */ - __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); - old_base->online = 0; + old_base->online = false; raw_spin_unlock(&old_base->lock); return 0; diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 876d389b2e21..7c6110e964e7 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -163,8 +163,7 @@ void posixtimer_rearm_itimer(struct task_struct *tsk) struct hrtimer *tmr = &tsk->signal->real_timer; if (!hrtimer_is_queued(tmr) && tsk->signal->it_real_incr != 0) { - hrtimer_forward(tmr, tmr->base->get_time(), - tsk->signal->it_real_incr); + hrtimer_forward_now(tmr, tsk->signal->it_real_incr); hrtimer_restart(tmr); } } diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index bc4db9e5ab70..1c954f330dfe 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -32,7 +32,6 @@ static u64 jiffies_read(struct clocksource *cs) static struct 
clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ - .uncertainty_margin = 32 * NSEC_PER_MSEC, .read = jiffies_read, .mask = CLOCKSOURCE_MASK(32), .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ @@ -75,13 +74,11 @@ struct clocksource * __init __weak clocksource_default_clock(void) static struct clocksource refined_jiffies; -int register_refined_jiffies(long cycles_per_second) +void __init register_refined_jiffies(long cycles_per_second) { u64 nsec_per_tick, shift_hz; long cycles_per_tick; - - refined_jiffies = clocksource_jiffies; refined_jiffies.name = "refined-jiffies"; refined_jiffies.rating++; @@ -100,5 +97,221 @@ int register_refined_jiffies(long cycles_per_second) refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; __clocksource_register(&refined_jiffies); - return 0; } + +#ifdef CONFIG_PROC_SYSCTL +static ulong mult_hz(const ulong val) +{ + return val * HZ; +} + +static ulong div_hz(const ulong val) +{ + return val / HZ; +} + +static int sysctl_u2k_int_conv_hz(const bool *negp, const ulong *u_ptr, int *k_ptr) +{ + return proc_int_u2k_conv_uop(u_ptr, k_ptr, negp, mult_hz); +} + +static int sysctl_k2u_int_conv_hz(bool *negp, ulong *u_ptr, const int *k_ptr) +{ + return proc_int_k2u_conv_kop(u_ptr, k_ptr, negp, div_hz); +} + +static int sysctl_u2k_int_conv_userhz(const bool *negp, const ulong *u_ptr, int *k_ptr) +{ + return proc_int_u2k_conv_uop(u_ptr, k_ptr, negp, clock_t_to_jiffies); +} + +static ulong sysctl_jiffies_to_clock_t(const ulong val) +{ + return jiffies_to_clock_t(val); +} + +static int sysctl_k2u_int_conv_userhz(bool *negp, ulong *u_ptr, const int *k_ptr) +{ + return proc_int_k2u_conv_kop(u_ptr, k_ptr, negp, sysctl_jiffies_to_clock_t); +} + +static ulong sysctl_msecs_to_jiffies(const ulong val) +{ + return msecs_to_jiffies(val); +} + +static int sysctl_u2k_int_conv_ms(const bool *negp, const ulong *u_ptr, int *k_ptr) +{ + return proc_int_u2k_conv_uop(u_ptr, k_ptr, negp, 
sysctl_msecs_to_jiffies); +} + +static ulong sysctl_jiffies_to_msecs(const ulong val) +{ + return jiffies_to_msecs(val); +} + +static int sysctl_k2u_int_conv_ms(bool *negp, ulong *u_ptr, const int *k_ptr) +{ + return proc_int_k2u_conv_kop(u_ptr, k_ptr, negp, sysctl_jiffies_to_msecs); +} + +static int do_proc_int_conv_jiffies(bool *negp, ulong *u_ptr, int *k_ptr, + int dir, const struct ctl_table *tbl) +{ + return proc_int_conv(negp, u_ptr, k_ptr, dir, tbl, false, + sysctl_u2k_int_conv_hz, sysctl_k2u_int_conv_hz); +} + +static int do_proc_int_conv_userhz_jiffies(bool *negp, ulong *u_ptr, + int *k_ptr, int dir, + const struct ctl_table *tbl) +{ + return proc_int_conv(negp, u_ptr, k_ptr, dir, tbl, false, + sysctl_u2k_int_conv_userhz, + sysctl_k2u_int_conv_userhz); +} + +static int do_proc_int_conv_ms_jiffies(bool *negp, ulong *u_ptr, int *k_ptr, + int dir, const struct ctl_table *tbl) +{ + return proc_int_conv(negp, u_ptr, k_ptr, dir, tbl, false, + sysctl_u2k_int_conv_ms, sysctl_k2u_int_conv_ms); +} + +static int do_proc_int_conv_ms_jiffies_minmax(bool *negp, ulong *u_ptr, + int *k_ptr, int dir, + const struct ctl_table *tbl) +{ + return proc_int_conv(negp, u_ptr, k_ptr, dir, tbl, false, + sysctl_u2k_int_conv_ms, sysctl_k2u_int_conv_ms); +} + +#else // CONFIG_PROC_SYSCTL +static int do_proc_int_conv_jiffies(bool *negp, ulong *u_ptr, int *k_ptr, + int dir, const struct ctl_table *tbl) +{ + return -ENOSYS; +} + +static int do_proc_int_conv_userhz_jiffies(bool *negp, ulong *u_ptr, + int *k_ptr, int dir, + const struct ctl_table *tbl) +{ + return -ENOSYS; +} + +static int do_proc_int_conv_ms_jiffies(bool *negp, ulong *u_ptr, int *k_ptr, + int dir, const struct ctl_table *tbl) +{ + return -ENOSYS; +} + +static int do_proc_int_conv_ms_jiffies_minmax(bool *negp, ulong *u_ptr, + int *k_ptr, int dir, + const struct ctl_table *tbl) +{ + return -ENOSYS; +} +#endif + +/** + * proc_dointvec_jiffies - read a vector of integers as seconds + * @table: the sysctl table + * @dir: %TRUE 
if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in seconds, and are converted into + * jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_jiffies(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_jiffies); + +/** + * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: pointer to the file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/USER_HZ seconds, and + * are converted into jiffies. + * + * Returns 0 on success. + */ +int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_userhz_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); + +/** + * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: the current position in the file + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * The values read are assumed to be in 1/1000 seconds, and + * are converted into jiffies. + * + * Returns 0 on success. 
+ */ +int proc_dointvec_ms_jiffies(const struct ctl_table *table, int dir, void *buffer, + size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_ms_jiffies); +} +EXPORT_SYMBOL(proc_dointvec_ms_jiffies); + +int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_dointvec_conv(table, dir, buffer, lenp, ppos, + do_proc_int_conv_ms_jiffies_minmax); +} + +/** + * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values + * @table: the sysctl table + * @dir: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. The values + * are treated as milliseconds, and converted to jiffies when they are stored. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. 
+ */ +int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int dir, + void *buffer, size_t *lenp, loff_t *ppos) +{ + return proc_doulongvec_minmax_conv(table, dir, buffer, lenp, ppos, + HZ, 1000l); +} +EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); + diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 0775b9ec952a..4bca3f78c8ea 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -12,13 +12,15 @@ #include <linux/seq_file.h> #include <linux/proc_ns.h> #include <linux/export.h> +#include <linux/nstree.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/err.h> #include <linux/mm.h> +#include <linux/cleanup.h> -#include <vdso/datapage.h> +#include "namespace_internal.h" ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, struct timens_offsets *ns_offsets) @@ -88,29 +90,27 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, goto fail; err = -ENOMEM; - ns = kmalloc(sizeof(*ns), GFP_KERNEL_ACCOUNT); + ns = kzalloc_obj(*ns, GFP_KERNEL_ACCOUNT); if (!ns) goto fail_dec; - refcount_set(&ns->ns.count, 1); - - ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); - if (!ns->vvar_page) + err = timens_vdso_alloc_vvar_page(ns); + if (err) goto fail_free; - err = ns_alloc_inum(&ns->ns); + err = ns_common_init(ns); if (err) goto fail_free_page; ns->ucounts = ucounts; - ns->ns.ops = &timens_operations; ns->user_ns = get_user_ns(user_ns); ns->offsets = old_ns->offsets; ns->frozen_offsets = false; + ns_tree_add(ns); return ns; fail_free_page: - __free_page(ns->vvar_page); + timens_vdso_free_vvar_page(ns); fail_free: kfree(ns); fail_dec: @@ -130,7 +130,7 @@ fail: * * Return: timens_for_children namespace or ERR_PTR. 
*/ -struct time_namespace *copy_time_ns(unsigned long flags, +struct time_namespace *copy_time_ns(u64 flags, struct user_namespace *user_ns, struct time_namespace *old_ns) { if (!(flags & CLONE_NEWTIME)) @@ -139,155 +139,47 @@ struct time_namespace *copy_time_ns(unsigned long flags, return clone_time_ns(user_ns, old_ns); } -static struct timens_offset offset_from_ts(struct timespec64 off) -{ - struct timens_offset ret; - - ret.sec = off.tv_sec; - ret.nsec = off.tv_nsec; - - return ret; -} - -/* - * A time namespace VVAR page has the same layout as the VVAR page which - * contains the system wide VDSO data. - * - * For a normal task the VVAR pages are installed in the normal ordering: - * VVAR - * PVCLOCK - * HVCLOCK - * TIMENS <- Not really required - * - * Now for a timens task the pages are installed in the following order: - * TIMENS - * PVCLOCK - * HVCLOCK - * VVAR - * - * The check for vdso_data->clock_mode is in the unlikely path of - * the seq begin magic. So for the non-timens case most of the time - * 'seq' is even, so the branch is not taken. - * - * If 'seq' is odd, i.e. a concurrent update is in progress, the extra check - * for vdso_data->clock_mode is a non-issue. The task is spin waiting for the - * update to finish and for 'seq' to become even anyway. - * - * Timens page has vdso_data->clock_mode set to VDSO_CLOCKMODE_TIMENS which - * enforces the time namespace handling path. 
- */ -static void timens_setup_vdso_data(struct vdso_data *vdata, - struct time_namespace *ns) -{ - struct timens_offset *offset = vdata->offset; - struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); - struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); - - vdata->seq = 1; - vdata->clock_mode = VDSO_CLOCKMODE_TIMENS; - offset[CLOCK_MONOTONIC] = monotonic; - offset[CLOCK_MONOTONIC_RAW] = monotonic; - offset[CLOCK_MONOTONIC_COARSE] = monotonic; - offset[CLOCK_BOOTTIME] = boottime; - offset[CLOCK_BOOTTIME_ALARM] = boottime; -} - -struct page *find_timens_vvar_page(struct vm_area_struct *vma) -{ - if (likely(vma->vm_mm == current->mm)) - return current->nsproxy->time_ns->vvar_page; - - /* - * VM_PFNMAP | VM_IO protect .fault() handler from being called - * through interfaces like /proc/$pid/mem or - * process_vm_{readv,writev}() as long as there's no .access() - * in special_mapping_vmops(). - * For more details check_vma_flags() and __access_remote_vm() - */ - - WARN(1, "vvar_page accessed remotely"); - - return NULL; -} - -/* - * Protects possibly multiple offsets writers racing each other - * and tasks entering the namespace. - */ -static DEFINE_MUTEX(offset_lock); - -static void timens_set_vvar_page(struct task_struct *task, - struct time_namespace *ns) -{ - struct vdso_data *vdata; - unsigned int i; - - if (ns == &init_time_ns) - return; - - /* Fast-path, taken by every task in namespace except the first. */ - if (likely(ns->frozen_offsets)) - return; - - mutex_lock(&offset_lock); - /* Nothing to-do: vvar_page has been already initialized. 
*/ - if (ns->frozen_offsets) - goto out; - - ns->frozen_offsets = true; - vdata = arch_get_vdso_data(page_address(ns->vvar_page)); - - for (i = 0; i < CS_BASES; i++) - timens_setup_vdso_data(&vdata[i], ns); - -out: - mutex_unlock(&offset_lock); -} +DEFINE_MUTEX(timens_offset_lock); void free_time_ns(struct time_namespace *ns) { + ns_tree_remove(ns); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); - ns_free_inum(&ns->ns); - __free_page(ns->vvar_page); - kfree(ns); -} - -static struct time_namespace *to_time_ns(struct ns_common *ns) -{ - return container_of(ns, struct time_namespace, ns); + ns_common_free(ns); + timens_vdso_free_vvar_page(ns); + /* Concurrent nstree traversal depends on a grace period. */ + kfree_rcu(ns, ns.ns_rcu); } static struct ns_common *timens_get(struct task_struct *task) { - struct time_namespace *ns = NULL; + struct time_namespace *ns; struct nsproxy *nsproxy; - task_lock(task); + guard(task_lock)(task); nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->time_ns; - get_time_ns(ns); - } - task_unlock(task); + if (!nsproxy) + return NULL; - return ns ? &ns->ns : NULL; + ns = nsproxy->time_ns; + get_time_ns(ns); + return &ns->ns; } static struct ns_common *timens_for_children_get(struct task_struct *task) { - struct time_namespace *ns = NULL; + struct time_namespace *ns; struct nsproxy *nsproxy; - task_lock(task); + guard(task_lock)(task); nsproxy = task->nsproxy; - if (nsproxy) { - ns = nsproxy->time_ns_for_children; - get_time_ns(ns); - } - task_unlock(task); + if (!nsproxy) + return NULL; - return ns ? 
&ns->ns : NULL; + ns = nsproxy->time_ns_for_children; + get_time_ns(ns); + return &ns->ns; } static void timens_put(struct ns_common *ns) @@ -295,12 +187,6 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } -void timens_commit(struct task_struct *tsk, struct time_namespace *ns) -{ - timens_set_vvar_page(tsk, ns); - vdso_join_timens(tsk, ns); -} - static int timens_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; @@ -364,36 +250,33 @@ static void show_offset(struct seq_file *m, int clockid, struct timespec64 *ts) void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m) { - struct ns_common *ns; - struct time_namespace *time_ns; + struct time_namespace *time_ns __free(time_ns) = NULL; + struct ns_common *ns = timens_for_children_get(p); - ns = timens_for_children_get(p); if (!ns) return; + time_ns = to_time_ns(ns); show_offset(m, CLOCK_MONOTONIC, &time_ns->offsets.monotonic); show_offset(m, CLOCK_BOOTTIME, &time_ns->offsets.boottime); - put_time_ns(time_ns); } int proc_timens_set_offset(struct file *file, struct task_struct *p, struct proc_timens_offset *offsets, int noffsets) { - struct ns_common *ns; - struct time_namespace *time_ns; + struct time_namespace *time_ns __free(time_ns) = NULL; + struct ns_common *ns = timens_for_children_get(p); struct timespec64 tp; - int i, err; + int i; - ns = timens_for_children_get(p); if (!ns) return -ESRCH; + time_ns = to_time_ns(ns); - if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) { - put_time_ns(time_ns); + if (!file_ns_capable(file, time_ns->user_ns, CAP_SYS_TIME)) return -EPERM; - } for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; @@ -406,15 +289,12 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, ktime_get_boottime_ts64(&tp); break; default: - err = -EINVAL; - goto out; + return -EINVAL; } - err = -ERANGE; - if (off->val.tv_sec > KTIME_SEC_MAX || off->val.tv_sec < 
-KTIME_SEC_MAX) - goto out; + return -ERANGE; tp = timespec64_add(tp, off->val); /* @@ -422,16 +302,13 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, * still unreachable. */ if (tp.tv_sec < 0 || tp.tv_sec > KTIME_SEC_MAX / 2) - goto out; + return -ERANGE; } - mutex_lock(&offset_lock); - if (time_ns->frozen_offsets) { - err = -EACCES; - goto out_unlock; - } + guard(mutex)(&timens_offset_lock); + if (time_ns->frozen_offsets) + return -EACCES; - err = 0; /* Don't report errors after this line */ for (i = 0; i < noffsets; i++) { struct proc_timens_offset *off = &offsets[i]; @@ -449,17 +326,11 @@ int proc_timens_set_offset(struct file *file, struct task_struct *p, *offset = off->val; } -out_unlock: - mutex_unlock(&offset_lock); -out: - put_time_ns(time_ns); - - return err; + return 0; } const struct proc_ns_operations timens_operations = { .name = "time", - .type = CLONE_NEWTIME, .get = timens_get, .put = timens_put, .install = timens_install, @@ -469,7 +340,6 @@ const struct proc_ns_operations timens_operations = { const struct proc_ns_operations timens_for_children_operations = { .name = "time_for_children", .real_ns_name = "time", - .type = CLONE_NEWTIME, .get = timens_for_children_get, .put = timens_put, .install = timens_install, @@ -477,9 +347,12 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.count = REFCOUNT_INIT(3), + .ns = NS_COMMON_INIT(init_time_ns), .user_ns = &init_user_ns, - .ns.inum = PROC_TIME_INIT_INO, - .ns.ops = &timens_operations, .frozen_offsets = true, }; + +void __init time_ns_init(void) +{ + ns_tree_add(&init_time_ns); +} diff --git a/kernel/time/namespace_internal.h b/kernel/time/namespace_internal.h new file mode 100644 index 000000000000..b37ba179f43b --- /dev/null +++ b/kernel/time/namespace_internal.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _TIME_NAMESPACE_INTERNAL_H +#define _TIME_NAMESPACE_INTERNAL_H + +#include 
<linux/mutex.h> + +struct time_namespace; + +/* + * Protects possibly multiple offsets writers racing each other + * and tasks entering the namespace. + */ +extern struct mutex timens_offset_lock; + +#ifdef CONFIG_TIME_NS_VDSO +int timens_vdso_alloc_vvar_page(struct time_namespace *ns); +void timens_vdso_free_vvar_page(struct time_namespace *ns); +#else /* !CONFIG_TIME_NS_VDSO */ +static inline int timens_vdso_alloc_vvar_page(struct time_namespace *ns) +{ + return 0; +} +static inline void timens_vdso_free_vvar_page(struct time_namespace *ns) +{ +} +#endif /* CONFIG_TIME_NS_VDSO */ + +#endif /* _TIME_NAMESPACE_INTERNAL_H */ diff --git a/kernel/time/namespace_vdso.c b/kernel/time/namespace_vdso.c new file mode 100644 index 000000000000..0d74d160eec9 --- /dev/null +++ b/kernel/time/namespace_vdso.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author: Andrei Vagin <avagin@openvz.org> + * Author: Dmitry Safonov <dima@arista.com> + */ + +#include <linux/cleanup.h> +#include <linux/mm.h> +#include <linux/time_namespace.h> +#include <linux/time.h> +#include <linux/vdso_datastore.h> + +#include <vdso/clocksource.h> +#include <vdso/datapage.h> + +#include "namespace_internal.h" + +static struct timens_offset offset_from_ts(struct timespec64 off) +{ + struct timens_offset ret; + + ret.sec = off.tv_sec; + ret.nsec = off.tv_nsec; + + return ret; +} + +/* + * A time namespace VVAR page has the same layout as the VVAR page which + * contains the system wide VDSO data. + * + * For a normal task the VVAR pages are installed in the normal ordering: + * VVAR + * PVCLOCK + * HVCLOCK + * TIMENS <- Not really required + * + * Now for a timens task the pages are installed in the following order: + * TIMENS + * PVCLOCK + * HVCLOCK + * VVAR + * + * The check for vdso_clock->clock_mode is in the unlikely path of + * the seq begin magic. So for the non-timens case most of the time + * 'seq' is even, so the branch is not taken. + * + * If 'seq' is odd, i.e. 
a concurrent update is in progress, the extra check + * for vdso_clock->clock_mode is a non-issue. The task is spin waiting for the + * update to finish and for 'seq' to become even anyway. + * + * Timens page has vdso_clock->clock_mode set to VDSO_CLOCKMODE_TIMENS which + * enforces the time namespace handling path. + */ +static void timens_setup_vdso_clock_data(struct vdso_clock *vc, + struct time_namespace *ns) +{ + struct timens_offset *offset = vc->offset; + struct timens_offset monotonic = offset_from_ts(ns->offsets.monotonic); + struct timens_offset boottime = offset_from_ts(ns->offsets.boottime); + + vc->seq = 1; + vc->clock_mode = VDSO_CLOCKMODE_TIMENS; + offset[CLOCK_MONOTONIC] = monotonic; + offset[CLOCK_MONOTONIC_RAW] = monotonic; + offset[CLOCK_MONOTONIC_COARSE] = monotonic; + offset[CLOCK_BOOTTIME] = boottime; + offset[CLOCK_BOOTTIME_ALARM] = boottime; +} + +struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops(). + * For more details check_vma_flags() and __access_remote_vm() + */ + + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} + +static void timens_set_vvar_page(struct task_struct *task, + struct time_namespace *ns) +{ + struct vdso_time_data *vdata; + struct vdso_clock *vc; + unsigned int i; + + if (ns == &init_time_ns) + return; + + /* Fast-path, taken by every task in namespace except the first. */ + if (likely(ns->frozen_offsets)) + return; + + guard(mutex)(&timens_offset_lock); + /* Nothing to-do: vvar_page has been already initialized. 
*/ + if (ns->frozen_offsets) + return; + + ns->frozen_offsets = true; + vdata = page_address(ns->vvar_page); + vc = vdata->clock_data; + + for (i = 0; i < CS_BASES; i++) + timens_setup_vdso_clock_data(&vc[i], ns); + + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) { + for (i = 0; i < ARRAY_SIZE(vdata->aux_clock_data); i++) + timens_setup_vdso_clock_data(&vdata->aux_clock_data[i], ns); + } +} + +/* + * The vvar page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will be re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_clock_data() for details. + */ +static int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); + + guard(mmap_read_lock)(mm); + for_each_vma(vmi, vma) { + if (vma_is_special_mapping(vma, &vdso_vvar_mapping)) + zap_vma(vma); + } + return 0; +} + +void timens_commit(struct task_struct *tsk, struct time_namespace *ns) +{ + timens_set_vvar_page(tsk, ns); + vdso_join_timens(tsk, ns); +} + +int timens_vdso_alloc_vvar_page(struct time_namespace *ns) +{ + ns->vvar_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (!ns->vvar_page) + return -ENOMEM; + + return 0; +} + +void timens_vdso_free_vvar_page(struct time_namespace *ns) +{ + __free_page(ns->vvar_page); +} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 163e7a2033b6..97fa99b96dd0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/rtc.h> #include <linux/audit.h> +#include <linux/timekeeper_internal.h> #include "ntp_internal.h" #include "timekeeping_internal.h" @@ -86,14 +87,16 @@ struct ntp_data { #endif }; -static struct ntp_data tk_ntp_data = { - .tick_usec = USER_TICK_USEC, - .time_state = TIME_OK, - .time_status = STA_UNSYNC, - .time_constant = 2, - 
.time_maxerror = NTP_PHASE_LIMIT, - .time_esterror = NTP_PHASE_LIMIT, - .ntp_next_leap_sec = TIME64_MAX, +static struct ntp_data tk_ntp_data[TIMEKEEPERS_MAX] = { + [ 0 ... TIMEKEEPERS_MAX - 1 ] = { + .tick_usec = USER_TICK_USEC, + .time_state = TIME_OK, + .time_status = STA_UNSYNC, + .time_constant = 2, + .time_maxerror = NTP_PHASE_LIMIT, + .time_esterror = NTP_PHASE_LIMIT, + .ntp_next_leap_sec = TIME64_MAX, + }, }; #define SECS_PER_DAY 86400 @@ -300,7 +303,7 @@ static void ntp_update_offset(struct ntp_data *ntpdata, long offset) * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - real_secs = __ktime_get_real_seconds(); + real_secs = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); secs = (long)(real_secs - ntpdata->time_reftime); if (unlikely(ntpdata->time_status & STA_FREQHOLD)) secs = 0; @@ -348,33 +351,38 @@ static void __ntp_clear(struct ntp_data *ntpdata) /** * ntp_clear - Clears the NTP state variables + * @tkid: Timekeeper ID to be able to select proper ntp data array member */ -void ntp_clear(void) +void ntp_clear(unsigned int tkid) { - __ntp_clear(&tk_ntp_data); + __ntp_clear(&tk_ntp_data[tkid]); } -u64 ntp_tick_length(void) +u64 ntp_tick_length(unsigned int tkid) { - return tk_ntp_data.tick_length; + return tk_ntp_data[tkid].tick_length; } /** * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * @tkid: Timekeeper ID * - * Provides the time of the next leapsecond against CLOCK_REALTIME in - * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. + * Returns: For @tkid == TIMEKEEPER_CORE this provides the time of the next + * leap second against CLOCK_REALTIME in a ktime_t format if a + * leap second is pending. KTIME_MAX otherwise. 
*/ -ktime_t ntp_get_next_leap(void) +ktime_t ntp_get_next_leap(unsigned int tkid) { - struct ntp_data *ntpdata = &tk_ntp_data; - ktime_t ret; + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; + + if (tkid != TIMEKEEPER_CORE) + return KTIME_MAX; if ((ntpdata->time_state == TIME_INS) && (ntpdata->time_status & STA_INS)) return ktime_set(ntpdata->ntp_next_leap_sec, 0); - ret = KTIME_MAX; - return ret; + + return KTIME_MAX; } /* @@ -387,9 +395,9 @@ ktime_t ntp_get_next_leap(void) * * Also handles leap second processing, and returns leap offset */ -int second_overflow(time64_t secs) +int second_overflow(unsigned int tkid, time64_t secs) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; s64 delta; int leap = 0; s32 rem; @@ -605,7 +613,7 @@ static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_ns */ static inline bool ntp_synced(void) { - return !(tk_ntp_data.time_status & STA_UNSYNC); + return !(tk_ntp_data[TIMEKEEPER_CORE].time_status & STA_UNSYNC); } /* @@ -678,8 +686,7 @@ void ntp_notify_cmos_timer(bool offset_set) static void __init ntp_init_cmos_sync(void) { - hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - sync_hrtimer.function = sync_timer_callback; + hrtimer_setup(&sync_hrtimer, sync_timer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS); } #else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ static inline void __init ntp_init_cmos_sync(void) { } @@ -703,7 +710,7 @@ static inline void process_adj_status(struct ntp_data *ntpdata, const struct __k * reference time to current time. 
*/ if (!(ntpdata->time_status & STA_PLL) && (txc->status & STA_PLL)) - ntpdata->time_reftime = __ktime_get_real_seconds(); + ntpdata->time_reftime = ktime_get_ntp_seconds(ntpdata - tk_ntp_data); /* only set allowed bits */ ntpdata->time_status &= STA_RONLY; @@ -760,10 +767,10 @@ static inline void process_adjtimex_modes(struct ntp_data *ntpdata, const struct * adjtimex() mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ -int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad) +int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad) { - struct ntp_data *ntpdata = &tk_ntp_data; + struct ntp_data *ntpdata = &tk_ntp_data[tkid]; int result; if (txc->modes & ADJ_ADJTIME) { @@ -1032,8 +1039,8 @@ static void hardpps_update_phase(struct ntp_data *ntpdata, long error) */ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { + struct ntp_data *ntpdata = &tk_ntp_data[TIMEKEEPER_CORE]; struct pps_normtime pts_norm, freq_norm; - struct ntp_data *ntpdata = &tk_ntp_data; pts_norm = pps_normalize_ts(*phase_ts); @@ -1084,18 +1091,18 @@ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_t static int __init ntp_tick_adj_setup(char *str) { - int rc = kstrtos64(str, 0, &tk_ntp_data.ntp_tick_adj); + int rc = kstrtos64(str, 0, &tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj); if (rc) return rc; - tk_ntp_data.ntp_tick_adj <<= NTP_SCALE_SHIFT; + tk_ntp_data[TIMEKEEPER_CORE].ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; } - __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { - ntp_clear(); + for (int id = 0; id < TIMEKEEPERS_MAX; id++) + __ntp_clear(tk_ntp_data + id); ntp_init_cmos_sync(); } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 5a633dce9057..7084d839c207 100644 --- 
a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -3,14 +3,13 @@ #define _LINUX_NTP_INTERNAL_H extern void ntp_init(void); -extern void ntp_clear(void); +extern void ntp_clear(unsigned int tkid); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ -extern u64 ntp_tick_length(void); -extern ktime_t ntp_get_next_leap(void); -extern int second_overflow(time64_t secs); -extern int __do_adjtimex(struct __kernel_timex *txc, - const struct timespec64 *ts, - s32 *time_tai, struct audit_ntp_data *ad); +extern u64 ntp_tick_length(unsigned int tkid); +extern ktime_t ntp_get_next_leap(unsigned int tkid); +extern int second_overflow(unsigned int tkid, time64_t secs); +extern int ntp_adjtimex(unsigned int tkid, struct __kernel_timex *txc, const struct timespec64 *ts, + s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 1af0bb2cc45c..dab37295c8c2 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -90,26 +90,6 @@ static long posix_clock_ioctl(struct file *fp, return err; } -#ifdef CONFIG_COMPAT -static long posix_clock_compat_ioctl(struct file *fp, - unsigned int cmd, unsigned long arg) -{ - struct posix_clock_context *pccontext = fp->private_data; - struct posix_clock *clk = get_posix_clock(fp); - int err = -ENOTTY; - - if (!clk) - return -ENODEV; - - if (clk->ops.ioctl) - err = clk->ops.ioctl(pccontext, cmd, arg); - - put_posix_clock(clk); - - return err; -} -#endif - static int posix_clock_open(struct inode *inode, struct file *fp) { int err; @@ -123,12 +103,13 @@ static int posix_clock_open(struct inode *inode, struct file *fp) err = -ENODEV; goto out; } - pccontext = kzalloc(sizeof(*pccontext), GFP_KERNEL); + pccontext = kzalloc_obj(*pccontext); if (!pccontext) { err = -ENOMEM; goto out; } 
pccontext->clk = clk; + pccontext->fp = fp; if (clk->ops.open) { err = clk->ops.open(pccontext, fp->f_mode); if (err) { @@ -171,11 +152,9 @@ static const struct file_operations posix_clock_file_operations = { .read = posix_clock_read, .poll = posix_clock_poll, .unlocked_ioctl = posix_clock_ioctl, + .compat_ioctl = posix_clock_ioctl, .open = posix_clock_open, .release = posix_clock_release, -#ifdef CONFIG_COMPAT - .compat_ioctl = posix_clock_compat_ioctl, -#endif }; int posix_clock_register(struct posix_clock *clk, struct device *dev) @@ -251,7 +230,7 @@ static int pc_clock_adjtime(clockid_t id, struct __kernel_timex *tx) if (err) return err; - if ((cd.fp->f_mode & FMODE_WRITE) == 0) { + if (tx->modes && (cd.fp->f_mode & FMODE_WRITE) == 0) { err = -EACCES; goto out; } diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 50e8d04ab661..0de2bb7cbec0 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1406,6 +1406,15 @@ void run_posix_cpu_timers(void) lockdep_assert_irqs_disabled(); /* + * Ensure that release_task(tsk) can't happen while + * handle_posix_cpu_timers() is running. Otherwise, a concurrent + * posix_cpu_timer_del() may fail to lock_task_sighand(tsk) and + * miss timer->it.cpu.firing != 0. + */ + if (tsk->exit_state) + return; + + /* * If the actual expiry is deferred to task work context and the * work is already scheduled there is no point to do anything here. */ @@ -1548,7 +1557,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, * Report back to the user the time still remaining. 
*/ restart = &current->restart_block; - restart->nanosleep.expires = expires; + restart->nanosleep.expires = ns_to_ktime(expires); if (restart->nanosleep.type != TT_NONE) error = nanosleep_copyout(restart, &it.it_value); } @@ -1590,7 +1599,7 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) clockid_t which_clock = restart_block->nanosleep.clockid; struct timespec64 t; - t = ns_to_timespec64(restart_block->nanosleep.expires); + t = ktime_to_timespec64(restart_block->nanosleep.expires); return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1b675aee99a9..9331e1614124 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -9,34 +9,27 @@ * * These are all the functions necessary to implement POSIX clocks & timers */ -#include <linux/mm.h> +#include <linux/compat.h> +#include <linux/compiler.h> +#include <linux/init.h> +#include <linux/jhash.h> #include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/mutex.h> -#include <linux/sched/task.h> - -#include <linux/uaccess.h> #include <linux/list.h> -#include <linux/init.h> -#include <linux/compiler.h> -#include <linux/hash.h> +#include <linux/memblock.h> +#include <linux/nospec.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> +#include <linux/prctl.h> +#include <linux/sched/task.h> +#include <linux/slab.h> #include <linux/syscalls.h> -#include <linux/wait.h> -#include <linux/workqueue.h> -#include <linux/export.h> -#include <linux/hashtable.h> -#include <linux/compat.h> -#include <linux/nospec.h> +#include <linux/time.h> #include <linux/time_namespace.h> +#include <linux/uaccess.h> #include "timekeeping.h" #include "posix-timers.h" -static struct kmem_cache *posix_timers_cache; - /* * Timers are managed in a hash table for lockless lookup.
The hash key is * constructed from current::signal and the timer ID and the timer is @@ -46,39 +39,60 @@ static struct kmem_cache *posix_timers_cache; * This allows checkpoint/restore to reconstruct the exact timer IDs for * a process. */ -static DEFINE_HASHTABLE(posix_timers_hashtable, 9); -static DEFINE_SPINLOCK(hash_lock); +struct timer_hash_bucket { + spinlock_t lock; + struct hlist_head head; +}; + +static struct { + struct timer_hash_bucket *buckets; + unsigned long mask; + struct kmem_cache *cache; +} __timer_data __ro_after_init __aligned(4*sizeof(long)); + +#define timer_buckets (__timer_data.buckets) +#define timer_hashmask (__timer_data.mask) +#define posix_timers_cache (__timer_data.cache) static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; +#define TIMER_ANY_ID INT_MIN + /* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" 
#endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); +static struct k_itimer *lock_timer(timer_t timer_id); +static inline void unlock_timer(struct k_itimer *timr) +{ + if (likely((timr))) + spin_unlock_irq(&timr->it_lock); +} + +#define scoped_timer_get_or_fail(_id) \ + scoped_cond_guard(lock_timer, return -EINVAL, _id) -#define lock_timer(tid, flags) \ -({ struct k_itimer *__timr; \ - __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ - __timr; \ -}) +#define scoped_timer (scope) -static int hash(struct signal_struct *sig, unsigned int nr) +DEFINE_CLASS(lock_timer, struct k_itimer *, unlock_timer(_T), lock_timer(id), timer_t id); +DEFINE_CLASS_IS_COND_GUARD(lock_timer); + +static struct timer_hash_bucket *hash_bucket(struct signal_struct *sig, unsigned int nr) { - return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); + return &timer_buckets[jhash2((u32 *)&sig, sizeof(sig) / sizeof(u32), nr) & timer_hashmask]; } -static struct k_itimer *__posix_timers_find(struct hlist_head *head, - struct signal_struct *sig, - timer_t id) +static struct k_itimer *posix_timer_by_id(timer_t id) { + struct signal_struct *sig = current->signal; + struct timer_hash_bucket *bucket = hash_bucket(sig, id); struct k_itimer *timer; - hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { + hlist_for_each_entry_rcu(timer, &bucket->head, t_hash) { /* timer->it_signal can be set concurrently */ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; @@ -86,46 +100,88 @@ static struct k_itimer *__posix_timers_find(struct hlist_head *head, return NULL; } -static struct k_itimer *posix_timer_by_id(timer_t id) +static inline struct signal_struct *posix_sig_owner(const struct k_itimer *timer) { - struct signal_struct *sig = current->signal; - struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; + unsigned long val = (unsigned long)timer->it_signal; - return 
__posix_timers_find(head, sig, id); + /* + * Mask out bit 0, which acts as invalid marker to prevent + * posix_timer_by_id() detecting it as valid. + */ + return (struct signal_struct *)(val & ~1UL); } -static int posix_timer_add(struct k_itimer *timer) +static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_struct *sig, + timer_t id) { - struct signal_struct *sig = current->signal; - struct hlist_head *head; - unsigned int cnt, id; + struct hlist_head *head = &bucket->head; + struct k_itimer *timer; - /* - * FIXME: Replace this by a per signal struct xarray once there is - * a plan to handle the resulting CRIU regression gracefully. - */ - for (cnt = 0; cnt <= INT_MAX; cnt++) { - spin_lock(&hash_lock); - id = sig->next_posix_timer_id; + hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&bucket->lock)) { + if ((posix_sig_owner(timer) == sig) && (timer->it_id == id)) + return true; + } + return false; +} - /* Write the next ID back. Clamp it to the positive space */ - sig->next_posix_timer_id = (id + 1) & INT_MAX; +static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id) +{ + struct timer_hash_bucket *bucket = hash_bucket(sig, id); - head = &posix_timers_hashtable[hash(sig, id)]; - if (!__posix_timers_find(head, sig, id)) { - hlist_add_head_rcu(&timer->t_hash, head); - spin_unlock(&hash_lock); - return id; + scoped_guard (spinlock, &bucket->lock) { + /* + * Validate under the lock as this could have raced against + * another thread ending up with the same ID, which is + * highly unlikely, but possible. + */ + if (!posix_timer_hashed(bucket, sig, id)) { + /* + * Set the timer ID and the signal pointer to make + * it identifiable in the hash table. The signal + * pointer has bit 0 set to indicate that it is not + * yet fully initialized. posix_timer_hashed() + * masks this bit out, but the syscall lookup fails + * to match due to it being set. 
This guarantees + * that there can't be duplicate timer IDs handed + * out. + */ + timer->it_id = (timer_t)id; + timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL); + hlist_add_head_rcu(&timer->t_hash, &bucket->head); + return true; } - spin_unlock(&hash_lock); } - /* POSIX return code when no timer ID could be allocated */ - return -EAGAIN; + return false; } -static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +static int posix_timer_add(struct k_itimer *timer, int req_id) { - spin_unlock_irqrestore(&timr->it_lock, flags); + struct signal_struct *sig = current->signal; + + if (unlikely(req_id != TIMER_ANY_ID)) { + if (!posix_timer_add_at(timer, sig, req_id)) + return -EBUSY; + + /* + * Move the ID counter past the requested ID, so that after + * switching back to normal mode the IDs are outside of the + * exact allocated region. That avoids ID collisions on the + * next regular timer_create() invocations. + */ + atomic_set(&sig->next_posix_timer_id, req_id + 1); + return req_id; + } + + for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) { + /* Get the next timer ID and clamp it to positive space */ + unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX; + + if (posix_timer_add_at(timer, sig, id)) + return id; + cond_resched(); + } + /* POSIX return code when no timer ID could be allocated */ + return -EAGAIN; } static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) @@ -220,15 +276,6 @@ static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) return 0; } -static __init int init_posix_timers(void) -{ - posix_timers_cache = kmem_cache_create("posix_timers_cache", - sizeof(struct k_itimer), 0, - SLAB_PANIC | SLAB_ACCOUNT, NULL); - return 0; -} -__initcall(init_posix_timers); - /* * The siginfo si_overrun field and the return value of timer_getoverrun(2) * are of type int. 
Clamp the overrun value to INT_MAX @@ -245,8 +292,7 @@ static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; - timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), - timr->it_interval); + timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval); hrtimer_restart(timer); } @@ -259,7 +305,7 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it * since the signal was queued. In either case, don't rearm and * drop the signal. */ - if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!timr->it_signal)) + if (timr->it_signal_seq != timr->it_sigqueue_seq || WARN_ON_ONCE(!posixtimer_valid(timr))) return false; if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != POSIX_TIMER_REQUEUE_PENDING)) @@ -304,6 +350,9 @@ void posix_timer_queue_signal(struct k_itimer *timr) { lockdep_assert_held(&timr->it_lock); + if (!posixtimer_valid(timr)) + return; + timr->it_status = timr->it_interval ? 
POSIX_TIMER_REQUEUE_PENDING : POSIX_TIMER_DISARMED; posixtimer_send_sigqueue(timr); } @@ -324,6 +373,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +long posixtimer_create_prctl(unsigned long ctrl) +{ + switch (ctrl) { + case PR_TIMER_CREATE_RESTORE_IDS_OFF: + current->signal->timer_create_restore_ids = 0; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_ON: + current->signal->timer_create_restore_ids = 1; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_GET: + return current->signal->timer_create_restore_ids; + } + return -EINVAL; +} + static struct pid *good_sigevent(sigevent_t * event) { struct pid *pid = task_tgid(current); @@ -350,8 +414,12 @@ static struct pid *good_sigevent(sigevent_t * event) static struct k_itimer *alloc_posix_timer(void) { - struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + struct k_itimer *tmr; + if (unlikely(!posix_timers_cache)) + return NULL; + + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; @@ -373,15 +441,16 @@ void posixtimer_free_timer(struct k_itimer *tmr) static void posix_timer_unhash_and_free(struct k_itimer *tmr) { - spin_lock(&hash_lock); - hlist_del_rcu(&tmr->t_hash); - spin_unlock(&hash_lock); + struct timer_hash_bucket *bucket = hash_bucket(posix_sig_owner(tmr), tmr->it_id); + + scoped_guard (spinlock, &bucket->lock) + hlist_del_rcu(&tmr->t_hash); posixtimer_putref(tmr); } static int common_timer_create(struct k_itimer *new_timer) { - hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + hrtimer_setup(&new_timer->it.real.timer, posix_timer_fn, new_timer->it_clock, 0); return 0; } @@ -390,6 +459,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); + timer_t req_id = TIMER_ANY_ID; struct k_itimer *new_timer; int error, new_timer_id; @@ -398,6 +468,15 @@ static int 
do_timer_create(clockid_t which_clock, struct sigevent *event, if (!kc->timer_create) return -EOPNOTSUPP; + /* Special case for CRIU to restore timers with a given timer ID. */ + if (unlikely(current->signal->timer_create_restore_ids)) { + if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) + return -EFAULT; + /* Valid IDs are 0..INT_MAX */ + if ((unsigned int)req_id > INT_MAX) + return -EINVAL; + } + new_timer = alloc_posix_timer(); if (unlikely(!new_timer)) return -EAGAIN; @@ -406,24 +485,21 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, /* * Add the timer to the hash table. The timer is not yet valid - * because new_timer::it_signal is still NULL. The timer id is also - * not yet visible to user space. + * after insertion, but has a unique ID allocated. */ - new_timer_id = posix_timer_add(new_timer); + new_timer_id = posix_timer_add(new_timer, req_id); if (new_timer_id < 0) { posixtimer_free_timer(new_timer); return new_timer_id; } - new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; new_timer->it_overrun = -1LL; if (event) { - rcu_read_lock(); - new_timer->it_pid = get_pid(good_sigevent(event)); - rcu_read_unlock(); + scoped_guard (rcu) + new_timer->it_pid = get_pid(good_sigevent(event)); if (!new_timer->it_pid) { error = -EINVAL; goto out; @@ -434,7 +510,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; new_timer->sigq.info.si_signo = SIGALRM; - memset(&new_timer->sigq.info.si_value, 0, sizeof(sigval_t)); new_timer->sigq.info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } @@ -452,8 +527,8 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, goto out; } /* - * After succesful copy out, the timer ID is visible to user space - * now but not yet valid because new_timer::signal is still NULL. 
+ * After successful copy out, the timer ID is visible to user space + * now but not yet valid because new_timer::signal low order bit is 1. * * Complete the initialization with the clock specific create * callback. @@ -462,14 +537,25 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (error) goto out; - spin_lock_irq(&current->sighand->siglock); - /* This makes the timer valid in the hash table */ - WRITE_ONCE(new_timer->it_signal, current->signal); - hlist_add_head(&new_timer->list, &current->signal->posix_timers); - spin_unlock_irq(&current->sighand->siglock); /* - * After unlocking sighand::siglock @new_timer is subject to - * concurrent removal and cannot be touched anymore + * timer::it_lock ensures that __lock_timer() observes a fully + * initialized timer when it observes a valid timer::it_signal. + * + * sighand::siglock is required to protect signal::posix_timers. + */ + scoped_guard (spinlock_irq, &new_timer->it_lock) { + guard(spinlock)(&current->sighand->siglock); + /* + * new_timer::it_signal contains the signal pointer with + * bit 0 set, which makes it invalid for syscall operations. + * Store the unmodified signal pointer to make it valid. + */ + WRITE_ONCE(new_timer->it_signal, current->signal); + hlist_add_head_rcu(&new_timer->list, &current->signal->posix_timers); + } + /* + * After unlocking @new_timer is subject to concurrent removal and + * cannot be touched anymore */ return 0; out: @@ -507,7 +593,7 @@ COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, } #endif -static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +static struct k_itimer *lock_timer(timer_t timer_id) { struct k_itimer *timr; @@ -522,11 +608,11 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * The hash lookup and the timers are RCU protected. * * Timers are added to the hash in invalid state where - * timr::it_signal == NULL. timer::it_signal is only set after the - * rest of the initialization succeeded.
+ * timr::it_signal is marked invalid. timer::it_signal is only set + * after the rest of the initialization succeeded. * * Timer destruction happens in steps: - * 1) Set timr::it_signal to NULL with timr::it_lock held + * 1) Set timr::it_signal marked invalid with timr::it_lock held * 2) Release timr::it_lock * 3) Remove from the hash under hash_lock * 4) Put the reference count. @@ -543,25 +629,21 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * * The lookup validates locklessly that timr::it_signal == * current::it_signal and timr::it_id == @timer_id. timr::it_id - * can't change, but timr::it_signal becomes NULL during - * destruction. + * can't change, but timr::it_signal can become invalid during + * destruction, which makes the locked check fail. */ - rcu_read_lock(); + guard(rcu)(); timr = posix_timer_by_id(timer_id); if (timr) { - spin_lock_irqsave(&timr->it_lock, *flags); + spin_lock_irq(&timr->it_lock); /* * Validate under timr::it_lock that timr::it_signal is * still valid. Pairs with #1 above. 
*/ - if (timr->it_signal == current->signal) { - rcu_read_unlock(); + if (timr->it_signal == current->signal) return timr; - } - spin_unlock_irqrestore(&timr->it_lock, *flags); + spin_unlock_irq(&timr->it_lock); } - rcu_read_unlock(); - return NULL; } @@ -652,24 +734,10 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int ret = 0; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - memset(setting, 0, sizeof(*setting)); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_get)) - ret = -EINVAL; - else - kc->timer_get(timr, setting); - - unlock_timer(timr, flags); - return ret; + scoped_timer_get_or_fail(timer_id) + scoped_timer->kclock->timer_get(scoped_timer, setting); + return 0; } /* Get the time remaining on a POSIX.1b interval timer. */ @@ -723,18 +791,8 @@ SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { - struct k_itimer *timr; - unsigned long flags; - int overrun; - - timr = lock_timer(timer_id, &flags); - if (!timr) - return -EINVAL; - - overrun = timer_overrun_to_int(timr); - unlock_timer(timr, flags); - - return overrun; + scoped_timer_get_or_fail(timer_id) + return timer_overrun_to_int(scoped_timer); } static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, @@ -747,7 +805,7 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, /* * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the - * hood. See hrtimer_init(). Update timr->kclock, so the generic + * hood. See hrtimer_setup(). Update timr->kclock, so the generic * functions which use timr->kclock->clock_get_*() work. 
* * Note: it_clock stays unmodified, because the next timer_set() might @@ -756,11 +814,10 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, if (timr->it_clock == CLOCK_REALTIME) timr->kclock = absolute ? &clock_realtime : &clock_monotonic; - hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); - timr->it.real.timer.function = posix_timer_fn; + hrtimer_setup(&timr->it.real.timer, posix_timer_fn, timr->it_clock, mode); if (!absolute) - expires = ktime_add_safe(expires, timer->base->get_time()); + expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer)); hrtimer_set_expires(timer, expires); if (!sigev_none) @@ -791,26 +848,13 @@ static void common_timer_wait_running(struct k_itimer *timer) * when the task which tries to delete or disarm the timer has preempted * the task which runs the expiry in task work context. */ -static struct k_itimer *timer_wait_running(struct k_itimer *timer, - unsigned long *flags) +static void timer_wait_running(struct k_itimer *timer) { - const struct k_clock *kc = READ_ONCE(timer->kclock); - timer_t timer_id = READ_ONCE(timer->it_id); - - /* Prevent kfree(timer) after dropping the lock */ - rcu_read_lock(); - unlock_timer(timer, *flags); - /* * kc->timer_wait_running() might drop RCU lock. So @timer * cannot be touched anymore after the function returns! */ - if (!WARN_ON_ONCE(!kc->timer_wait_running)) - kc->timer_wait_running(timer); - - rcu_read_unlock(); - /* Relock the timer. It might be not longer hashed. 
*/ - return lock_timer(timer_id, flags); + timer->kclock->timer_wait_running(timer); } /* @@ -865,15 +909,9 @@ int common_timer_set(struct k_itimer *timr, int flags, return 0; } -static int do_timer_settime(timer_t timer_id, int tmr_flags, - struct itimerspec64 *new_spec64, +static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { - const struct k_clock *kc; - struct k_itimer *timr; - unsigned long flags; - int error; - if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) return -EINVAL; @@ -881,33 +919,28 @@ static int do_timer_settime(timer_t timer_id, int tmr_flags, if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); - timr = lock_timer(timer_id, &flags); -retry: - if (!timr) - return -EINVAL; + for (; ; old_spec64 = NULL) { + struct k_itimer *timr; - if (old_spec64) - old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); + scoped_timer_get_or_fail(timer_id) { + timr = scoped_timer; - /* Prevent signal delivery and rearming. */ - timr->it_signal_seq++; + if (old_spec64) + old_spec64->it_interval = ktime_to_timespec64(timr->it_interval); - kc = timr->kclock; - if (WARN_ON_ONCE(!kc || !kc->timer_set)) - error = -EINVAL; - else - error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); - - if (error == TIMER_RETRY) { - // We already got the old time... - old_spec64 = NULL; - /* Unlocks and relocks the timer if it still exists */ - timr = timer_wait_running(timr, &flags); - goto retry; - } - unlock_timer(timr, flags); + /* Prevent signal delivery and rearming. 
*/ + timr->it_signal_seq++; - return error; + int ret = timr->kclock->timer_set(timr, tmr_flags, new_spec64, old_spec64); + if (ret != TIMER_RETRY) + return ret; + + /* Protect the timer from being freed when leaving the lock scope */ + rcu_read_lock(); + } + timer_wait_running(timr); + rcu_read_unlock(); + } } /* Set a POSIX.1b interval timer */ @@ -978,110 +1011,58 @@ static inline void posix_timer_cleanup_ignored(struct k_itimer *tmr) } } -static inline int timer_delete_hook(struct k_itimer *timer) +static void posix_timer_delete(struct k_itimer *timer) { - const struct k_clock *kc = timer->kclock; - - /* Prevent signal delivery and rearming. */ + /* + * Invalidate the timer, remove it from the linked list and remove + * it from the ignored list if pending. + * + * The invalidation must be written with siglock held so that the + * signal code observes the invalidated timer::it_signal in + * do_sigaction(), which prevents it from moving a pending signal + * of a deleted timer to the ignore list. + * + * The invalidation also prevents signal queueing, signal delivery + * and therefore rearming from the signal delivery path. + * + * A concurrent lookup can still find the timer in the hash, but it + * will check timer::it_signal with timer::it_lock held and observe + * bit 0 set, which invalidates it. That also prevents the timer ID + * from being handed out before this timer is completely gone. 
+ */ timer->it_signal_seq++; - if (WARN_ON_ONCE(!kc || !kc->timer_del)) - return -EINVAL; - return kc->timer_del(timer); + scoped_guard (spinlock, &current->sighand->siglock) { + unsigned long sig = (unsigned long)timer->it_signal | 1UL; + + WRITE_ONCE(timer->it_signal, (struct signal_struct *)sig); + hlist_del_rcu(&timer->list); + posix_timer_cleanup_ignored(timer); + } + + while (timer->kclock->timer_del(timer) == TIMER_RETRY) { + guard(rcu)(); + spin_unlock_irq(&timer->it_lock); + timer_wait_running(timer); + spin_lock_irq(&timer->it_lock); + } } /* Delete a POSIX.1b interval timer. */ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { struct k_itimer *timer; - unsigned long flags; - - timer = lock_timer(timer_id, &flags); -retry_delete: - if (!timer) - return -EINVAL; - - if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { - /* Unlocks and relocks the timer if it still exists */ - timer = timer_wait_running(timer, &flags); - goto retry_delete; + scoped_timer_get_or_fail(timer_id) { + timer = scoped_timer; + posix_timer_delete(timer); } - - spin_lock(&current->sighand->siglock); - hlist_del(&timer->list); - posix_timer_cleanup_ignored(timer); - /* - * A concurrent lookup could check timer::it_signal lockless. It - * will reevaluate with timer::it_lock held and observe the NULL. - * - * It must be written with siglock held so that the signal code - * observes timer->it_signal == NULL in do_sigaction(SIG_IGN), - * which prevents it from moving a pending signal of a deleted - * timer to the ignore list. - */ - WRITE_ONCE(timer->it_signal, NULL); - spin_unlock(&current->sighand->siglock); - - unlock_timer(timer, flags); + /* Remove it from the hash, which frees up the timer ID */ posix_timer_unhash_and_free(timer); return 0; } /* - * Delete a timer if it is armed, remove it from the hash and schedule it - * for RCU freeing. - */ -static void itimer_delete(struct k_itimer *timer) -{ - unsigned long flags; - - /* - * irqsave is required to make timer_wait_running() work.
- */ - spin_lock_irqsave(&timer->it_lock, flags); - -retry_delete: - /* - * Even if the timer is not longer accessible from other tasks - * it still might be armed and queued in the underlying timer - * mechanism. Worse, that timer mechanism might run the expiry - * function concurrently. - */ - if (timer_delete_hook(timer) == TIMER_RETRY) { - /* - * Timer is expired concurrently, prevent livelocks - * and pointless spinning on RT. - * - * timer_wait_running() drops timer::it_lock, which opens - * the possibility for another task to delete the timer. - * - * That's not possible here because this is invoked from - * do_exit() only for the last thread of the thread group. - * So no other task can access and delete that timer. - */ - if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) - return; - - goto retry_delete; - } - hlist_del(&timer->list); - - posix_timer_cleanup_ignored(timer); - - /* - * Setting timer::it_signal to NULL is technically not required - * here as nothing can access the timer anymore legitimately via - * the hash table. Set it to NULL nevertheless so that all deletion - * paths are consistent. - */ - WRITE_ONCE(timer->it_signal, NULL); - - spin_unlock_irqrestore(&timer->it_lock, flags); - posix_timer_unhash_and_free(timer); -} - -/* * Invoked from do_exit() when the last thread of a thread group exits. * At that point no other task can access the timers of the dying * task anymore. 
@@ -1089,21 +1070,29 @@ retry_delete: void exit_itimers(struct task_struct *tsk) { struct hlist_head timers; + struct hlist_node *next; + struct k_itimer *timer; + + /* Clear restore mode for exec() */ + tsk->signal->timer_create_restore_ids = 0; if (hlist_empty(&tsk->signal->posix_timers)) return; /* Protect against concurrent read via /proc/$PID/timers */ - spin_lock_irq(&tsk->sighand->siglock); - hlist_move_list(&tsk->signal->posix_timers, &timers); - spin_unlock_irq(&tsk->sighand->siglock); + scoped_guard (spinlock_irq, &tsk->sighand->siglock) + hlist_move_list(&tsk->signal->posix_timers, &timers); /* The timers are not longer accessible via tsk::signal */ - while (!hlist_empty(&timers)) - itimer_delete(hlist_entry(timers.first, struct k_itimer, list)); + hlist_for_each_entry_safe(timer, next, &timers, list) { + scoped_guard (spinlock_irq, &timer->it_lock) + posix_timer_delete(timer); + posix_timer_unhash_and_free(timer); + cond_resched(); + } /* - * There should be no timers on the ignored list. itimer_delete() has + * There should be no timers on the ignored list. posix_timer_delete() has * mopped them up. */ if (!WARN_ON_ONCE(!hlist_empty(&tsk->signal->ignored_posix_timers))) @@ -1246,7 +1235,7 @@ SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, * sys_clock_settime(). The kernel internal timekeeping is always using * nanoseconds precision independent of the clocksource device which is * used to read the time from. The resolution of that device only - * affects the presicion of the time returned by sys_clock_gettime(). + * affects the precision of the time returned by sys_clock_gettime(). * * Returns: * 0 Success. @tp contains the resolution @@ -1529,6 +1518,9 @@ static const struct k_clock * const posix_clocks[] = { [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, +#ifdef CONFIG_POSIX_AUX_CLOCKS + [CLOCK_AUX ... 
CLOCK_AUX_LAST] = &clock_aux, +#endif }; static const struct k_clock *clockid_to_kclock(const clockid_t id) @@ -1545,3 +1537,31 @@ static const struct k_clock *clockid_to_kclock(const clockid_t id) return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } + +static int __init posixtimer_init(void) +{ + unsigned long i, size; + unsigned int shift; + + posix_timers_cache = kmem_cache_create("posix_timers_cache", + sizeof(struct k_itimer), + __alignof__(struct k_itimer), + SLAB_ACCOUNT, NULL); + + if (IS_ENABLED(CONFIG_BASE_SMALL)) + size = 512; + else + size = roundup_pow_of_two(512 * num_possible_cpus()); + + timer_buckets = alloc_large_system_hash("posixtimers", sizeof(*timer_buckets), + size, 0, 0, &shift, NULL, size, size); + size = 1UL << shift; + timer_hashmask = size - 1; + + for (i = 0; i < size; i++) { + spin_lock_init(&timer_buckets[i].lock); + INIT_HLIST_HEAD(&timer_buckets[i].head); + } + return 0; +} +core_initcall(posixtimer_init); diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 61906f0688c1..7f259e845d24 100644 --- a/kernel/time/posix-timers.h +++ b/kernel/time/posix-timers.h @@ -41,6 +41,7 @@ extern const struct k_clock clock_posix_dynamic; extern const struct k_clock clock_process; extern const struct k_clock clock_thread; extern const struct k_clock alarm_clock; +extern const struct k_clock clock_aux; void posix_timer_queue_signal(struct k_itimer *timr); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index fcca4e72f1ef..f3aaef695b8c 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -174,8 +174,7 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) return HRTIMER_RESTART; } -void __init -sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) +void sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; @@ -216,7 +215,7 @@ sched_clock_register(u64 
(*read)(void), int bits, unsigned long rate) update_clock_read_data(&rd); - if (sched_clock_timer.function != NULL) { + if (ACCESS_PRIVATE(&sched_clock_timer, function) != NULL) { /* update timeout for clock wrap */ hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); @@ -247,6 +246,7 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) pr_debug("Registered %pS as sched_clock source\n", read); } +EXPORT_SYMBOL_GPL(sched_clock_register); void __init generic_sched_clock_init(void) { @@ -263,8 +263,7 @@ void __init generic_sched_clock_init(void) * Start the timer to keep sched_clock() properly updated and * sets the initial epoch. */ - hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); - sched_clock_timer.function = sched_clock_poll; + hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD); } @@ -297,6 +296,11 @@ int sched_clock_suspend(void) return 0; } +static int sched_clock_syscore_suspend(void *data) +{ + return sched_clock_suspend(); +} + void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data[0]; @@ -306,14 +310,23 @@ void sched_clock_resume(void) rd->read_sched_clock = cd.actual_read_sched_clock; } -static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, +static void sched_clock_syscore_resume(void *data) +{ + sched_clock_resume(); +} + +static const struct syscore_ops sched_clock_syscore_ops = { + .suspend = sched_clock_syscore_suspend, + .resume = sched_clock_syscore_resume, +}; + +static struct syscore sched_clock_syscore = { + .ops = &sched_clock_syscore_ops, }; static int __init sched_clock_syscore_init(void) { - register_syscore_ops(&sched_clock_ops); + register_syscore(&sched_clock_syscore); return 0; } diff --git a/kernel/time/sleep_timeout.c b/kernel/time/sleep_timeout.c index dfe939f6e4ec..3c90574bd904 100644 --- 
a/kernel/time/sleep_timeout.c +++ b/kernel/time/sleep_timeout.c @@ -22,7 +22,7 @@ struct process_timer { static void process_timeout(struct timer_list *t) { - struct process_timer *timeout = from_timer(timeout, t, timer); + struct process_timer *timeout = timer_container_of(timeout, t, timer); wake_up_process(timeout->task); } @@ -97,10 +97,10 @@ signed long __sched schedule_timeout(signed long timeout) timer.timer.expires = expire; add_timer(&timer.timer); schedule(); - del_timer_sync(&timer.timer); + timer_delete_sync(&timer.timer); /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); + timer_destroy_on_stack(&timer.timer); timeout = expire - jiffies; diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index e28f9210f8a1..51f6a1032c83 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -78,7 +78,6 @@ static struct clock_event_device ce_broadcast_hrtimer = { .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | - CLOCK_EVT_FEAT_KTIME | CLOCK_EVT_FEAT_HRTIMER, .rating = 0, .bound_on = -1, @@ -100,7 +99,6 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) void tick_setup_hrtimer_broadcast(void) { - hrtimer_init(&bctimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - bctimer.function = bc_handler; + hrtimer_setup(&bctimer, bc_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); clockevents_register_device(&ce_broadcast_hrtimer); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0207868c8b4d..115e0bf01276 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -3,7 +3,7 @@ * This file contains functions which emulate a local clock-event * device via a broadcast event source. 
* - * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ @@ -76,8 +76,10 @@ const struct clock_event_device *tick_get_wakeup_device(int cpu) */ static void tick_broadcast_start_periodic(struct clock_event_device *bc) { - if (bc) + if (bc) { + bc->next_event_forced = 0; tick_setup_periodic(bc, 1); + } } /* @@ -106,6 +108,7 @@ static struct clock_event_device *tick_get_oneshot_wakeup_device(int cpu) static void tick_oneshot_wakeup_handler(struct clock_event_device *wd) { + wd->next_event_forced = 0; /* * If we woke up early and the tick was reprogrammed in the * meantime then this may be spurious but harmless. @@ -403,6 +406,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) bool bc_local; raw_spin_lock(&tick_broadcast_lock); + tick_broadcast_device.evtdev->next_event_forced = 0; /* Handle spurious interrupts gracefully */ if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) { @@ -696,6 +700,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) raw_spin_lock(&tick_broadcast_lock); dev->next_event = KTIME_MAX; + tick_broadcast_device.evtdev->next_event_forced = 0; next_event = KTIME_MAX; cpumask_clear(tmpmask); now = ktime_get(); @@ -1063,6 +1068,7 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bc->event_handler = tick_handle_oneshot_broadcast; + bc->next_event_forced = 0; bc->next_event = KTIME_MAX; /* @@ -1175,6 +1181,7 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) } /* This moves the broadcast assignment to this CPU: */ + bc->next_event_forced = 0; clockevents_program_event(bc, bc->next_event, 1); } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a47bcf71defc..6a9198a4279b 100644 --- 
a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -3,7 +3,7 @@ * This file contains the base functions to manage periodic tick * related events. * - * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ @@ -110,6 +110,7 @@ void tick_handle_periodic(struct clock_event_device *dev) int cpu = smp_processor_id(); ktime_t next = dev->next_event; + dev->next_event_forced = 0; tick_periodic(cpu); /* @@ -411,24 +412,18 @@ int tick_cpu_dying(unsigned int dying_cpu) } /* - * Shutdown an event device on a given cpu: + * Shutdown an event device on the outgoing CPU: * - * This is called on a life CPU, when a CPU is dead. So we cannot - * access the hardware device itself. - * We just set the mode and remove it from the lists. + * Called by the dying CPU during teardown, with clockevents_lock held + * and interrupts disabled. */ -void tick_shutdown(unsigned int cpu) +void tick_shutdown(void) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); struct clock_event_device *dev = td->evtdev; td->mode = TICKDEV_MODE_PERIODIC; if (dev) { - /* - * Prevent that the clock events layer tries to call - * the set mode function! 
- */ - clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; @@ -509,6 +504,7 @@ void tick_resume(void) #ifdef CONFIG_SUSPEND static DEFINE_RAW_SPINLOCK(tick_freeze_lock); +static DEFINE_WAIT_OVERRIDE_MAP(tick_freeze_map, LD_WAIT_SLEEP); static unsigned int tick_freeze_depth; /** @@ -528,9 +524,22 @@ void tick_freeze(void) if (tick_freeze_depth == num_online_cpus()) { trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), true); + /* + * All other CPUs have their interrupts disabled and are + * suspended to idle. Other tasks have been frozen so there + * is no scheduling happening. This means that there is no + * concurrency in the system at this point. Therefore it is + * okay to acquire a sleeping lock on PREEMPT_RT, such as a + * spinlock, because the lock cannot be held by other CPUs + * or threads and acquiring it cannot block. + * + * Inform lockdep about the situation. + */ + lock_map_acquire_try(&tick_freeze_map); system_state = SYSTEM_SUSPEND; sched_clock_suspend(); timekeeping_suspend(); + lock_map_release(&tick_freeze_map); } else { tick_suspend_local(); } @@ -552,8 +561,16 @@ void tick_unfreeze(void) raw_spin_lock(&tick_freeze_lock); if (tick_freeze_depth == num_online_cpus()) { + /* + * Similar to tick_freeze(). On resumption the first CPU may + * acquire uncontended sleeping locks while other CPUs block on + * tick_freeze_lock. 
+ */ + lock_map_acquire_try(&tick_freeze_map); timekeeping_resume(); sched_clock_resume(); + lock_map_release(&tick_freeze_map); + system_state = SYSTEM_RUNNING; trace_suspend_resume(TPS("timekeeping_freeze"), smp_processor_id(), false); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index faac36de35b9..597d816d22e8 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -26,7 +26,7 @@ extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); extern void tick_offline_cpu(unsigned int cpu); -extern void tick_shutdown(unsigned int cpu); +extern void tick_shutdown(void); extern void tick_suspend(void); extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, @@ -156,7 +156,6 @@ static inline void tick_nohz_init(void) { } #endif #ifdef CONFIG_NO_HZ_COMMON -extern unsigned long tick_nohz_active; extern void timers_update_nohz(void); extern u64 get_jiffies_update(unsigned long *basej); # ifdef CONFIG_SMP @@ -171,7 +170,6 @@ extern void timer_expire_remote(unsigned int cpu); # endif #else /* CONFIG_NO_HZ_COMMON */ static inline void timers_update_nohz(void) { } -#define tick_nohz_active (0) #endif DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 5e2c2c26b3cc..7472597f3225 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -3,7 +3,7 @@ * This file contains functions which manage high resolution tick * related events. 
* - * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ @@ -19,6 +19,10 @@ /** * tick_program_event - program the CPU local timer device for the next event + * @expires: the time at which the next timer event should occur + * @force: flag to force reprogramming even if the event time hasn't changed + * + * Return: 0 on success, negative error code on failure */ int tick_program_event(ktime_t expires, int force) { @@ -57,6 +61,13 @@ void tick_resume_oneshot(void) /** * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + * @newdev: Pointer to the clock event device to configure + * @handler: Function to be called when the event device triggers an interrupt + * @next_event: Initial expiry time for the next event (in ktime) + * + * Configures the specified clock event device for oneshot mode, + * assigns the given handler as its event callback, and programs + * the device to trigger at the specified next event time. */ void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), @@ -69,6 +80,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev, /** * tick_switch_to_oneshot - switch to oneshot mode + * @handler: function to call when an event occurs on the tick device + * + * Return: 0 on success, -EINVAL if the tick device is not present, + * not functional, or does not support oneshot mode. */ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) { @@ -101,7 +116,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) /** * tick_oneshot_mode_active - check whether the system is in oneshot mode * - * returns 1 when either nohz or highres are enabled. otherwise 0. + * Return: 1 when either nohz or highres are enabled, otherwise 0.
*/ int tick_oneshot_mode_active(void) { @@ -120,6 +135,9 @@ int tick_oneshot_mode_active(void) * tick_init_highres - switch to high resolution mode * * Called with interrupts disabled. + * + * Return: 0 on success, -EINVAL if the tick device cannot switch + * to oneshot/high-resolution mode. */ int tick_init_highres(void) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa058510af9c..cbbb87a0c6e7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> + * Copyright(C) 2005-2006, Linutronix GmbH, Thomas Gleixner <tglx@kernel.org> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * @@ -201,6 +201,27 @@ static inline void tick_sched_flag_clear(struct tick_sched *ts, ts->flags &= ~flag; } +/* + * Allow only one non-timekeeper CPU at a time update jiffies from + * the timer tick. + * + * Returns true if update was run. 
+ */ +static bool tick_limited_update_jiffies64(struct tick_sched *ts, ktime_t now) +{ + static atomic_t in_progress; + int inp; + + inp = atomic_read(&in_progress); + if (inp || !atomic_try_cmpxchg(&in_progress, &inp, 1)) + return false; + + if (ts->last_tick_jiffies == jiffies) + tick_do_update_jiffies64(now); + atomic_set(&in_progress, 0); + return true; +} + #define MAX_STALLED_JIFFIES 5 static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) @@ -239,10 +260,11 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now) ts->stalled_jiffies = 0; ts->last_tick_jiffies = READ_ONCE(jiffies); } else { - if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) { - tick_do_update_jiffies64(now); - ts->stalled_jiffies = 0; - ts->last_tick_jiffies = READ_ONCE(jiffies); + if (++ts->stalled_jiffies >= MAX_STALLED_JIFFIES) { + if (tick_limited_update_jiffies64(ts, now)) { + ts->stalled_jiffies = 0; + ts->last_tick_jiffies = READ_ONCE(jiffies); + } } } @@ -322,6 +344,9 @@ static bool check_tick_dependency(atomic_t *dep) { int val = atomic_read(dep); + if (likely(!tracepoint_enabled(tick_stop))) + return !!val; + if (val & TICK_DEP_MASK_POSIX_TIMER) { trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); return true; @@ -671,7 +696,7 @@ void __init tick_nohz_init(void) * NO HZ enabled ? 
*/ bool tick_nohz_enabled __read_mostly = true; -unsigned long tick_nohz_active __read_mostly; +static unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -682,6 +707,12 @@ static int __init setup_tick_nohz(char *str) __setup("nohz=", setup_tick_nohz); +bool tick_nohz_is_active(void) +{ + return tick_nohz_active; +} +EXPORT_SYMBOL_GPL(tick_nohz_is_active); + bool tick_nohz_tick_stopped(void) { struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); @@ -833,19 +864,32 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); +/* Simplified variant of hrtimer_forward_now() */ +static ktime_t tick_forward_now(ktime_t expires, ktime_t now) +{ + ktime_t delta = now - expires; + + if (likely(delta < TICK_NSEC)) + return expires + TICK_NSEC; + + expires += TICK_NSEC * ktime_divns(delta, TICK_NSEC); + if (expires > now) + return expires; + return expires + TICK_NSEC; +} + static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) { - hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->last_tick); + ktime_t expires = ts->last_tick; - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); + if (now >= expires) + expires = tick_forward_now(expires, now); if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) { - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED_HARD); + hrtimer_start(&ts->sched_timer, expires, HRTIMER_MODE_ABS_PINNED_HARD); } else { - tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); + hrtimer_set_expires(&ts->sched_timer, expires); + tick_program_event(expires, 1); } /* @@ -1152,16 +1196,15 @@ static bool report_idle_softirq(void) return false; } - if (ratelimit >= 10) - return false; - /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; - pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", - pending); - 
ratelimit++; + if (ratelimit < 10) { + pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", + pending); + ratelimit++; + } return true; } @@ -1483,6 +1526,7 @@ static void tick_nohz_lowres_handler(struct clock_event_device *dev) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); dev->next_event = KTIME_MAX; + dev->next_event_forced = 0; if (likely(tick_nohz_handler(&ts->sched_timer) == HRTIMER_RESTART)) tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); @@ -1573,12 +1617,10 @@ void tick_setup_sched_timer(bool hrtimer) struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); /* Emulate tick processing via per-CPU hrtimers: */ - hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); + hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD); - if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) { + if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer) tick_sched_flag_set(ts, TS_FLAG_HIGHRES); - ts->sched_timer.function = tick_nohz_handler; - } /* Get the next period (per-CPU) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); diff --git a/kernel/time/time.c b/kernel/time/time.c index 1b69caa87480..771cef87ad3b 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -207,7 +207,7 @@ SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv, get_user(new_ts.tv_nsec, &tv->tv_usec)) return -EFAULT; - if (new_ts.tv_nsec > USEC_PER_SEC || new_ts.tv_nsec < 0) + if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0) return -EINVAL; new_ts.tv_nsec *= NSEC_PER_USEC; @@ -365,20 +365,16 @@ SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp) } #endif +#if HZ > MSEC_PER_SEC || (MSEC_PER_SEC % HZ) /** * jiffies_to_msecs - Convert jiffies to milliseconds * @j: jiffies value * - * Avoid unnecessary multiplications/divisions in the - * two most common HZ cases. 
- * * Return: milliseconds value */ unsigned int jiffies_to_msecs(const unsigned long j) { -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - return (MSEC_PER_SEC / HZ) * j; -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) +#if HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); #else # if BITS_PER_LONG == 32 @@ -390,7 +386,9 @@ unsigned int jiffies_to_msecs(const unsigned long j) #endif } EXPORT_SYMBOL(jiffies_to_msecs); +#endif +#if (USEC_PER_SEC % HZ) /** * jiffies_to_usecs - Convert jiffies to microseconds * @j: jiffies value @@ -405,17 +403,14 @@ unsigned int jiffies_to_usecs(const unsigned long j) */ BUILD_BUG_ON(HZ > USEC_PER_SEC); -#if !(USEC_PER_SEC % HZ) - return (USEC_PER_SEC / HZ) * j; -#else -# if BITS_PER_LONG == 32 +#if BITS_PER_LONG == 32 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; -# else +#else return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; -# endif #endif } EXPORT_SYMBOL(jiffies_to_usecs); +#endif /** * mktime64 - Converts date to seconds. @@ -702,7 +697,7 @@ EXPORT_SYMBOL(clock_t_to_jiffies); * * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC) */ -u64 jiffies_64_to_clock_t(u64 x) +notrace u64 jiffies_64_to_clock_t(u64 x) { #if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 # if HZ < USER_HZ @@ -858,6 +853,7 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs, return res; } +EXPORT_SYMBOL_GPL(timespec64_add_safe); /** * get_timespec64 - get user's time value into kernel space diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c index 2889763165e5..1b99180da288 100644 --- a/kernel/time/time_test.c +++ b/kernel/time/time_test.c @@ -4,7 +4,9 @@ #include <linux/time.h> /* - * Traditional implementation of leap year evaluation. + * Traditional implementation of leap year evaluation, but note that long + * is a signed type and the tests do cover negative year values. So this + * can't use the is_leap_year() helper from rtc.h. 
*/ static bool is_leap(long year) { diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c index e6285288d765..2e64dbb6302d 100644 --- a/kernel/time/timecounter.c +++ b/kernel/time/timecounter.c @@ -6,7 +6,7 @@ #include <linux/timecounter.h> void timecounter_init(struct timecounter *tc, - const struct cyclecounter *cc, + struct cyclecounter *cc, u64 start_tstamp) { tc->cc = cc; @@ -62,38 +62,3 @@ u64 timecounter_read(struct timecounter *tc) } EXPORT_SYMBOL_GPL(timecounter_read); -/* - * This is like cyclecounter_cyc2ns(), but it is used for computing a - * time previous to the time stored in the cycle counter. - */ -static u64 cc_cyc2ns_backwards(const struct cyclecounter *cc, - u64 cycles, u64 mask, u64 frac) -{ - u64 ns = (u64) cycles; - - ns = ((ns * cc->mult) - frac) >> cc->shift; - - return ns; -} - -u64 timecounter_cyc2time(const struct timecounter *tc, - u64 cycle_tstamp) -{ - u64 delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask; - u64 nsec = tc->nsec, frac = tc->frac; - - /* - * Instead of always treating cycle_tstamp as more recent - * than tc->cycle_last, detect when it is too far in the - * future and treat it as old time stamp instead. - */ - if (delta > tc->cc->mask / 2) { - delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask; - nsec -= cc_cyc2ns_backwards(tc->cc, delta, tc->mask, frac); - } else { - nsec += cyclecounter_cyc2ns(tc->cc, delta, tc->mask, &frac); - } - - return nsec; -} -EXPORT_SYMBOL_GPL(timecounter_cyc2time); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1e67d076f195..c493a4010305 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -3,31 +3,30 @@ * Kernel timekeeping code and accessor functions. Based on code from * timer.c, moved in commit 8524070b7982. 
*/ -#include <linux/timekeeper_internal.h> +#include <linux/audit.h> +#include <linux/clocksource.h> +#include <linux/compiler.h> +#include <linux/jiffies.h> +#include <linux/kobject.h> #include <linux/module.h> -#include <linux/interrupt.h> -#include <linux/percpu.h> -#include <linux/init.h> -#include <linux/mm.h> #include <linux/nmi.h> -#include <linux/sched.h> -#include <linux/sched/loadavg.h> +#include <linux/pvclock_gtod.h> +#include <linux/random.h> #include <linux/sched/clock.h> +#include <linux/sched/loadavg.h> +#include <linux/static_key.h> +#include <linux/stop_machine.h> #include <linux/syscore_ops.h> -#include <linux/clocksource.h> -#include <linux/jiffies.h> +#include <linux/tick.h> #include <linux/time.h> #include <linux/timex.h> -#include <linux/tick.h> -#include <linux/stop_machine.h> -#include <linux/pvclock_gtod.h> -#include <linux/compiler.h> -#include <linux/audit.h> -#include <linux/random.h> +#include <linux/timekeeper_internal.h> + +#include <vdso/auxclock.h> #include "tick-internal.h" -#include "ntp_internal.h" #include "timekeeping_internal.h" +#include "ntp_internal.h" #define TK_CLEAR_NTP (1 << 0) #define TK_CLOCK_WAS_SET (1 << 1) @@ -53,7 +52,38 @@ struct tk_data { raw_spinlock_t lock; } ____cacheline_aligned; -static struct tk_data tk_core; +static struct tk_data timekeeper_data[TIMEKEEPERS_MAX]; + +/* The core timekeeper */ +#define tk_core (timekeeper_data[TIMEKEEPER_CORE]) + +#ifdef CONFIG_POSIX_AUX_CLOCKS +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return ktime_get_aux_ts64(CLOCK_AUX + tkid - TIMEKEEPER_AUX_FIRST, ts); +} + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return tk->id >= TIMEKEEPER_AUX_FIRST && tk->id <= TIMEKEEPER_AUX_LAST; +} +#else +static inline bool tk_get_aux_ts64(unsigned int tkid, struct timespec64 *ts) +{ + return false; +} + +static inline bool tk_is_aux(const struct timekeeper *tk) +{ + return false; +} +#endif + +static inline void 
tk_update_aux_offs(struct timekeeper *tk, ktime_t offs) +{ + tk->offs_aux = offs; + tk->monotonic_to_aux = ktime_to_timespec64(offs); +} /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -113,6 +143,16 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned = { .base[1] = FAST_TK_INIT, }; +#ifdef CONFIG_POSIX_AUX_CLOCKS +static __init void tk_aux_setup(void); +static void tk_aux_update_clocksource(void); +static void tk_aux_advance(void); +#else +static inline void tk_aux_setup(void) { } +static inline void tk_aux_update_clocksource(void) { } +static inline void tk_aux_advance(void) { } +#endif + unsigned long timekeeper_lock_irqsave(void) { unsigned long flags; @@ -164,10 +204,34 @@ static inline struct timespec64 tk_xtime(const struct timekeeper *tk) return ts; } +static inline struct timespec64 tk_xtime_coarse(const struct timekeeper *tk) +{ + struct timespec64 ts; + + ts.tv_sec = tk->xtime_sec; + ts.tv_nsec = tk->coarse_nsec; + return ts; +} + +/* + * Update the nanoseconds part for the coarse time keepers. They can't rely + * on xtime_nsec because xtime_nsec could be adjusted by a small negative + * amount when the multiplication factor of the clock is adjusted, which + * could cause the coarse clocks to go slightly backwards. See + * timekeeping_apply_adjustment(). Thus we keep a separate copy for the coarse + * clockids which only is updated when the clock has been set or we have + * accumulated time. 
+ */ +static inline void tk_update_coarse_nsecs(struct timekeeper *tk) +{ + tk->coarse_nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; +} + static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; + tk_update_coarse_nsecs(tk); } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) @@ -175,6 +239,7 @@ static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) tk->xtime_sec += ts->tv_sec; tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); + tk_update_coarse_nsecs(tk); } static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm) @@ -206,6 +271,11 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } +#ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE +#include <asm/clock_inlined.h> + +static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined); + /* * tk_clock_read - atomic clocksource read() helper * @@ -219,12 +289,35 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). 
*/ -static inline u64 tk_clock_read(const struct tk_read_base *tkr) +static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + if (static_branch_likely(&clocksource_read_inlined)) + return arch_inlined_clocksource_read(clock); + + return clock->read(clock); +} + +static inline void clocksource_disable_inline_read(void) +{ + static_branch_disable(&clocksource_read_inlined); +} + +static inline void clocksource_enable_inline_read(void) +{ + static_branch_enable(&clocksource_read_inlined); +} +#else +static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); return clock->read(clock); } +static inline void clocksource_disable_inline_read(void) { } +static inline void clocksource_enable_inline_read(void) { } +#endif /** * tk_setup_internals - Set up internals to use clocksource clock. @@ -298,6 +391,27 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; tk->skip_second_overflow = 0; + + tk->cs_id = clock->id; + + /* Coupled clockevent data */ + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) && + clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT) { + /* + * Aim for a one hour maximum delta and use KHz to handle + * clocksources with a frequency above 4GHz correctly as + * the frequency argument of clocks_calc_mult_shift() is u32. + */ + clocks_calc_mult_shift(&tk->cs_ns_to_cyc_mult, &tk->cs_ns_to_cyc_shift, + NSEC_PER_MSEC, clock->freq_khz, 3600 * 1000); + /* + * Initialize the conversion limit as the previous clocksource + * might have the same shift/mult pair so the quick check in + * tk_update_ns_to_cyc() fails to update it after a clocksource + * change leaving it effectively zero. + */ + tk->cs_ns_to_cyc_maxns = div_u64(clock->mask, tk->cs_ns_to_cyc_mult); + } } /* Timekeeper helper functions.
*/ @@ -306,7 +420,7 @@ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); } -static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) +static __always_inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; @@ -576,7 +690,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); */ static inline void tk_update_leap_state(struct timekeeper *tk) { - tk->next_leap_ktime = ntp_get_next_leap(); + tk->next_leap_ktime = ntp_get_next_leap(tk->id); if (tk->next_leap_ktime != KTIME_MAX) /* Convert to monotonic time */ tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); @@ -627,6 +741,36 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } +static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc) +{ + struct tk_read_base *tkrs = &tks->tkr_mono; + struct tk_read_base *tkrc = &tkc->tkr_mono; + unsigned int shift; + + if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) || + !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT)) + return; + + if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift) + return; + /* + * The conversion math is simple: + * + * CS::MULT (1 << NS_TO_CYC_SHIFT) + * --------------- = ---------------------- + * (1 << CS:SHIFT) NS_TO_CYC_MULT + * + * Ergo: + * + * NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT + * + * NS_TO_CYC_SHIFT has been set up in tk_setup_internals() + */ + shift = tkrs->shift + tks->cs_ns_to_cyc_shift; + tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult); + tks->cs_ns_to_cyc_maxns = div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult); +} + /* * Restore the shadow timekeeper from the real 
timekeeper. */ @@ -638,7 +782,7 @@ static void timekeeping_restore_shadow(struct tk_data *tkd) static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action) { - struct timekeeper *tk = &tk_core.shadow_timekeeper; + struct timekeeper *tk = &tkd->shadow_timekeeper; lockdep_assert_held(&tkd->lock); @@ -653,18 +797,23 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act if (action & TK_CLEAR_NTP) { tk->ntp_error = 0; - ntp_clear(); + ntp_clear(tk->id); } tk_update_leap_state(tk); tk_update_ktime_data(tk); + tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_vsyscall(tk); - update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + if (tk->id == TIMEKEEPER_CORE) { + tk_update_ns_to_cyc(tk, &tkd->timekeeper); + update_vsyscall(tk); + update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); - tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; - update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); - update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + } else if (tk_is_aux(tk)) { + vdso_time_update_aux(tk); + } if (action & TK_CLOCK_WAS_SET) tk->clock_was_set_seq++; @@ -708,6 +857,72 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_normalize_xtime(tk); delta -= incr; } + tk_update_coarse_nsecs(tk); +} + +/* + * ktime_expiry_to_cycles - Convert an expiry time to clocksource cycles + * @id: Clocksource ID which is required for validity + * @expires_ns: Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted + * @cycles: Pointer to storage for corresponding absolute cycles value + * + * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value + * based on the correlated clocksource of the clockevent device by using + * the base nanoseconds and cycles values of the last timekeeper update and + * converting the delta between @expires_ns and base nanoseconds to
cycles. + * + * This only works for clockevent devices which are using a less than or + * equal comparator against the clocksource. + * + * Utilizing this avoids two clocksource reads for such devices, the + * ktime_get() in clockevents_program_event() to calculate the delta expiry + * value and the readout in the device::set_next_event() callback to + * convert the delta back to an absolute comparator value. + * + * Returns: True if @id matches the current clocksource ID, false otherwise + */ +bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct tk_read_base *tkrm = &tk->tkr_mono; + ktime_t base_ns, delta_ns, max_ns; + u64 base_cycles, delta_cycles; + unsigned int seq; + u32 mult, shift; + + /* + * Racy check to avoid the seqcount overhead when ID does not match. If + * the relevant clocksource is installed concurrently, then this will + * just delay the switch over to this mechanism until the next event is + * programmed. If the ID is not matching the clock events code will use + * the regular relative set_next_event() callback as before.
+ */ + if (data_race(tk->cs_id) != id) + return false; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + if (tk->cs_id != id) + return false; + + base_cycles = tkrm->cycle_last; + base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift); + + mult = tk->cs_ns_to_cyc_mult; + shift = tk->cs_ns_to_cyc_shift; + max_ns = tk->cs_ns_to_cyc_maxns; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + /* Prevent negative deltas and multiplication overflows */ + delta_ns = min(expires_ns - base_ns, max_ns); + delta_ns = max(delta_ns, 0); + + /* Convert to cycles */ + delta_cycles = ((u64)delta_ns * mult) >> shift; + *cycles = base_cycles + delta_cycles; + return true; } /** @@ -774,7 +989,7 @@ u32 ktime_get_resolution_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); -static ktime_t *offsets[TK_OFFS_MAX] = { +static const ktime_t *const offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, [TK_OFFS_TAI] = &tk_core.timekeeper.offs_tai, @@ -783,8 +998,9 @@ static ktime_t *offsets[TK_OFFS_MAX] = { ktime_t ktime_get_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; + const ktime_t *offset = offsets[offs]; unsigned int seq; - ktime_t base, *offset = offsets[offs]; + ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -804,8 +1020,9 @@ EXPORT_SYMBOL_GPL(ktime_get_with_offset); ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) { struct timekeeper *tk = &tk_core.timekeeper; + const ktime_t *offset = offsets[offs]; unsigned int seq; - ktime_t base, *offset = offsets[offs]; + ktime_t base; u64 nsecs; WARN_ON(timekeeping_suspended); @@ -813,7 +1030,7 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); base = ktime_add(tk->tkr_mono.base, *offset); - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsecs = tk->coarse_nsec; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -828,7 +1045,7 @@ 
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); */ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs) { - ktime_t *offset = offsets[offs]; + const ktime_t *offset = offsets[offs]; unsigned int seq; ktime_t tconv; @@ -949,9 +1166,14 @@ time64_t ktime_get_real_seconds(void) EXPORT_SYMBOL_GPL(ktime_get_real_seconds); /** - * __ktime_get_real_seconds - The same as ktime_get_real_seconds - * but without the sequence counter protect. This internal function - * is called just when timekeeping lock is already held. + * __ktime_get_real_seconds - Unprotected access to CLOCK_REALTIME seconds + * + * The same as ktime_get_real_seconds() but without the sequence counter + * protection. This function is used in restricted contexts like the x86 MCE + * handler and in KGDB. It's unprotected on 32-bit vs. concurrent half + * completed modification and only to be used for such critical contexts. + * + * Returns: Racy snapshot of the CLOCK_REALTIME seconds value */ noinstr time64_t __ktime_get_real_seconds(void) { @@ -1230,7 +1452,7 @@ int get_device_system_crosststamp(int (*get_time_fn) struct system_time_snapshot *history_begin, struct system_device_crosststamp *xtstamp) { - struct system_counterval_t system_counterval; + struct system_counterval_t system_counterval = {}; struct timekeeper *tk = &tk_core.timekeeper; u64 cycles, now, interval_start; unsigned int clock_was_set_seq = 0; @@ -1386,41 +1608,73 @@ int do_settimeofday64(const struct timespec64 *ts) } EXPORT_SYMBOL(do_settimeofday64); +static inline bool timekeeper_is_core_tk(struct timekeeper *tk) +{ + return !IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS) || tk->id == TIMEKEEPER_CORE; +} + /** - * timekeeping_inject_offset - Adds or subtracts from the current time. + * __timekeeping_inject_offset - Adds or subtracts from the current time. + * @tkd: Pointer to the timekeeper to modify * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. 
*/ -static int timekeeping_inject_offset(const struct timespec64 *ts) +static int __timekeeping_inject_offset(struct tk_data *tkd, const struct timespec64 *ts) { + struct timekeeper *tks = &tkd->shadow_timekeeper; + struct timespec64 tmp; + if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; - struct timespec64 tmp; - - timekeeping_forward_now(tks); + timekeeping_forward_now(tks); + if (timekeeper_is_core_tk(tks)) { /* Make sure the proposed value is valid */ tmp = timespec64_add(tk_xtime(tks), *ts); if (timespec64_compare(&tks->wall_to_monotonic, ts) > 0 || !timespec64_valid_settod(&tmp)) { - timekeeping_restore_shadow(&tk_core); + timekeeping_restore_shadow(tkd); return -EINVAL; } tk_xtime_add(tks, ts); tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, *ts)); - timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); + } else { + struct tk_read_base *tkr_mono = &tks->tkr_mono; + ktime_t now, offs; + + /* Get the current time */ + now = ktime_add_ns(tkr_mono->base, timekeeping_get_ns(tkr_mono)); + /* Add the relative offset change */ + offs = ktime_add(tks->offs_aux, timespec64_to_ktime(*ts)); + + /* Prevent that the resulting time becomes negative */ + if (ktime_add(now, offs) < 0) { + timekeeping_restore_shadow(tkd); + return -EINVAL; + } + tk_update_aux_offs(tks, offs); } - /* Signal hrtimers about time change */ - clock_was_set(CLOCK_SET_WALL); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); return 0; } +static int timekeeping_inject_offset(const struct timespec64 *ts) +{ + int ret; + + scoped_guard (raw_spinlock_irqsave, &tk_core.lock) + ret = __timekeeping_inject_offset(&tk_core, ts); + + /* Signal hrtimers about time change */ + if (!ret) + clock_was_set(CLOCK_SET_WALL); + return ret; +} + /* * Indicates if there is an offset between the system clock and the hardware * clock/persistent clock/rtc. 
@@ -1496,6 +1750,8 @@ static int change_clocksource(void *data) timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL); } + tk_aux_update_clocksource(); + if (old) { if (old->disable) old->disable(old); @@ -1518,7 +1774,19 @@ int timekeeping_notify(struct clocksource *clock) if (tk->tkr_mono.clock == clock) return 0; + + /* Disable inlined reads across the clocksource switch */ + clocksource_disable_inline_read(); + stop_machine(change_clocksource, clock, NULL); + + /* + * If the clocksource has been selected and supports inlined reads + * enable the branch. + */ + if (tk->tkr_mono.clock == clock && clock->flags & CLOCK_SOURCE_CAN_INLINE_READ) + clocksource_enable_inline_read(); + tick_clock_notify(); return tk->tkr_mono.clock == clock ? 0 : -1; } @@ -1547,6 +1815,39 @@ void ktime_get_raw_ts64(struct timespec64 *ts) } EXPORT_SYMBOL(ktime_get_raw_ts64); +/** + * ktime_get_clock_ts64 - Returns time of a clock in a timespec + * @id: POSIX clock ID of the clock to read + * @ts: Pointer to the timespec64 to be set + * + * The timestamp is invalidated (@ts->tv_sec is set to -1) if the + * clock @id is not available. + */ +void ktime_get_clock_ts64(clockid_t id, struct timespec64 *ts) +{ + /* Invalidate time stamp */ + ts->tv_sec = -1; + ts->tv_nsec = 0; + + switch (id) { + case CLOCK_REALTIME: + ktime_get_real_ts64(ts); + return; + case CLOCK_MONOTONIC: + ktime_get_ts64(ts); + return; + case CLOCK_MONOTONIC_RAW: + ktime_get_raw_ts64(ts); + return; + case CLOCK_AUX ... 
CLOCK_AUX_LAST: + if (IS_ENABLED(CONFIG_POSIX_AUX_CLOCKS)) + ktime_get_aux_ts64(id, ts); + return; + default: + WARN_ON_ONCE(1); + } +} +EXPORT_SYMBOL_GPL(ktime_get_clock_ts64); /** * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres @@ -1623,10 +1924,12 @@ read_persistent_wall_and_boot_offset(struct timespec64 *wall_time, *boot_offset = ns_to_timespec64(local_clock()); } -static __init void tkd_basic_setup(struct tk_data *tkd) +static __init void tkd_basic_setup(struct tk_data *tkd, enum timekeeper_ids tk_id, bool valid) { raw_spin_lock_init(&tkd->lock); seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock); + tkd->timekeeper.id = tkd->shadow_timekeeper.id = tk_id; + tkd->timekeeper.clock_valid = tkd->shadow_timekeeper.clock_valid = valid; } /* @@ -1656,7 +1959,8 @@ void __init timekeeping_init(void) struct timekeeper *tks = &tk_core.shadow_timekeeper; struct clocksource *clock; - tkd_basic_setup(&tk_core); + tkd_basic_setup(&tk_core, TIMEKEEPER_CORE, true); + tk_aux_setup(); read_persistent_wall_and_boot_offset(&wall_time, &boot_offset); if (timespec64_valid_settod(&wall_time) && @@ -1845,6 +2149,11 @@ void timekeeping_resume(void) timerfd_resume(); } +static void timekeeping_syscore_resume(void *data) +{ + timekeeping_resume(); +} + int timekeeping_suspend(void) { struct timekeeper *tks = &tk_core.shadow_timekeeper; @@ -1912,15 +2221,24 @@ int timekeeping_suspend(void) return 0; } +static int timekeeping_syscore_suspend(void *data) +{ + return timekeeping_suspend(); +} + /* sysfs resume/suspend bits for timekeeping */ -static struct syscore_ops timekeeping_syscore_ops = { - .resume = timekeeping_resume, - .suspend = timekeeping_suspend, +static const struct syscore_ops timekeeping_syscore_ops = { + .resume = timekeeping_syscore_resume, + .suspend = timekeeping_syscore_suspend, +}; + +static struct syscore timekeeping_syscore = { + .ops = &timekeeping_syscore_ops, }; static int __init timekeeping_init_ops(void) { - 
register_syscore_ops(&timekeeping_syscore_ops); + register_syscore(&timekeeping_syscore); return 0; } device_initcall(timekeeping_init_ops); @@ -2008,7 +2326,7 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, */ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) { - u64 ntp_tl = ntp_tick_length(); + u64 ntp_tl = ntp_tick_length(tk->id); u32 mult; /* @@ -2089,7 +2407,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) } /* Figure out if its a leap sec and apply if needed */ - leap = second_overflow(tk->xtime_sec); + leap = second_overflow(tk->id, tk->xtime_sec); if (unlikely(leap)) { struct timespec64 ts; @@ -2155,15 +2473,13 @@ static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset, * timekeeping_advance - Updates the timekeeper to the current time and * current NTP tick length */ -static bool timekeeping_advance(enum timekeeping_adv_mode mode) +static bool __timekeeping_advance(struct tk_data *tkd, enum timekeeping_adv_mode mode) { - struct timekeeper *tk = &tk_core.shadow_timekeeper; - struct timekeeper *real_tk = &tk_core.timekeeper; + struct timekeeper *tk = &tkd->shadow_timekeeper; + struct timekeeper *real_tk = &tkd->timekeeper; unsigned int clock_set = 0; int shift = 0, maxshift; - u64 offset; - - guard(raw_spinlock_irqsave)(&tk_core.lock); + u64 offset, orig_offset; /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -2172,7 +2488,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask, tk->tkr_mono.clock->max_raw_delta); - + orig_offset = offset; /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) return false; @@ -2188,7 +2504,7 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) shift = ilog2(offset) - ilog2(tk->cycle_interval); shift = max(0, shift); /* Bound 
shift to one less than what overflows tick_length */ - maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; + maxshift = (64 - (ilog2(ntp_tick_length(tk->id)) + 1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { offset = logarithmic_accumulation(tk, offset, shift, &clock_set); @@ -2205,19 +2521,35 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) */ clock_set |= accumulate_nsecs_to_secs(tk); - timekeeping_update_from_shadow(&tk_core, clock_set); + /* + * To avoid inconsistencies caused by adjtimex TK_ADV_FREQ calls + * making small negative adjustments to the base xtime_nsec + * value, only update the coarse clocks if we accumulated time + */ + if (orig_offset != offset) + tk_update_coarse_nsecs(tk); + + timekeeping_update_from_shadow(tkd, clock_set); return !!clock_set; } +static bool timekeeping_advance(enum timekeeping_adv_mode mode) +{ + guard(raw_spinlock_irqsave)(&tk_core.lock); + return __timekeeping_advance(&tk_core, mode); +} + /** * update_wall_time - Uses the current clocksource to increment the wall time * + * It also updates the enabled auxiliary clock timekeepers */ void update_wall_time(void) { if (timekeeping_advance(TK_ADV_TICK)) clock_was_set_delayed(); + tk_aux_advance(); } /** @@ -2248,7 +2580,7 @@ void ktime_get_coarse_real_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); } while (read_seqcount_retry(&tk_core.seq, seq)); } EXPORT_SYMBOL(ktime_get_coarse_real_ts64); @@ -2271,7 +2603,7 @@ void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - *ts = tk_xtime(tk); + *ts = tk_xtime_coarse(tk); offset = tk_core.timekeeper.offs_real; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -2350,12 +2682,12 @@ void ktime_get_coarse_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - now = tk_xtime(tk); + now = tk_xtime_coarse(tk); mono = tk->wall_to_monotonic; } while 
(read_seqcount_retry(&tk_core.seq, seq)); set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec, - now.tv_nsec + mono.tv_nsec); + now.tv_nsec + mono.tv_nsec); } EXPORT_SYMBOL(ktime_get_coarse_ts64); @@ -2415,7 +2747,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, /* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ -static int timekeeping_validate_timex(const struct __kernel_timex *txc) +static int timekeeping_validate_timex(const struct __kernel_timex *txc, bool aux_clock) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ @@ -2474,6 +2806,22 @@ static int timekeeping_validate_timex(const struct __kernel_timex *txc) return -EINVAL; } + if (aux_clock) { + /* Auxiliary clocks are similar to TAI and do not have leap seconds */ + if (txc->modes & ADJ_STATUS && + txc->status & (STA_INS | STA_DEL)) + return -EINVAL; + + /* No TAI offset setting */ + if (txc->modes & ADJ_TAI) + return -EINVAL; + + /* No PPS support either */ + if (txc->modes & ADJ_STATUS && + txc->status & (STA_PPSFREQ | STA_PPSTIME)) + return -EINVAL; + } + return 0; } @@ -2492,74 +2840,103 @@ unsigned long random_get_entropy_fallback(void) } EXPORT_SYMBOL_GPL(random_get_entropy_fallback); -/** - * do_adjtimex() - Accessor function to NTP __do_adjtimex function - * @txc: Pointer to kernel_timex structure containing NTP parameters - */ -int do_adjtimex(struct __kernel_timex *txc) +struct adjtimex_result { + struct audit_ntp_data ad; + struct timespec64 delta; + bool clock_set; +}; + +static int __do_adjtimex(struct tk_data *tkd, struct __kernel_timex *txc, + struct adjtimex_result *result) { - struct audit_ntp_data ad; - bool offset_set = false; - bool clock_set = false; + struct timekeeper *tks = &tkd->shadow_timekeeper; + bool aux_clock = !timekeeper_is_core_tk(tks); struct timespec64 ts; + s32 orig_tai, tai; int ret; /* Validate the data before disabling interrupts */ - ret = 
timekeeping_validate_timex(txc); + ret = timekeeping_validate_timex(txc, aux_clock); if (ret) return ret; add_device_randomness(txc, sizeof(*txc)); - if (txc->modes & ADJ_SETOFFSET) { - struct timespec64 delta; + if (!aux_clock) + ktime_get_real_ts64(&ts); + else + tk_get_aux_ts64(tkd->timekeeper.id, &ts); + + add_device_randomness(&ts, sizeof(ts)); - delta.tv_sec = txc->time.tv_sec; - delta.tv_nsec = txc->time.tv_usec; + guard(raw_spinlock_irqsave)(&tkd->lock); + + if (!tks->clock_valid) + return -ENODEV; + + if (txc->modes & ADJ_SETOFFSET) { + result->delta.tv_sec = txc->time.tv_sec; + result->delta.tv_nsec = txc->time.tv_usec; if (!(txc->modes & ADJ_NANO)) - delta.tv_nsec *= 1000; - ret = timekeeping_inject_offset(&delta); + result->delta.tv_nsec *= 1000; + ret = __timekeeping_inject_offset(tkd, &result->delta); if (ret) return ret; - - offset_set = delta.tv_sec != 0; - audit_tk_injoffset(delta); + result->clock_set = true; } - audit_ntp_init(&ad); + orig_tai = tai = tks->tai_offset; + ret = ntp_adjtimex(tks->id, txc, &ts, &tai, &result->ad); - ktime_get_real_ts64(&ts); - add_device_randomness(&ts, sizeof(ts)); + if (tai != orig_tai) { + __timekeeping_set_tai_offset(tks, tai); + timekeeping_update_from_shadow(tkd, TK_CLOCK_WAS_SET); + result->clock_set = true; + } else { + tk_update_leap_state_all(tkd); + } - scoped_guard (raw_spinlock_irqsave, &tk_core.lock) { - struct timekeeper *tks = &tk_core.shadow_timekeeper; - s32 orig_tai, tai; + /* Update the multiplier immediately if frequency was set directly */ + if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) + result->clock_set |= __timekeeping_advance(tkd, TK_ADV_FREQ); - orig_tai = tai = tks->tai_offset; - ret = __do_adjtimex(txc, &ts, &tai, &ad); + return ret; +} - if (tai != orig_tai) { - __timekeeping_set_tai_offset(tks, tai); - timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET); - clock_set = true; - } else { - tk_update_leap_state_all(&tk_core); - } - } +/** + * do_adjtimex() - Accessor function to NTP 
__do_adjtimex function + * @txc: Pointer to kernel_timex structure containing NTP parameters + */ +int do_adjtimex(struct __kernel_timex *txc) +{ + struct adjtimex_result result = { }; + int ret; + + ret = __do_adjtimex(&tk_core, txc, &result); + if (ret < 0) + return ret; - audit_ntp_log(&ad); + if (txc->modes & ADJ_SETOFFSET) + audit_tk_injoffset(result.delta); - /* Update the multiplier immediately if frequency was set directly */ - if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK)) - clock_set |= timekeeping_advance(TK_ADV_FREQ); + audit_ntp_log(&result.ad); - if (clock_set) + if (result.clock_set) clock_was_set(CLOCK_SET_WALL); - ntp_notify_cmos_timer(offset_set); + ntp_notify_cmos_timer(result.delta.tv_sec != 0); return ret; } +/* + * Invoked from NTP with the time keeper lock held, so lockless access is + * fine. + */ +long ktime_get_ntp_seconds(unsigned int id) +{ + return timekeeper_data[id].timekeeper.xtime_sec; +} + #ifdef CONFIG_NTP_PPS /** * hardpps() - Accessor function to NTP __hardpps function @@ -2573,3 +2950,321 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ + +#ifdef CONFIG_POSIX_AUX_CLOCKS +#include "posix-timers.h" + +/* + * Bitmap for the activated auxiliary timekeepers to allow lockless quick + * checks in the hot paths without touching extra cache lines. If set, then + * the state of the corresponding timekeeper has to be re-checked under + * timekeeper::lock. 
+ */ +static unsigned long aux_timekeepers; + +static inline unsigned int clockid_to_tkid(unsigned int id) +{ + return TIMEKEEPER_AUX_FIRST + id - CLOCK_AUX; +} + +static inline struct tk_data *aux_get_tk_data(clockid_t id) +{ + if (!clockid_aux_valid(id)) + return NULL; + return &timekeeper_data[clockid_to_tkid(id)]; +} + +/* Invoked from timekeeping after a clocksource change */ +static void tk_aux_update_clocksource(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + struct timekeeper *tks = &tkd->shadow_timekeeper; + + guard(raw_spinlock_irqsave)(&tkd->lock); + if (!tks->clock_valid) + continue; + + timekeeping_forward_now(tks); + tk_setup_internals(tks, tk_core.timekeeper.tkr_raw.clock); + timekeeping_update_from_shadow(tkd, TK_UPDATE_ALL); + } +} + +static void tk_aux_advance(void) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + unsigned int id; + + /* Lockless quick check to avoid extra cache lines */ + for_each_set_bit(id, &active, BITS_PER_LONG) { + struct tk_data *aux_tkd = &timekeeper_data[id + TIMEKEEPER_AUX_FIRST]; + + guard(raw_spinlock)(&aux_tkd->lock); + if (aux_tkd->shadow_timekeeper.clock_valid) + __timekeeping_advance(aux_tkd, TK_ADV_TICK); + } +} + +/** + * ktime_get_aux - Get time for an AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) 
+ * @kt: Pointer to ktime_t to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux(clockid_t id, ktime_t *kt) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tk; + unsigned int seq; + ktime_t base; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + if (!aux_tkd) + return false; + + aux_tk = &aux_tkd->timekeeper; + do { + seq = read_seqcount_begin(&aux_tkd->seq); + if (!aux_tk->clock_valid) + return false; + + base = ktime_add(aux_tk->tkr_mono.base, aux_tk->offs_aux); + nsecs = timekeeping_get_ns(&aux_tk->tkr_mono); + } while (read_seqcount_retry(&aux_tkd->seq, seq)); + + *kt = ktime_add_ns(base, nsecs); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux); + +/** + * ktime_get_aux_ts64 - Get time for an AUX clock + * @id: ID of the clock to read (CLOCK_AUX...) + * @ts: Pointer to timespec64 to store the time stamp + * + * Returns: True if the timestamp is valid, false otherwise + */ +bool ktime_get_aux_ts64(clockid_t id, struct timespec64 *ts) +{ + ktime_t now; + + if (!ktime_get_aux(id, &now)) + return false; + *ts = ktime_to_timespec64(now); + return true; +} +EXPORT_SYMBOL_GPL(ktime_get_aux_ts64); + +static int aux_get_res(clockid_t id, struct timespec64 *tp) +{ + if (!clockid_aux_valid(id)) + return -ENODEV; + + tp->tv_sec = aux_clock_resolution_ns() / NSEC_PER_SEC; + tp->tv_nsec = aux_clock_resolution_ns() % NSEC_PER_SEC; + return 0; +} + +static int aux_get_timespec(clockid_t id, struct timespec64 *tp) +{ + return ktime_get_aux_ts64(id, tp) ? 
0 : -ENODEV; +} + +static int aux_clock_set(const clockid_t id, const struct timespec64 *tnew) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks; + ktime_t tnow, nsecs; + + if (!timespec64_valid_settod(tnew)) + return -EINVAL; + if (!aux_tkd) + return -ENODEV; + + aux_tks = &aux_tkd->shadow_timekeeper; + + guard(raw_spinlock_irq)(&aux_tkd->lock); + if (!aux_tks->clock_valid) + return -ENODEV; + + /* Forward the timekeeper base time */ + timekeeping_forward_now(aux_tks); + /* + * Get the updated base time. tkr_mono.base has not been + * updated yet, so do that first. That makes the update + * in timekeeping_update_from_shadow() redundant, but + * that's harmless. After that @tnow can be calculated + * by using tkr_mono::cycle_last, which has been set + * by timekeeping_forward_now(). + */ + tk_update_ktime_data(aux_tks); + nsecs = timekeeping_cycles_to_ns(&aux_tks->tkr_mono, aux_tks->tkr_mono.cycle_last); + tnow = ktime_add(aux_tks->tkr_mono.base, nsecs); + + /* + * Calculate the new AUX offset as delta to @tnow ("monotonic"). + * That avoids all the tk::xtime back and forth conversions as + * xtime ("realtime") is not applicable for auxiliary clocks and + * kept in sync with "monotonic". + */ + tk_update_aux_offs(aux_tks, ktime_sub(timespec64_to_ktime(*tnew), tnow)); + + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); + return 0; +} + +static int aux_clock_adj(const clockid_t id, struct __kernel_timex *txc) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct adjtimex_result result = { }; + + if (!aux_tkd) + return -ENODEV; + + /* + * @result is ignored for now as there are neither hrtimers nor a + * RTC related to auxiliary clocks for now. 
+ */ + return __do_adjtimex(aux_tkd, txc, &result); +} + +const struct k_clock clock_aux = { + .clock_getres = aux_get_res, + .clock_get_timespec = aux_get_timespec, + .clock_set = aux_clock_set, + .clock_adj = aux_clock_adj, +}; + +static void aux_clock_enable(clockid_t id) +{ + struct tk_read_base *tkr_raw = &tk_core.timekeeper.tkr_raw; + struct tk_data *aux_tkd = aux_get_tk_data(id); + struct timekeeper *aux_tks = &aux_tkd->shadow_timekeeper; + + /* Prevent the core timekeeper from changing. */ + guard(raw_spinlock_irq)(&tk_core.lock); + + /* + * Setup the auxiliary clock assuming that the raw core timekeeper + * clock frequency conversion is close enough. Userspace has to + * adjust for the deviation via clock_adjtime(2). + */ + guard(raw_spinlock_nested)(&aux_tkd->lock); + + /* Remove leftovers of a previous registration */ + memset(aux_tks, 0, sizeof(*aux_tks)); + /* Restore the timekeeper id */ + aux_tks->id = aux_tkd->timekeeper.id; + /* Setup the timekeeper based on the current system clocksource */ + tk_setup_internals(aux_tks, tkr_raw->clock); + + /* Mark it valid and set it live */ + aux_tks->clock_valid = true; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static void aux_clock_disable(clockid_t id) +{ + struct tk_data *aux_tkd = aux_get_tk_data(id); + + guard(raw_spinlock_irq)(&aux_tkd->lock); + aux_tkd->shadow_timekeeper.clock_valid = false; + timekeeping_update_from_shadow(aux_tkd, TK_UPDATE_ALL); +} + +static DEFINE_MUTEX(aux_clock_mutex); + +static ssize_t aux_clock_enable_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + bool enable; + + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (kstrtobool(buf, &enable) < 0) + return -EINVAL; + + guard(mutex)(&aux_clock_mutex); + if (enable == test_bit(id, &aux_timekeepers)) + return count; + + if (enable) { + aux_clock_enable(CLOCK_AUX + id); + set_bit(id, 
&aux_timekeepers); + } else { + aux_clock_disable(CLOCK_AUX + id); + clear_bit(id, &aux_timekeepers); + } + return count; +} + +static ssize_t aux_clock_enable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + unsigned long active = READ_ONCE(aux_timekeepers); + /* Lazy atoi() as name is "0..7" */ + int id = kobj->name[0] & 0x7; + + return sysfs_emit(buf, "%d\n", test_bit(id, &active)); +} + +static struct kobj_attribute aux_clock_enable_attr = __ATTR_RW(aux_clock_enable); + +static struct attribute *aux_clock_enable_attrs[] = { + &aux_clock_enable_attr.attr, + NULL +}; + +static const struct attribute_group aux_clock_enable_attr_group = { + .attrs = aux_clock_enable_attrs, +}; + +static int __init tk_aux_sysfs_init(void) +{ + struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); + int ret = -ENOMEM; + + if (!tko) + return ret; + + auxo = kobject_create_and_add("aux_clocks", tko); + if (!auxo) + goto err_clean; + + for (int i = 0; i < MAX_AUX_CLOCKS; i++) { + char id[2] = { [0] = '0' + i, }; + struct kobject *clk = kobject_create_and_add(id, auxo); + + if (!clk) { + ret = -ENOMEM; + goto err_clean; + } + + ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); + if (ret) + goto err_clean; + } + return 0; + +err_clean: + kobject_put(auxo); + kobject_put(tko); + return ret; +} +late_initcall(tk_aux_sysfs_init); + +static __init void tk_aux_setup(void) +{ + for (int i = TIMEKEEPER_AUX_FIRST; i <= TIMEKEEPER_AUX_LAST; i++) + tkd_basic_setup(&timekeeper_data[i], i, false); +} +#endif /* CONFIG_POSIX_AUX_CLOCKS */ diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 543beba096c7..198d0608db74 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -9,6 +9,8 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_boot, ktime_t *offs_tai); +bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles); + extern int 
timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern void timekeeping_warp_clock(void); diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 8c9079108ffb..973ede670a36 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -45,4 +45,7 @@ static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta) unsigned long timekeeper_lock_irqsave(void); void timekeeper_unlock_irqrestore(unsigned long flags); +/* NTP specific interface to access the current seconds value */ +long ktime_get_ntp_seconds(unsigned int id); + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index c8f776dc6ee0..04d928c21aba 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -281,7 +281,7 @@ DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); static void timers_update_migration(void) { - if (sysctl_timer_migration && tick_nohz_active) + if (sysctl_timer_migration && tick_nohz_is_active()) static_branch_enable(&timers_migration_enabled); else static_branch_disable(&timers_migration_enabled); @@ -386,32 +386,6 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, } /** - * __round_jiffies - function to round jiffies to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * __round_jiffies() rounds an absolute time in the future (in jiffies) - * up or down to (approximately) full seconds. This is useful for timers - * for which the exact time they fire does not matter too much, as long as - * they fire approximately every X seconds. - * - * By rounding these timers to whole seconds, all such timers will fire - * at the same time, rather than at various times spread out. The goal - * of this is to have the CPU wake up less, which saves power. 
- * - * The exact rounding is skewed for each processor to avoid all - * processors firing at the exact same time, which could lead - * to lock contention or spurious cache line bouncing. - * - * The return value is the rounded version of the @j parameter. - */ -unsigned long __round_jiffies(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, false); -} -EXPORT_SYMBOL_GPL(__round_jiffies); - -/** * __round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen @@ -483,22 +457,6 @@ unsigned long round_jiffies_relative(unsigned long j) EXPORT_SYMBOL_GPL(round_jiffies_relative); /** - * __round_jiffies_up - function to round jiffies up to a full second - * @j: the time in (absolute) jiffies that should be rounded - * @cpu: the processor number on which the timeout will happen - * - * This is the same as __round_jiffies() except that it will never - * round down. This is useful for timeouts for which the exact time - * of firing does not matter too much, as long as they don't fire too - * early. 
- */ -unsigned long __round_jiffies_up(unsigned long j, int cpu) -{ - return round_jiffies_common(j, cpu, true); -} -EXPORT_SYMBOL_GPL(__round_jiffies_up); - -/** * __round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen @@ -744,7 +702,7 @@ static bool timer_fixup_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); + timer_delete_sync(timer); debug_object_init(timer, &timer_debug_descr); return true; default: @@ -790,7 +748,7 @@ static bool timer_fixup_free(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_ACTIVE: - del_timer_sync(timer); + timer_delete_sync(timer); debug_object_free(timer, &timer_debug_descr); return true; default: @@ -850,7 +808,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key); -void init_timer_on_stack_key(struct timer_list *timer, +void timer_init_key_on_stack(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) @@ -858,13 +816,13 @@ void init_timer_on_stack_key(struct timer_list *timer, debug_object_init_on_stack(timer, &timer_debug_descr); do_init_timer(timer, func, flags, name, key); } -EXPORT_SYMBOL_GPL(init_timer_on_stack_key); +EXPORT_SYMBOL_GPL(timer_init_key_on_stack); -void destroy_timer_on_stack(struct timer_list *timer) +void timer_destroy_on_stack(struct timer_list *timer) { debug_object_free(timer, &timer_debug_descr); } -EXPORT_SYMBOL_GPL(destroy_timer_on_stack); +EXPORT_SYMBOL_GPL(timer_destroy_on_stack); #else static inline void debug_timer_init(struct timer_list *timer) { } @@ -904,7 +862,7 @@ static void do_init_timer(struct timer_list *timer, } /** - * init_timer_key - initialize a timer + * timer_init_key - initialize a timer * @timer: the timer to be 
initialized * @func: timer callback function * @flags: timer flags @@ -912,17 +870,17 @@ static void do_init_timer(struct timer_list *timer, * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * - * init_timer_key() must be done to a timer prior to calling *any* of the + * timer_init_key() must be done to a timer prior to calling *any* of the * other timer functions. */ -void init_timer_key(struct timer_list *timer, +void timer_init_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); do_init_timer(timer, func, flags, name, key); } -EXPORT_SYMBOL(init_timer_key); +EXPORT_SYMBOL(timer_init_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { @@ -1212,10 +1170,10 @@ EXPORT_SYMBOL(mod_timer_pending); * * mod_timer(timer, expires) is equivalent to: * - * del_timer(timer); timer->expires = expires; add_timer(timer); + * timer_delete(timer); timer->expires = expires; add_timer(timer); * * mod_timer() is more efficient than the above open coded sequence. In - * case that the timer is inactive, the del_timer() part is a NOP. The + * case that the timer is inactive, the timer_delete() part is a NOP. The * timer is in any case activated with the new expiry time @expires. 
* * Note that if there are multiple unserialized concurrent users of the @@ -1500,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) base = lock_timer_base(timer, &flags); - if (base->running_timer != timer) + if (base->running_timer != timer) { ret = detach_if_pending(timer, base, true); - if (shutdown) - timer->function = NULL; + if (shutdown) + timer->function = NULL; + } raw_spin_unlock_irqrestore(&base->lock, flags); @@ -1511,7 +1470,7 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) } /** - * try_to_del_timer_sync - Try to deactivate a timer + * timer_delete_sync_try - Try to deactivate a timer * @timer: Timer to deactivate * * This function tries to deactivate a timer. On success the timer is not @@ -1526,11 +1485,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) * * %1 - The timer was pending and deactivated * * %-1 - The timer callback function is running on a different CPU */ -int try_to_del_timer_sync(struct timer_list *timer) +int timer_delete_sync_try(struct timer_list *timer) { return __try_to_del_timer_sync(timer, false); } -EXPORT_SYMBOL(try_to_del_timer_sync); +EXPORT_SYMBOL(timer_delete_sync_try); #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) @@ -1900,7 +1859,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) unsigned long clk, next, adj; unsigned lvl, offset = 0; - next = base->clk + NEXT_TIMER_MAX_DELTA; + next = base->clk + TIMER_NEXT_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); @@ -1963,7 +1922,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) WRITE_ONCE(base->next_expiry, next); base->next_expiry_recalc = false; - base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); + base->timers_pending = !(next == base->clk + TIMER_NEXT_MAX_DELTA); } #ifdef 
CONFIG_NO_HZ_COMMON @@ -2015,7 +1974,7 @@ static unsigned long next_timer_interrupt(struct timer_base *base, * easy comparable to find out which base holds the first pending timer. */ if (!base->timers_pending) - WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA); + WRITE_ONCE(base->next_expiry, basej + TIMER_NEXT_MAX_DELTA); return base->next_expiry; } @@ -2360,6 +2319,7 @@ u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle) */ void timer_clear_idle(void) { + int this_cpu = smp_processor_id(); /* * We do this unlocked. The worst outcome is a remote pinned timer * enqueue sending a pointless IPI, but taking the lock would just @@ -2368,9 +2328,9 @@ void timer_clear_idle(void) * path. Required for BASE_LOCAL only. */ __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false); - if (tick_nohz_full_cpu(smp_processor_id())) + if (tick_nohz_full_cpu(this_cpu)) __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false); - trace_timer_base_idle(false, smp_processor_id()); + trace_timer_base_idle(false, this_cpu); /* Activate without holding the timer_base->lock */ tmigr_cpu_activate(); @@ -2399,7 +2359,7 @@ static inline void __run_timers(struct timer_base *base) * timer at this clk are that all matching timers have been * dequeued or no timer has been queued since * base::next_expiry was set to base::clk + - * NEXT_TIMER_MAX_DELTA. + * TIMER_NEXT_MAX_DELTA. 
*/ WARN_ON_ONCE(!levels && !base->next_expiry_recalc && base->timers_pending); @@ -2514,7 +2474,7 @@ void update_process_times(int user_tick) run_local_timers(); rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK - if (in_irq()) + if (in_hardirq()) irq_work_tick(); #endif sched_tick(); @@ -2544,7 +2504,7 @@ int timers_prepare_cpu(unsigned int cpu) for (b = 0; b < NR_BASES; b++) { base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; - base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; base->next_expiry_recalc = false; base->timers_pending = false; base->is_idle = false; @@ -2599,7 +2559,7 @@ static void __init init_timer_cpu(int cpu) base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; - base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->next_expiry = base->clk + TIMER_NEXT_MAX_DELTA; timer_base_init_expiry_lock(base); } } @@ -2612,7 +2572,7 @@ static void __init init_timer_cpus(void) init_timer_cpu(cpu); } -void __init init_timers(void) +void __init timers_init(void) { init_timer_cpus(); posix_cputimers_init_work(); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 1c311c46da50..427d7ddea3af 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,8 +46,8 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); - SEQ_printf(m, ", S:%02x", timer->state); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); + SEQ_printf(m, ", S:%02x", timer->is_queued); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), @@ -56,13 +56,11 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); } -static void 
-print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, - u64 now) +static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + struct timerqueue_linked_node *curr; struct hrtimer *timer, tmp; unsigned long next = 0, i; - struct timerqueue_node *curr; unsigned long flags; next_one: @@ -72,13 +70,13 @@ next_one: raw_spin_lock_irqsave(&base->cpu_base->lock, flags); - curr = timerqueue_getnext(&base->active); + curr = timerqueue_linked_first(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { - curr = timerqueue_iterate_next(curr); + curr = timerqueue_linked_next(curr); i++; } @@ -98,15 +96,13 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); - - SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS - SEQ_printf(m, " .offset: %Lu nsecs\n", - (unsigned long long) ktime_to_ns(base->offset)); + SEQ_printf(m, " .offset: %Ld nsecs\n", + (long long) base->offset); #endif SEQ_printf(m, "active timers:\n"); print_active_timers(m, base, now + ktime_to_ns(base->offset)); diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 2f6330831f08..52c15affdbff 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -10,6 +10,7 @@ #include <linux/spinlock.h> #include <linux/timerqueue.h> #include <trace/events/ipi.h> +#include <linux/sched/isolation.h> #include "timer_migration.h" #include "tick-internal.h" @@ -420,14 +421,53 @@ static struct list_head *tmigr_level_list __read_mostly; static unsigned int tmigr_hierarchy_levels __read_mostly; static unsigned int tmigr_crossnode_level __read_mostly; +static struct tmigr_group 
*tmigr_root; + static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); +/* + * CPUs available for timer migration. + * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock. + * Additionally tmigr_available_mutex serializes set/clear operations with each other. + */ +static cpumask_var_t tmigr_available_cpumask; +static DEFINE_MUTEX(tmigr_available_mutex); + +/* Enabled during late initcall */ +static DEFINE_STATIC_KEY_FALSE(tmigr_exclude_isolated); + #define TMIGR_NONE 0xFF #define BIT_CNT 8 static inline bool tmigr_is_not_available(struct tmigr_cpu *tmc) { - return !(tmc->tmgroup && tmc->online); + return !(tmc->tmgroup && tmc->available); +} + +/* + * Returns true if @cpu should be excluded from the hierarchy as isolated. + * Domain isolated CPUs don't participate in timer migration, nohz_full CPUs + * are still part of the hierarchy but become idle (from a tick and timer + * migration perspective) when they stop their tick. This lets the timekeeping + * CPU handle their global timers. Marking also isolated CPUs as idle would be + * too costly, hence they are completely excluded from the hierarchy. + * This check is necessary, for instance, to prevent offline isolated CPUs from + * being incorrectly marked as available once getting back online. + * + * This function returns false during early boot and the isolation logic is + * enabled only after isolated CPUs are marked as unavailable at late boot. + * The tick CPU can be isolated at boot, however we cannot mark it as + * unavailable to avoid having no global migrator for the nohz_full CPUs. This + * should be ensured by the callers of this function: implicitly from hotplug + * callbacks and explicitly in tmigr_init_isolation() and + * tmigr_isolated_exclude_cpumask(). 
+ */ +static inline bool tmigr_is_isolated(int cpu) +{ + if (!static_branch_unlikely(&tmigr_exclude_isolated)) + return false; + return (!housekeeping_cpu(cpu, HK_TYPE_DOMAIN) && + housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)); } /* @@ -502,11 +542,6 @@ static bool tmigr_check_lonely(struct tmigr_group *group) * @now: timer base monotonic * @check: is set if there is the need to handle remote timers; * required in tmigr_requires_handle_remote() only - * @tmc_active: this flag indicates, whether the CPU which triggers - * the hierarchy walk is !idle in the timer migration - * hierarchy. When the CPU is idle and the whole hierarchy is - * idle, only the first event of the top level has to be - * considered. */ struct tmigr_walk { u64 nextexp; @@ -517,16 +552,13 @@ struct tmigr_walk { unsigned long basej; u64 now; bool check; - bool tmc_active; }; typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *); -static void __walk_groups(up_f up, struct tmigr_walk *data, - struct tmigr_cpu *tmc) +static void __walk_groups_from(up_f up, struct tmigr_walk *data, + struct tmigr_group *child, struct tmigr_group *group) { - struct tmigr_group *child = NULL, *group = tmc->tmgroup; - do { WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels); @@ -544,6 +576,12 @@ static void __walk_groups(up_f up, struct tmigr_walk *data, } while (group); } +static void __walk_groups(up_f up, struct tmigr_walk *data, + struct tmigr_cpu *tmc) +{ + __walk_groups_from(up, data, NULL, tmc->tmgroup); +} + static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc) { lockdep_assert_held(&tmc->lock); @@ -708,7 +746,7 @@ void tmigr_cpu_activate(void) /* * Returns true, if there is nothing to be propagated to the next level * - * @data->firstexp is set to expiry of first gobal event of the (top level of + * @data->firstexp is set to expiry of first global event of the (top level of * the) hierarchy, but only when hierarchy is completely idle. 
* * The child and group states need to be read under the lock, to prevent a race @@ -926,7 +964,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, * updated the event takes care when hierarchy is completely * idle. Otherwise the migrator does it as the event is enqueued. */ - if (!tmc->online || tmc->remote || tmc->cpuevt.ignore || + if (!tmc->available || tmc->remote || tmc->cpuevt.ignore || now < tmc->cpuevt.nextevt.expires) { raw_spin_unlock_irq(&tmc->lock); return; @@ -940,8 +978,12 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, /* Drop the lock to allow the remote CPU to exit idle */ raw_spin_unlock_irq(&tmc->lock); - if (cpu != smp_processor_id()) - timer_expire_remote(cpu); + /* + * This can't exclude the local CPU because jiffies might have advanced + * after the timer softirq invoked run_timer_base(BASE_GLOBAL) and the + * point where the jiffies snapshot @jif was taken in tmigr_handle_remote(). + */ + timer_expire_remote(cpu); /* * Lock ordering needs to be preserved - timer_base locks before tmigr @@ -973,7 +1015,7 @@ static void tmigr_handle_remote_cpu(unsigned int cpu, u64 now, * (See also section "Required event and timerqueue update after a * remote expiry" in the documentation at the top) */ - if (!tmc->online || !tmc->idle) { + if (!tmc->available || !tmc->idle) { timer_unlock_remote_bases(cpu); goto unlock; } @@ -1113,15 +1155,6 @@ static bool tmigr_requires_handle_remote_up(struct tmigr_group *group, */ if (!tmigr_check_migrator(group, childmask)) return true; - - /* - * When there is a parent group and the CPU which triggered the - * hierarchy walk is not active, proceed the walk to reach the top level - * group before reading the next_expiry value. - */ - if (group->parent && !data->tmc_active) - return false; - /* * The lock is required on 32bit architectures to read the variable * consistently with a concurrent writer. 
On 64bit the lock is not @@ -1166,7 +1199,6 @@ bool tmigr_requires_handle_remote(void) data.now = get_jiffies_update(&jif); data.childmask = tmc->groupmask; data.firstexp = KTIME_MAX; - data.tmc_active = !tmc->idle; data.check = false; /* @@ -1405,23 +1437,20 @@ u64 tmigr_quick_check(u64 nextevt) return KTIME_MAX; do { - if (!tmigr_check_lonely(group)) { + if (!tmigr_check_lonely(group)) return KTIME_MAX; - } else { - /* - * Since current CPU is active, events may not be sorted - * from bottom to the top because the CPU's event is ignored - * up to the top and its sibling's events not propagated upwards. - * Thus keep track of the lowest observed expiry. - */ - nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); - if (!group->parent) - return nextevt; - } + + /* + * Since current CPU is active, events may not be sorted + * from bottom to the top because the CPU's event is ignored + * up to the top and its sibling's events not propagated upwards. + * Thus keep track of the lowest observed expiry. 
+ */ + nextevt = min_t(u64, nextevt, READ_ONCE(group->next_expiry)); group = group->parent; } while (group); - return KTIME_MAX; + return nextevt; } /* @@ -1435,38 +1464,43 @@ static long tmigr_trigger_active(void *unused) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); - WARN_ON_ONCE(!tmc->online || tmc->idle); + WARN_ON_ONCE(!tmc->available || tmc->idle); return 0; } -static int tmigr_cpu_offline(unsigned int cpu) +static int tmigr_clear_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); int migrator; u64 firstexp; - raw_spin_lock_irq(&tmc->lock); - tmc->online = false; - WRITE_ONCE(tmc->wakeup, KTIME_MAX); + guard(mutex)(&tmigr_available_mutex); - /* - * CPU has to handle the local events on his own, when on the way to - * offline; Therefore nextevt value is set to KTIME_MAX - */ - firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); - trace_tmigr_cpu_offline(tmc); - raw_spin_unlock_irq(&tmc->lock); + cpumask_clear_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (!tmc->available) + return 0; + tmc->available = false; + WRITE_ONCE(tmc->wakeup, KTIME_MAX); + + /* + * CPU has to handle the local events on his own, when on the way to + * offline; Therefore nextevt value is set to KTIME_MAX + */ + firstexp = __tmigr_cpu_deactivate(tmc, KTIME_MAX); + trace_tmigr_cpu_unavailable(tmc); + } if (firstexp != KTIME_MAX) { - migrator = cpumask_any_but(cpu_online_mask, cpu); + migrator = cpumask_any(tmigr_available_cpumask); work_on_cpu(migrator, tmigr_trigger_active, NULL); } return 0; } -static int tmigr_cpu_online(unsigned int cpu) +static int __tmigr_set_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); @@ -1474,16 +1508,131 @@ static int tmigr_cpu_online(unsigned int cpu) if (WARN_ON_ONCE(!tmc->tmgroup)) return -EINVAL; - raw_spin_lock_irq(&tmc->lock); - trace_tmigr_cpu_online(tmc); - tmc->idle = timer_base_is_idle(); - if (!tmc->idle) - __tmigr_cpu_activate(tmc); 
- tmc->online = true; - raw_spin_unlock_irq(&tmc->lock); + guard(mutex)(&tmigr_available_mutex); + + cpumask_set_cpu(cpu, tmigr_available_cpumask); + scoped_guard(raw_spinlock_irq, &tmc->lock) { + if (tmc->available) + return 0; + trace_tmigr_cpu_available(tmc); + tmc->idle = timer_base_is_idle(); + if (!tmc->idle) + __tmigr_cpu_activate(tmc); + tmc->available = true; + } + return 0; +} + +static int tmigr_set_cpu_available(unsigned int cpu) +{ + if (tmigr_is_isolated(cpu)) + return 0; + + return __tmigr_set_cpu_available(cpu); +} + +static void tmigr_cpu_isolate(struct work_struct *ignored) +{ + tmigr_clear_cpu_available(smp_processor_id()); +} + +static void tmigr_cpu_unisolate(struct work_struct *ignored) +{ + /* + * Don't call tmigr_is_isolated() ->housekeeping_cpu() directly because + * the cpuset mutex is correctly held by the workqueue caller but lockdep + * doesn't know that. + */ + __tmigr_set_cpu_available(smp_processor_id()); +} + +/** + * tmigr_isolated_exclude_cpumask - Exclude given CPUs from hierarchy + * @exclude_cpumask: the cpumask to be excluded from timer migration hierarchy + * + * This function can be called from cpuset code to provide the new set of + * isolated CPUs that should be excluded from the hierarchy. + * Online CPUs not present in exclude_cpumask but already excluded are brought + * back to the hierarchy. + * Functions to isolate/unisolate need to be called locally and can sleep. + */ +int tmigr_isolated_exclude_cpumask(struct cpumask *exclude_cpumask) +{ + struct work_struct __percpu *works __free(free_percpu) = + alloc_percpu(struct work_struct); + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + int cpu; + + if (!works) + return -ENOMEM; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + /* + * First set previously isolated CPUs as available (unisolate). + * This cpumask contains only CPUs that switched to available now. 
+ */ + guard(cpus_read_lock)(); + cpumask_andnot(cpumask, cpu_online_mask, exclude_cpumask); + cpumask_andnot(cpumask, cpumask, tmigr_available_cpumask); + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_unisolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + + /* + * Then clear previously available CPUs (isolate). + * This cpumask contains only CPUs that switched to not available now. + * There cannot be overlap with the newly available ones. + */ + cpumask_and(cpumask, exclude_cpumask, tmigr_available_cpumask); + cpumask_and(cpumask, cpumask, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)); + /* + * Handle this here and not in the cpuset code because exclude_cpumask + * might include also the tick CPU if included in isolcpus. + */ + for_each_cpu(cpu, cpumask) { + if (!tick_nohz_cpu_hotpluggable(cpu)) { + cpumask_clear_cpu(cpu, cpumask); + break; + } + } + + for_each_cpu(cpu, cpumask) { + struct work_struct *work = per_cpu_ptr(works, cpu); + + INIT_WORK(work, tmigr_cpu_isolate); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, cpumask) + flush_work(per_cpu_ptr(works, cpu)); + return 0; } +static int __init tmigr_init_isolation(void) +{ + cpumask_var_t cpumask __free(free_cpumask_var) = CPUMASK_VAR_NULL; + + static_branch_enable(&tmigr_exclude_isolated); + + if (!housekeeping_enabled(HK_TYPE_DOMAIN)) + return 0; + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_andnot(cpumask, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + + /* Protect against RCU torture hotplug testing */ + return tmigr_isolated_exclude_cpumask(cpumask); +} +late_initcall(tmigr_init_isolation); + static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, int node) { @@ -1501,21 +1650,6 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, s.seq = 0; atomic_set(&group->migr_state, s.state); - 
/* - * If this is a new top-level, prepare its groupmask in advance. - * This avoids accidents where yet another new top-level is - * created in the future and made visible before the current groupmask. - */ - if (list_empty(&tmigr_level_list[lvl])) { - group->groupmask = BIT(0); - /* - * The previous top level has prepared its groupmask already, - * simply account it as the first child. - */ - if (lvl > 0) - group->num_children = 1; - } - timerqueue_init_head(&group->events); timerqueue_init(&group->groupevt.nextevt); group->groupevt.nextevt.expires = KTIME_MAX; @@ -1523,8 +1657,7 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, group->groupevt.ignore = true; } -static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, - unsigned int lvl) +static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) { struct tmigr_group *tmp, *group = NULL; @@ -1570,25 +1703,51 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node, return group; } +static bool tmigr_init_root(struct tmigr_group *group, bool activate) +{ + if (!group->parent && group != tmigr_root) { + /* + * This is the new top-level, prepare its groupmask in advance + * to avoid accidents where yet another new top-level is + * created in the future and made visible before this groupmask. + */ + group->groupmask = BIT(0); + WARN_ON_ONCE(activate); + + return true; + } + + return false; + +} + static void tmigr_connect_child_parent(struct tmigr_group *child, struct tmigr_group *parent, bool activate) { - struct tmigr_walk data; - - raw_spin_lock_irq(&child->lock); - raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); + if (tmigr_init_root(parent, activate)) { + /* + * The previous top level had prepared its groupmask already, + * simply account it in advance as the first child. If some groups + * have been created between the old and new root due to node + * mismatch, the new root's child will be initialized accordingly. 
+ */ + parent->num_children = 1; + } - if (activate) { + /* Connecting old root to new root ? */ + if (!parent->parent && activate) { /* - * @child is the old top and @parent the new one. In this - * case groupmask is pre-initialized and @child already - * accounted, along with its new sibling corresponding to the - * CPU going up. + * @child is the old top, or in case of node mismatch, some + * intermediate group between the old top and the new one in + * @parent. In this case the @child must be pre-accounted above + * as the first child. Its new inactive sibling corresponding + * to the CPU going up has been accounted as the second child. */ - WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); + WARN_ON_ONCE(parent->num_children != 2); + child->groupmask = BIT(0); } else { - /* Adding @child for the CPU going up to @parent. */ + /* Common case adding @child for the CPU going up to @parent. */ child->groupmask = BIT(parent->num_children++); } @@ -1599,87 +1758,61 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, */ smp_store_release(&child->parent, parent); - raw_spin_unlock(&parent->lock); - raw_spin_unlock_irq(&child->lock); - trace_tmigr_connect_child_parent(child); - - if (!activate) - return; - - /* - * To prevent inconsistent states, active children need to be active in - * the new parent as well. Inactive children are already marked inactive - * in the parent group: - * - * * When new groups were created by tmigr_setup_groups() starting from - * the lowest level (and not higher then one level below the current - * top level), then they are not active. They will be set active when - * the new online CPU comes active. - * - * * But if a new group above the current top level is required, it is - * mandatory to propagate the active state of the already existing - * child to the new parent. So tmigr_connect_child_parent() is - * executed with the formerly top level group (child) and the newly - * created group (parent). 
- * - * * It is ensured that the child is active, as this setup path is - * executed in hotplug prepare callback. This is exectued by an - * already connected and !idle CPU. Even if all other CPUs go idle, - * the CPU executing the setup will be responsible up to current top - * level group. And the next time it goes inactive, it will release - * the new childmask and parent to subsequent walkers through this - * @child. Therefore propagate active state unconditionally. - */ - data.childmask = child->groupmask; - - /* - * There is only one new level per time (which is protected by - * tmigr_mutex). When connecting the child and the parent and set the - * child active when the parent is inactive, the parent needs to be the - * uppermost level. Otherwise there went something wrong! - */ - WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent); } -static int tmigr_setup_groups(unsigned int cpu, unsigned int node) +static int tmigr_setup_groups(unsigned int cpu, unsigned int node, + struct tmigr_group *start, bool activate) { struct tmigr_group *group, *child, **stack; - int top = 0, err = 0, i = 0; - struct list_head *lvllist; + int i, top = 0, err = 0, start_lvl = 0; + bool root_mismatch = false; - stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL); + stack = kzalloc_objs(*stack, tmigr_hierarchy_levels); if (!stack) return -ENOMEM; - do { - group = tmigr_get_group(cpu, node, i); + if (start) { + stack[start->level] = start; + start_lvl = start->level + 1; + } + + if (tmigr_root) + root_mismatch = tmigr_root->numa_node != node; + + for (i = start_lvl; i < tmigr_hierarchy_levels; i++) { + group = tmigr_get_group(node, i); if (IS_ERR(group)) { err = PTR_ERR(group); + i--; break; } top = i; - stack[i++] = group; + stack[i] = group; /* * When booting only less CPUs of a system than CPUs are - * available, not all calculated hierarchy levels are required. 
+ * available, not all calculated hierarchy levels are required, + * unless a node mismatch is detected. * * The loop is aborted as soon as the highest level, which might * be different from tmigr_hierarchy_levels, contains only a - * single group. + * single group, unless the nodes mismatch below tmigr_crossnode_level */ - if (group->parent || list_is_singular(&tmigr_level_list[i - 1])) + if (group->parent) break; + if ((!root_mismatch || i >= tmigr_crossnode_level) && + list_is_singular(&tmigr_level_list[i])) + break; + } - } while (i < tmigr_hierarchy_levels); - - /* Assert single root */ - WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); + /* Assert single root without parent */ + if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels)) + return -EINVAL; - while (i > 0) { - group = stack[--i]; + for (; i >= start_lvl; i--) { + group = stack[i]; if (err < 0) { list_del(&group->list); @@ -1695,12 +1828,10 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) if (i == 0) { struct tmigr_cpu *tmc = per_cpu_ptr(&tmigr_cpu, cpu); - raw_spin_lock_irq(&group->lock); - tmc->tmgroup = group; tmc->groupmask = BIT(group->num_children++); - raw_spin_unlock_irq(&group->lock); + tmigr_init_root(group, activate); trace_tmigr_connect_cpu_parent(tmc); @@ -1708,42 +1839,76 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) continue; } else { child = stack[i - 1]; - /* Will be activated at online time */ - tmigr_connect_child_parent(child, group, false); + tmigr_connect_child_parent(child, group, activate); } + } - /* check if uppermost level was newly created */ - if (top != i) - continue; - - WARN_ON_ONCE(top == 0); + if (err < 0) + goto out; - lvllist = &tmigr_level_list[top]; + if (activate) { + struct tmigr_walk data; + union tmigr_state state; /* - * Newly created root level should have accounted the upcoming - * CPU's child group and pre-accounted the old root. 
+ * To prevent inconsistent states, active children need to be active in + * the new parent as well. Inactive children are already marked inactive + * in the parent group: + * + * * When new groups were created by tmigr_setup_groups() starting from + * the lowest level, then they are not active. They will be set active + * when the new online CPU comes active. + * + * * But if new groups above the current top level are required, it is + * mandatory to propagate the active state of the already existing + * child to the new parents. So tmigr_active_up() activates the + * new parents while walking up from the old root to the new. + * + * * It is ensured that @start is active (or on the way to be activated + * by another CPU that woke up before the current one) as this setup path + * is executed in hotplug prepare callback. This is executed by an already + * connected and !idle CPU in the hierarchy. + * + * * The below RmW atomic operation ensures that: + * + * 1) If the old root has been completely activated, the latest state is + * acquired (the below implicit acquire pairs with the implicit release + * from cmpxchg() in tmigr_active_up()). + * + * 2) If the old root is still on the way to be activated, the lagging behind + * CPU performing the activation will acquire the links up to the new root. + * (The below implicit release pairs with the implicit acquire from cmpxchg() + * in tmigr_active_up()). + * + * 3) Every subsequent CPU below the old root will acquire the new links while + * walking through the old root (The below implicit release pairs with the + * implicit acquire from cmpxchg() in either tmigr_active_up() or + * tmigr_inactive_up()). + */ + state.state = atomic_fetch_or(0, &start->migr_state); + WARN_ON_ONCE(!start->parent); + /* + * If the state of the old root is inactive, another CPU is on its way to activate + * it and propagate to the new root. 
*/ - if (group->num_children == 2 && list_is_singular(lvllist)) { - /* - * The target CPU must never do the prepare work, except - * on early boot when the boot CPU is the target. Otherwise - * it may spuriously activate the old top level group inside - * the new one (nevertheless whether old top level group is - * active or not) and/or release an uninitialized childmask. - */ - WARN_ON_ONCE(cpu == raw_smp_processor_id()); - - lvllist = &tmigr_level_list[top - 1]; - list_for_each_entry(child, lvllist, list) { - if (child->parent) - continue; - - tmigr_connect_child_parent(child, group, true); - } + if (state.active) { + data.childmask = start->groupmask; + __walk_groups_from(tmigr_active_up, &data, start, start->parent); } } + /* Root update */ + if (list_is_singular(&tmigr_level_list[top])) { + group = list_first_entry(&tmigr_level_list[top], + typeof(*group), list); + WARN_ON_ONCE(group->parent); + if (tmigr_root) { + /* Old root should be the same or below */ + WARN_ON_ONCE(tmigr_root->level > top); + } + tmigr_root = group; + } +out: kfree(stack); return err; @@ -1751,12 +1916,31 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) static int tmigr_add_cpu(unsigned int cpu) { + struct tmigr_group *old_root = tmigr_root; int node = cpu_to_node(cpu); int ret; - mutex_lock(&tmigr_mutex); - ret = tmigr_setup_groups(cpu, node); - mutex_unlock(&tmigr_mutex); + guard(mutex)(&tmigr_mutex); + + ret = tmigr_setup_groups(cpu, node, NULL, false); + + /* Root has changed? Connect the old one to the new */ + if (ret >= 0 && old_root && old_root != tmigr_root) { + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. 
+ */ + WARN_ON_ONCE(cpu == raw_smp_processor_id()); + /* + * The (likely) current CPU is expected to be online in the hierarchy, + * otherwise the old root may not be active as expected. + */ + WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available); + ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true); + } return ret; } @@ -1801,6 +1985,11 @@ static int __init tmigr_init(void) if (ncpus == 1) return 0; + if (!zalloc_cpumask_var(&tmigr_available_cpumask, GFP_KERNEL)) { + ret = -ENOMEM; + goto err; + } + /* * Calculate the required hierarchy levels. Unfortunately there is no * reliable information available, unless all possible CPUs have been @@ -1832,7 +2021,8 @@ static int __init tmigr_init(void) */ tmigr_crossnode_level = cpulvl; - tmigr_level_list = kcalloc(tmigr_hierarchy_levels, sizeof(struct list_head), GFP_KERNEL); + tmigr_level_list = kzalloc_objs(struct list_head, + tmigr_hierarchy_levels); if (!tmigr_level_list) goto err; @@ -1850,7 +2040,7 @@ static int __init tmigr_init(void) goto err; ret = cpuhp_setup_state(CPUHP_AP_TMIGR_ONLINE, "tmigr:online", - tmigr_cpu_online, tmigr_cpu_offline); + tmigr_set_cpu_available, tmigr_clear_cpu_available); if (ret) goto err; diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index ae19f70f8170..70879cde6fdd 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -97,7 +97,7 @@ struct tmigr_group { */ struct tmigr_cpu { raw_spinlock_t lock; - bool online; + bool available; bool idle; bool remote; struct tmigr_group *tmgroup; diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c index 05d383143165..aa59919b8f2c 100644 --- a/kernel/time/vsyscall.c +++ b/kernel/time/vsyscall.c @@ -15,29 +15,28 @@ #include "timekeeping_internal.h" -static inline void update_vdso_data(struct vdso_data *vdata, - struct timekeeper *tk) +static inline void fill_clock_configuration(struct vdso_clock *vc, const struct tk_read_base *base) { + vc->cycle_last = 
base->cycle_last; +#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT + vc->max_cycles = base->clock->max_cycles; +#endif + vc->mask = base->mask; + vc->mult = base->mult; + vc->shift = base->shift; +} + +static inline void update_vdso_time_data(struct vdso_time_data *vdata, struct timekeeper *tk) +{ + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; u64 nsec, sec; - vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; -#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_HRES_COARSE].max_cycles = tk->tkr_mono.clock->max_cycles; -#endif - vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; - vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; - vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; - vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; -#ifdef CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT - vdata[CS_RAW].max_cycles = tk->tkr_raw.clock->max_cycles; -#endif - vdata[CS_RAW].mask = tk->tkr_raw.mask; - vdata[CS_RAW].mult = tk->tkr_raw.mult; - vdata[CS_RAW].shift = tk->tkr_raw.shift; + fill_clock_configuration(&vc[CS_HRES_COARSE], &tk->tkr_mono); + fill_clock_configuration(&vc[CS_RAW], &tk->tkr_raw); /* CLOCK_MONOTONIC */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsec = tk->tkr_mono.xtime_nsec; @@ -55,7 +54,7 @@ static inline void update_vdso_data(struct vdso_data *vdata, nsec += (u64)tk->monotonic_to_boot.tv_nsec << tk->tkr_mono.shift; /* CLOCK_BOOTTIME */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; vdso_ts->sec = sec; while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { @@ -65,19 +64,20 @@ static inline void update_vdso_data(struct vdso_data *vdata, vdso_ts->nsec = nsec; /* CLOCK_MONOTONIC_RAW */ - vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; + vdso_ts = &vc[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; vdso_ts->sec = 
tk->raw_sec; vdso_ts->nsec = tk->tkr_raw.xtime_nsec; /* CLOCK_TAI */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_TAI]; vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; } void update_vsyscall(struct timekeeper *tk) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_clock *vc = vdata->clock_data; struct vdso_timestamp *vdso_ts; s32 clock_mode; u64 nsec; @@ -86,55 +86,95 @@ void update_vsyscall(struct timekeeper *tk) vdso_write_begin(vdata); clock_mode = tk->tkr_mono.clock->vdso_clock_mode; - vdata[CS_HRES_COARSE].clock_mode = clock_mode; - vdata[CS_RAW].clock_mode = clock_mode; + vc[CS_HRES_COARSE].clock_mode = clock_mode; + vc[CS_RAW].clock_mode = clock_mode; /* CLOCK_REALTIME also required for time() */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; vdso_ts->sec = tk->xtime_sec; vdso_ts->nsec = tk->tkr_mono.xtime_nsec; /* CLOCK_REALTIME_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; vdso_ts->sec = tk->xtime_sec; - vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + vdso_ts->nsec = tk->coarse_nsec; /* CLOCK_MONOTONIC_COARSE */ - vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; + vdso_ts = &vc[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec = tk->coarse_nsec; nsec = nsec + tk->wall_to_monotonic.tv_nsec; vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); /* * Read without the seqlock held by clock_getres(). - * Note: No need to have a second copy. 
*/ - WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); + WRITE_ONCE(vdata->hrtimer_res, hrtimer_resolution); /* * If the current clocksource is not VDSO capable, then spare the * update of the high resolution parts. */ if (clock_mode != VDSO_CLOCKMODE_NONE) - update_vdso_data(vdata, tk); + update_vdso_time_data(vdata, tk); - __arch_update_vsyscall(vdata); + __arch_update_vdso_clock(&vc[CS_HRES_COARSE]); + __arch_update_vdso_clock(&vc[CS_RAW]); vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); } void update_vsyscall_tz(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; + + vdata->tz_minuteswest = sys_tz.tz_minuteswest; + vdata->tz_dsttime = sys_tz.tz_dsttime; + + __arch_sync_vdso_time_data(vdata); +} + +#ifdef CONFIG_POSIX_AUX_CLOCKS +void vdso_time_update_aux(struct timekeeper *tk) +{ + struct vdso_time_data *vdata = vdso_k_time_data; + struct vdso_timestamp *vdso_ts; + struct vdso_clock *vc; + s32 clock_mode; + u64 nsec; + + vc = &vdata->aux_clock_data[tk->id - TIMEKEEPER_AUX_FIRST]; + vdso_ts = &vc->basetime[VDSO_BASE_AUX]; + clock_mode = tk->tkr_mono.clock->vdso_clock_mode; + if (!tk->clock_valid) + clock_mode = VDSO_CLOCKMODE_NONE; + + /* copy vsyscall data */ + vdso_write_begin_clock(vc); - vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; - vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; + vc->clock_mode = clock_mode; - __arch_sync_vdso_data(vdata); + if (clock_mode != VDSO_CLOCKMODE_NONE) { + fill_clock_configuration(vc, &tk->tkr_mono); + + vdso_ts->sec = tk->xtime_sec + tk->monotonic_to_aux.tv_sec; + + nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; + nsec += tk->monotonic_to_aux.tv_nsec; + vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &nsec); + nsec = nsec << tk->tkr_mono.shift; + vdso_ts->nsec = nsec; + } + + __arch_update_vdso_clock(vc); + + vdso_write_end_clock(vc); + + 
__arch_sync_vdso_time_data(vdata); } +#endif /** * vdso_update_begin - Start of a VDSO update section @@ -150,7 +190,7 @@ void update_vsyscall_tz(void) */ unsigned long vdso_update_begin(void) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; unsigned long flags = timekeeper_lock_irqsave(); vdso_write_begin(vdata); @@ -167,9 +207,9 @@ unsigned long vdso_update_begin(void) */ void vdso_update_end(unsigned long flags) { - struct vdso_data *vdata = __arch_get_k_vdso_data(); + struct vdso_time_data *vdata = vdso_k_time_data; vdso_write_end(vdata); - __arch_sync_vdso_data(vdata); + __arch_sync_vdso_time_data(vdata); timekeeper_unlock_irqrestore(flags); } |
