diff options
| -rw-r--r-- | drivers/misc/ntsync.c | 3 | ||||
| -rw-r--r-- | drivers/power/supply/charger-manager.c | 16 | ||||
| -rw-r--r-- | fs/timerfd.c | 117 | ||||
| -rw-r--r-- | include/linux/alarmtimer.h | 9 | ||||
| -rw-r--r-- | include/linux/clocksource.h | 12 | ||||
| -rw-r--r-- | include/linux/delay.h | 2 | ||||
| -rw-r--r-- | include/linux/hrtimer.h | 24 | ||||
| -rw-r--r-- | include/trace/events/timer.h | 13 | ||||
| -rw-r--r-- | include/trace/events/timer_migration.h | 24 | ||||
| -rw-r--r-- | kernel/time/alarmtimer.c | 72 | ||||
| -rw-r--r-- | kernel/time/clocksource.c | 9 | ||||
| -rw-r--r-- | kernel/time/hrtimer.c | 152 | ||||
| -rw-r--r-- | kernel/time/jiffies.c | 11 | ||||
| -rw-r--r-- | kernel/time/namespace.c | 2 | ||||
| -rw-r--r-- | kernel/time/posix-cpu-timers.c | 19 | ||||
| -rw-r--r-- | kernel/time/posix-timers.c | 35 | ||||
| -rw-r--r-- | kernel/time/posix-timers.h | 4 | ||||
| -rw-r--r-- | kernel/time/tick-sched.c | 3 | ||||
| -rw-r--r-- | kernel/time/timer.c | 2 | ||||
| -rw-r--r-- | kernel/time/timer_migration.c | 241 | ||||
| -rw-r--r-- | kernel/time/timer_migration.h | 36 | ||||
| -rw-r--r-- | net/netfilter/xt_IDLETIMER.c | 24 | ||||
| -rwxr-xr-x | scripts/timer_migration_tree.py | 122 | ||||
| -rw-r--r-- | tools/testing/selftests/timers/posix_timers.c | 55 |
24 files changed, 717 insertions, 290 deletions
diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c index 30af282262ef..02c9d1192812 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -19,6 +19,7 @@ #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/time_namespace.h> #include <uapi/linux/ntsync.h> #define NTSYNC_NAME "ntsync" @@ -836,6 +837,8 @@ static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_ar if (args->flags & NTSYNC_WAIT_REALTIME) clock = CLOCK_REALTIME; + else + timeout = timens_ktime_to_host(clock, timeout); do { if (signal_pending(current)) { diff --git a/drivers/power/supply/charger-manager.c b/drivers/power/supply/charger-manager.c index c49e0e4d02f7..1b0239c59114 100644 --- a/drivers/power/supply/charger-manager.c +++ b/drivers/power/supply/charger-manager.c @@ -881,26 +881,22 @@ static bool cm_setup_timer(void) mutex_unlock(&cm_list_mtx); if (timer_req && cm_timer) { - ktime_t now, add; - /* * Set alarm with the polling interval (wakeup_ms) * The alarm time should be NOW + CM_RTC_SMALL or later. */ - if (wakeup_ms == UINT_MAX || - wakeup_ms < CM_RTC_SMALL * MSEC_PER_SEC) + if (wakeup_ms == UINT_MAX || wakeup_ms < CM_RTC_SMALL * MSEC_PER_SEC) wakeup_ms = 2 * CM_RTC_SMALL * MSEC_PER_SEC; pr_info("Charger Manager wakeup timer: %u ms\n", wakeup_ms); - now = ktime_get_boottime(); - add = ktime_set(wakeup_ms / MSEC_PER_SEC, - (wakeup_ms % MSEC_PER_SEC) * NSEC_PER_MSEC); - alarm_start(cm_timer, ktime_add(now, add)); - cm_suspend_duration_ms = wakeup_ms; - return true; + /* + * The timer should always be queued as the timeout is at least + * two seconds out. Handle it correctly nevertheless. 
+ */ + return alarm_start_timer(cm_timer, ktime_add_ms(0, wakeup_ms), true); } return false; } diff --git a/fs/timerfd.c b/fs/timerfd.c index 73104f36bcae..fe845af0b74e 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -55,6 +55,15 @@ static inline bool isalarm(struct timerfd_ctx *ctx) ctx->clockid == CLOCK_BOOTTIME_ALARM; } +static void __timerfd_triggered(struct timerfd_ctx *ctx) +{ + lockdep_assert_held(&ctx->wqh.lock); + + ctx->expired = 1; + ctx->ticks++; + wake_up_locked_poll(&ctx->wqh, EPOLLIN); +} + /* * This gets called when the timer event triggers. We set the "expired" * flag, but we do not re-arm the timer (in case it's necessary, @@ -62,13 +71,8 @@ static inline bool isalarm(struct timerfd_ctx *ctx) */ static void timerfd_triggered(struct timerfd_ctx *ctx) { - unsigned long flags; - - spin_lock_irqsave(&ctx->wqh.lock, flags); - ctx->expired = 1; - ctx->ticks++; - wake_up_locked_poll(&ctx->wqh, EPOLLIN); - spin_unlock_irqrestore(&ctx->wqh.lock, flags); + guard(spinlock_irqsave)(&ctx->wqh.lock); + __timerfd_triggered(ctx); } static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) @@ -184,15 +188,54 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) return remaining < 0 ? 0: remaining; } +static void timerfd_alarm_start(struct timerfd_ctx *ctx, ktime_t exp, bool relative) +{ + /* Start the timer. If it's expired already, handle the callback. */ + if (!alarm_start_timer(&ctx->t.alarm, exp, relative)) + __timerfd_triggered(ctx); +} + +static u64 timerfd_alarm_restart(struct timerfd_ctx *ctx) +{ + /* -1 to account for ctx->ticks++ in __timerfd_triggered() */ + u64 ticks = alarm_forward_now(&ctx->t.alarm, ctx->tintv) - 1; + + timerfd_alarm_start(ctx, alarm_get_expires(&ctx->t.alarm), false); + return ticks; +} + +static void timerfd_hrtimer_start(struct timerfd_ctx *ctx, ktime_t exp, + const enum hrtimer_mode mode) +{ + /* Start the timer. If it's expired already, handle the callback. 
*/ + if (!hrtimer_start_range_ns_user(&ctx->t.tmr, exp, 0, mode)) + __timerfd_triggered(ctx); +} + +static u64 timerfd_hrtimer_restart(struct timerfd_ctx *ctx) +{ + /* -1 to account for ctx->ticks++ in __timerfd_triggered() */ + u64 ticks = hrtimer_forward_now(&ctx->t.tmr, ctx->tintv) - 1; + + timerfd_hrtimer_start(ctx, hrtimer_get_expires(&ctx->t.tmr), HRTIMER_MODE_ABS); + return ticks; +} + +static u64 timerfd_restart(struct timerfd_ctx *ctx) +{ + if (isalarm(ctx)) + return timerfd_alarm_restart(ctx); + return timerfd_hrtimer_restart(ctx); +} + static int timerfd_setup(struct timerfd_ctx *ctx, int flags, const struct itimerspec64 *ktmr) { + int clockid = ctx->clockid; enum hrtimer_mode htmode; ktime_t texp; - int clockid = ctx->clockid; - htmode = (flags & TFD_TIMER_ABSTIME) ? - HRTIMER_MODE_ABS: HRTIMER_MODE_REL; + htmode = (flags & TFD_TIMER_ABSTIME) ? HRTIMER_MODE_ABS: HRTIMER_MODE_REL; texp = timespec64_to_ktime(ktmr->it_value); ctx->expired = 0; @@ -206,20 +249,15 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, timerfd_alarmproc); } else { hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, htmode); - hrtimer_set_expires(&ctx->t.tmr, texp); } if (texp != 0) { if (flags & TFD_TIMER_ABSTIME) texp = timens_ktime_to_host(clockid, texp); - if (isalarm(ctx)) { - if (flags & TFD_TIMER_ABSTIME) - alarm_start(&ctx->t.alarm, texp); - else - alarm_start_relative(&ctx->t.alarm, texp); - } else { - hrtimer_start(&ctx->t.tmr, texp, htmode); - } + if (isalarm(ctx)) + timerfd_alarm_start(ctx, texp, !(flags & TFD_TIMER_ABSTIME)); + else + timerfd_hrtimer_start(ctx, texp, htmode); if (timerfd_canceled(ctx)) return -ECANCELED; @@ -287,27 +325,19 @@ static ssize_t timerfd_read_iter(struct kiocb *iocb, struct iov_iter *to) } if (ctx->ticks) { - ticks = ctx->ticks; + unsigned int expired = ctx->expired; - if (ctx->expired && ctx->tintv) { - /* - * If tintv != 0, this is a periodic timer that - * needs to be re-armed. 
We avoid doing it in the timer - * callback to avoid DoS attacks specifying a very - * short timer period. - */ - if (isalarm(ctx)) { - ticks += alarm_forward_now( - &ctx->t.alarm, ctx->tintv) - 1; - alarm_restart(&ctx->t.alarm); - } else { - ticks += hrtimer_forward_now(&ctx->t.tmr, - ctx->tintv) - 1; - hrtimer_restart(&ctx->t.tmr); - } - } + ticks = ctx->ticks; ctx->expired = 0; ctx->ticks = 0; + + /* + * If tintv != 0, this is a periodic timer that needs to be + * re-armed. We avoid doing it in the timer callback to avoid + * DoS attacks specifying a very short timer period. + */ + if (expired && ctx->tintv) + ticks += timerfd_restart(ctx); } spin_unlock_irq(&ctx->wqh.lock); if (ticks) { @@ -526,18 +556,7 @@ static int do_timerfd_gettime(int ufd, struct itimerspec64 *t) spin_lock_irq(&ctx->wqh.lock); if (ctx->expired && ctx->tintv) { ctx->expired = 0; - - if (isalarm(ctx)) { - ctx->ticks += - alarm_forward_now( - &ctx->t.alarm, ctx->tintv) - 1; - alarm_restart(&ctx->t.alarm); - } else { - ctx->ticks += - hrtimer_forward_now(&ctx->t.tmr, ctx->tintv) - - 1; - hrtimer_restart(&ctx->t.tmr); - } + ctx->ticks += timerfd_restart(ctx); } t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx)); t->it_interval = ktime_to_timespec64(ctx->tintv); diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index 3ffa5341dce2..2014288ca2f4 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -42,11 +42,14 @@ struct alarm { void *data; }; +static __always_inline ktime_t alarm_get_expires(struct alarm *alarm) +{ + return alarm->node.expires; +} + void alarm_init(struct alarm *alarm, enum alarmtimer_type type, void (*function)(struct alarm *, ktime_t)); -void alarm_start(struct alarm *alarm, ktime_t start); -void alarm_start_relative(struct alarm *alarm, ktime_t start); -void alarm_restart(struct alarm *alarm); +bool alarm_start_timer(struct alarm *alarm, ktime_t expires, bool relative); int alarm_try_to_cancel(struct alarm *alarm); int 
alarm_cancel(struct alarm *alarm); diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index c5b34c16602e..b12a6d19aa60 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -239,8 +239,6 @@ __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq); extern int __devm_clocksource_register_scale(struct device *dev, struct clocksource *cs, u32 scale, u32 freq); -extern void -__clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq); /* * Don't call this unless you are a default clocksource @@ -273,16 +271,6 @@ static inline int devm_clocksource_register_khz(struct device *dev, return __devm_clocksource_register_scale(dev, cs, 1000, khz); } -static inline void __clocksource_update_freq_hz(struct clocksource *cs, u32 hz) -{ - __clocksource_update_freq_scale(cs, 1, hz); -} - -static inline void __clocksource_update_freq_khz(struct clocksource *cs, u32 khz) -{ - __clocksource_update_freq_scale(cs, 1000, khz); -} - #ifdef CONFIG_ARCH_CLOCKSOURCE_INIT extern void clocksource_arch_init(struct clocksource *cs); #else diff --git a/include/linux/delay.h b/include/linux/delay.h index 46412c00033a..68b2a69dd24d 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -110,7 +110,7 @@ static const unsigned int max_slack_shift = 2; * fsleep - flexible sleep which autoselects the best mechanism * @usecs: requested sleep duration in microseconds * - * flseep() selects the best mechanism that will provide maximum 25% slack + * fsleep() selects the best mechanism that will provide maximum 25% slack * to the requested sleep duration. 
Therefore it uses: * * * udelay() loop for sleep durations <= 10 microseconds to avoid hrtimer diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 9ced498fefaa..6862dea0acc5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -206,6 +206,9 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 range_ns, const enum hrtimer_mode mode); +extern bool hrtimer_start_range_ns_user(struct hrtimer *timer, ktime_t tim, + u64 range_ns, const enum hrtimer_mode mode); + /** * hrtimer_start - (re)start an hrtimer * @timer: the timer to be added @@ -223,17 +226,28 @@ static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); -static inline void hrtimer_start_expires(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { - u64 delta; ktime_t soft, hard; + u64 delta; + soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); hrtimer_start_range_ns(timer, soft, delta, mode); } +static inline bool hrtimer_start_expires_user(struct hrtimer *timer, enum hrtimer_mode mode) +{ + ktime_t soft, hard; + u64 delta; + + soft = hrtimer_get_softexpires(timer); + hard = hrtimer_get_expires(timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + return hrtimer_start_range_ns_user(timer, soft, delta, mode); +} + void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode); @@ -254,8 +268,8 @@ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) return __hrtimer_get_remaining(timer, false); } -extern u64 hrtimer_get_next_event(void); -extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); +extern ktime_t hrtimer_get_next_event(void); +extern ktime_t 
hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 07cbb9836b91..ca82fd62dc30 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -299,6 +299,19 @@ DECLARE_EVENT_CLASS(hrtimer_class, ); /** + * hrtimer_start_expired - Invoked when an expired timer was started + * @hrtimer: pointer to struct hrtimer + * + * Preceded by a hrtimer_start tracepoint. + */ +DEFINE_EVENT(hrtimer_class, hrtimer_start_expired, + + TP_PROTO(struct hrtimer *hrtimer), + + TP_ARGS(hrtimer) +); + +/** * hrtimer_expire_exit - called immediately after the hrtimer callback returns * @hrtimer: pointer to struct hrtimer * diff --git a/include/trace/events/timer_migration.h b/include/trace/events/timer_migration.h index 61171b13c687..0b135e9301b1 100644 --- a/include/trace/events/timer_migration.h +++ b/include/trace/events/timer_migration.h @@ -33,15 +33,16 @@ TRACE_EVENT(tmigr_group_set, TRACE_EVENT(tmigr_connect_child_parent, - TP_PROTO(struct tmigr_group *child), + TP_PROTO(struct tmigr_hierarchy *hier, struct tmigr_group *child), - TP_ARGS(child), + TP_ARGS(hier, child), TP_STRUCT__entry( __field( void *, child ) __field( void *, parent ) __field( unsigned int, lvl ) __field( unsigned int, numa_node ) + __field( unsigned int, capacity ) __field( unsigned int, num_children ) __field( u32, groupmask ) ), @@ -51,26 +52,28 @@ TRACE_EVENT(tmigr_connect_child_parent, __entry->parent = child->parent; __entry->lvl = child->parent->level; __entry->numa_node = child->parent->numa_node; + __entry->capacity = hier->capacity; __entry->num_children = child->parent->num_children; __entry->groupmask = child->groupmask; ), - TP_printk("group=%p groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d", - __entry->child, __entry->groupmask, __entry->parent, - __entry->lvl, __entry->numa_node, __entry->num_children) + TP_printk("group=%p
groupmask=%0x parent=%p lvl=%d numa=%d capacity=%d num_children=%d", + __entry->child, __entry->groupmask, __entry->parent, __entry->lvl, + __entry->numa_node, __entry->capacity, __entry->num_children) ); TRACE_EVENT(tmigr_connect_cpu_parent, - TP_PROTO(struct tmigr_cpu *tmc), + TP_PROTO(struct tmigr_hierarchy *hier, struct tmigr_cpu *tmc), - TP_ARGS(tmc), + TP_ARGS(hier, tmc), TP_STRUCT__entry( __field( void *, parent ) __field( unsigned int, cpu ) __field( unsigned int, lvl ) __field( unsigned int, numa_node ) + __field( unsigned int, capacity ) __field( unsigned int, num_children ) __field( u32, groupmask ) ), @@ -80,13 +83,14 @@ TRACE_EVENT(tmigr_connect_cpu_parent, __entry->cpu = tmc->cpuevt.cpu; __entry->lvl = tmc->tmgroup->level; __entry->numa_node = tmc->tmgroup->numa_node; + __entry->capacity = hier->capacity; __entry->num_children = tmc->tmgroup->num_children; __entry->groupmask = tmc->groupmask; ), - TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d num_children=%d", - __entry->cpu, __entry->groupmask, __entry->parent, - __entry->lvl, __entry->numa_node, __entry->num_children) + TP_printk("cpu=%d groupmask=%0x parent=%p lvl=%d numa=%d capacity=%d num_children=%d", + __entry->cpu, __entry->groupmask, __entry->parent, __entry->lvl, + __entry->numa_node, __entry->capacity, __entry->num_children) ); DECLARE_EVENT_CLASS(tmigr_group_and_cpu, diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 6e173d70d825..ea5be5870e51 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -337,48 +337,32 @@ void alarm_init(struct alarm *alarm, enum alarmtimer_type type, EXPORT_SYMBOL_GPL(alarm_init); /** - * alarm_start - Sets an absolute alarm to fire - * @alarm: ptr to alarm to set - * @start: time to run the alarm + * alarm_start_timer - Sets an alarm to fire + * @alarm: Pointer to alarm to set + * @expires: Expiry time + * @relative: True if @expires is relative + * + * Returns: True if the alarm was queued. 
False if it already expired */ -void alarm_start(struct alarm *alarm, ktime_t start) +bool alarm_start_timer(struct alarm *alarm, ktime_t expires, bool relative) { struct alarm_base *base = &alarm_bases[alarm->type]; - scoped_guard(spinlock_irqsave, &base->lock) { - alarm->node.expires = start; - alarmtimer_enqueue(base, alarm); - hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); - } + if (relative) + expires = ktime_add_safe(expires, base->get_ktime()); trace_alarmtimer_start(alarm, base->get_ktime()); -} -EXPORT_SYMBOL_GPL(alarm_start); - -/** - * alarm_start_relative - Sets a relative alarm to fire - * @alarm: ptr to alarm to set - * @start: time relative to now to run the alarm - */ -void alarm_start_relative(struct alarm *alarm, ktime_t start) -{ - struct alarm_base *base = &alarm_bases[alarm->type]; - - start = ktime_add_safe(start, base->get_ktime()); - alarm_start(alarm, start); -} -EXPORT_SYMBOL_GPL(alarm_start_relative); - -void alarm_restart(struct alarm *alarm) -{ - struct alarm_base *base = &alarm_bases[alarm->type]; guard(spinlock_irqsave)(&base->lock); - hrtimer_set_expires(&alarm->timer, alarm->node.expires); - hrtimer_restart(&alarm->timer); + alarm->node.expires = expires; alarmtimer_enqueue(base, alarm); + if (!hrtimer_start_range_ns_user(&alarm->timer, expires, 0, HRTIMER_MODE_ABS)) { + alarmtimer_dequeue(base, alarm); + return false; + } + return true; } -EXPORT_SYMBOL_GPL(alarm_restart); +EXPORT_SYMBOL_GPL(alarm_start_timer); /** * alarm_try_to_cancel - Tries to cancel an alarm timer @@ -512,8 +496,6 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) * @now: time at the timer expiration * * Posix timer callback for expired alarm timers. 
- * - * Return: whether the timer is to be restarted */ static void alarm_handle_timer(struct alarm *alarm, ktime_t now) { @@ -527,12 +509,12 @@ static void alarm_handle_timer(struct alarm *alarm, ktime_t now) * alarm_timer_rearm - Posix timer callback for rearming timer * @timr: Pointer to the posixtimer data struct */ -static void alarm_timer_rearm(struct k_itimer *timr) +static bool alarm_timer_rearm(struct k_itimer *timr) { struct alarm *alarm = &timr->it.alarm.alarmtimer; timr->it_overrun += alarm_forward_now(alarm, timr->it_interval); - alarm_start(alarm, alarm->node.expires); + return alarm_start_timer(alarm, alarm->node.expires, false); } /** @@ -588,7 +570,7 @@ static void alarm_timer_wait_running(struct k_itimer *timr) * @absolute: Expiry value is absolute time * @sigev_none: Posix timer does not deliver signals */ -static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, +static bool alarm_timer_arm(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none) { struct alarm *alarm = &timr->it.alarm.alarmtimer; @@ -596,10 +578,16 @@ static void alarm_timer_arm(struct k_itimer *timr, ktime_t expires, if (!absolute) expires = ktime_add_safe(expires, base->get_ktime()); - if (sigev_none) + + /* + * sigev_none needs to update the expires value and pretend + * that the timer is queued + */ + if (sigev_none) { alarm->node.expires = expires; - else - alarm_start(&timr->it.alarm.alarmtimer, expires); + return true; + } + return alarm_start_timer(&timr->it.alarm.alarmtimer, expires, false); } /** @@ -706,7 +694,9 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp, alarm->data = (void *)current; do { set_current_state(TASK_INTERRUPTIBLE); - alarm_start(alarm, absexp); + if (!alarm_start_timer(alarm, absexp, false)) + alarm->data = NULL; + if (likely(alarm->data)) schedule(); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 313f6c88148e..e48c4d379a7c 100644 --- a/kernel/time/clocksource.c +++ 
b/kernel/time/clocksource.c @@ -1222,14 +1222,8 @@ static void clocksource_enqueue(struct clocksource *cs) * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale - * - * This should only be called from the clocksource->enable() method. - * - * This *SHOULD NOT* be called directly! Please use the - * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper - * functions. */ -void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) +static void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; @@ -1287,7 +1281,6 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } -EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); /** * __clocksource_register_scale - Used to install new clocksources diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 5bd6efe598f0..638ce623c342 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1352,8 +1352,14 @@ static inline bool hrtimer_keep_base(struct hrtimer *timer, bool is_local, bool return hrtimer_prefer_local(is_local, is_first, is_pinned); } -static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, - const enum hrtimer_mode mode, struct hrtimer_clock_base *base) +enum { + HRTIMER_REPROGRAM_NONE, + HRTIMER_REPROGRAM, + HRTIMER_REPROGRAM_FORCE, +}; + +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, + const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); bool is_pinned, first, was_first, keep_base = false; @@ -1410,7 +1416,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del /* If a 
deferred rearm is pending skip reprogramming the device */ if (cpu_base->deferred_rearm) { cpu_base->deferred_needs_update = true; - return false; + return HRTIMER_REPROGRAM_NONE; } if (!was_first || cpu_base != this_cpu_base) { @@ -1423,7 +1429,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del * callbacks. */ if (likely(hrtimer_base_is_online(this_cpu_base))) - return first; + return first ? HRTIMER_REPROGRAM : HRTIMER_REPROGRAM_NONE; /* * Timer was enqueued remote because the current base is @@ -1432,7 +1438,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del */ if (first) smp_call_function_single_async(cpu_base->cpu, &cpu_base->csd); - return false; + return HRTIMER_REPROGRAM_NONE; } /* @@ -1446,7 +1452,7 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del */ if (timer->is_lazy) { if (cpu_base->expires_next <= hrtimer_get_expires(timer)) - return false; + return HRTIMER_REPROGRAM_NONE; } /* @@ -1455,8 +1461,24 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del * reprogram the hardware by evaluating the new first expiring * timer. */ - hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); - return false; + return HRTIMER_REPROGRAM_FORCE; +} + +static int hrtimer_start_range_ns_common(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode, + struct hrtimer_clock_base *base) +{ + /* + * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft + * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard + * expiry mode because unmarked timers are moved to softirq expiry. 
+ */ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + else + WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); + + return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, base); } /** @@ -1476,24 +1498,104 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, debug_hrtimer_assert_init(timer); + base = lock_hrtimer_base(timer, &flags); + + switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) { + case HRTIMER_REPROGRAM: + hrtimer_reprogram(timer, true); + break; + case HRTIMER_REPROGRAM_FORCE: + hrtimer_force_reprogram(timer->base->cpu_base, 1); + break; + case HRTIMER_REPROGRAM_NONE: + break; + } + + unlock_hrtimer_base(timer, &flags); +} +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); + +static inline bool hrtimer_check_user_timer(struct hrtimer *timer) +{ + struct hrtimer_cpu_base *cpu_base = timer->base->cpu_base; + ktime_t expires; + /* - * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft - * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard - * expiry mode because unmarked timers are moved to softirq expiry. + * This uses soft expires because that's the user provided + * expiry time, while expires can be further in the past + * due to a slack value added to the user expiry time. */ - if (!IS_ENABLED(CONFIG_PREEMPT_RT)) - WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); - else - WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); + expires = hrtimer_get_softexpires(timer); + + /* Convert to monotonic */ + expires = ktime_sub(expires, timer->base->offset); + + /* + * Check whether this timer will end up as the first expiring timer in + * the CPU base. If not, no further checks required as it's then + * guaranteed to expire in the future. + */ + if (expires >= cpu_base->expires_next) + return true; + + /* Validate that the expiry time is in the future. 
*/ + if (expires > ktime_get()) + return true; + + debug_hrtimer_deactivate(timer); + __remove_hrtimer(timer, timer->base, HRTIMER_STATE_INACTIVE, false); + trace_hrtimer_start_expired(timer); + return false; +} + +/** + * hrtimer_start_range_ns_user - (re)start a user controlled hrtimer + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); + * softirq based mode is considered for debug purpose only! + * + * Returns: True when the timer was queued, false if it was already expired + * + * This function cannot invoke the timer callback for expired timers as it might + * be called under a lock which the timer callback needs to acquire. So the + * caller has to handle that case. + */ +bool hrtimer_start_range_ns_user(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + bool ret = true; + + debug_hrtimer_assert_init(timer); base = lock_hrtimer_base(timer, &flags); - if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) - hrtimer_reprogram(timer, true); + switch (hrtimer_start_range_ns_common(timer, tim, delta_ns, mode, base)) { + case HRTIMER_REPROGRAM: + ret = hrtimer_check_user_timer(timer); + if (ret) + hrtimer_reprogram(timer, true); + break; + case HRTIMER_REPROGRAM_FORCE: + ret = hrtimer_check_user_timer(timer); + /* + * The base must always be reevaluated, independent of the + * result above because the timer was the first pending timer.
+ */ + hrtimer_force_reprogram(timer->base->cpu_base, 1); + break; + case HRTIMER_REPROGRAM_NONE: + break; + } unlock_hrtimer_base(timer, &flags); + return ret; } -EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns_user); /** * hrtimer_try_to_cancel - try to deactivate a timer @@ -1681,10 +1783,10 @@ EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ -u64 hrtimer_get_next_event(void) +ktime_t hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - u64 expires = KTIME_MAX; + ktime_t expires = KTIME_MAX; guard(raw_spinlock_irqsave)(&cpu_base->lock); if (!hrtimer_hres_active(cpu_base)) @@ -1700,10 +1802,10 @@ u64 hrtimer_get_next_event(void) * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ -u64 hrtimer_next_event_without(const struct hrtimer *exclude) +ktime_t hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - u64 expires = KTIME_MAX; + ktime_t expires = KTIME_MAX; unsigned int active; guard(raw_spinlock_irqsave)(&cpu_base->lock); @@ -2213,7 +2315,11 @@ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) mode |= HRTIMER_MODE_HARD; - hrtimer_start_expires(&sl->timer, mode); + /* If already expired, clear the task pointer and set current state to running */ + if (!hrtimer_start_expires_user(&sl->timer, mode)) { + sl->task = NULL; + __set_current_state(TASK_RUNNING); + } } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 1c954f330dfe..d51428867a33 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -60,15 +60,14 @@ EXPORT_SYMBOL(get_jiffies_64); EXPORT_SYMBOL(jiffies); -static int __init init_jiffies_clocksource(void) -{ - 
return __clocksource_register(&clocksource_jiffies); -} - -core_initcall(init_jiffies_clocksource); +static bool cs_jiffies_registered __initdata; struct clocksource * __init __weak clocksource_default_clock(void) { + if (!cs_jiffies_registered) { + __clocksource_register(&clocksource_jiffies); + cs_jiffies_registered = true; + } return &clocksource_jiffies; } diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 4bca3f78c8ea..5fa0af66cf3f 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -57,6 +57,7 @@ ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim, return tim; } +EXPORT_SYMBOL_GPL(do_timens_ktime_to_host); static struct ucounts *inc_time_namespaces(struct user_namespace *ns) { @@ -351,6 +352,7 @@ struct time_namespace init_time_ns = { .user_ns = &init_user_ns, .frozen_offsets = true, }; +EXPORT_SYMBOL_GPL(init_time_ns); void __init time_ns_init(void) { diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 0de2bb7cbec0..74775b94d11b 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -19,7 +19,7 @@ #include "posix-timers.h" -static void posix_cpu_timer_rearm(struct k_itimer *timer); +static bool posix_cpu_timer_rearm(struct k_itimer *timer); void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { @@ -1011,24 +1011,27 @@ static void check_process_timers(struct task_struct *tsk, /* * This is called from the signal code (via posixtimer_rearm) * when the last timer signal was delivered and we have to reload the timer. + * + * Return true unconditionally so the core code assumes the timer to be + * armed. Otherwise it would requeue the signal. 
*/ -static void posix_cpu_timer_rearm(struct k_itimer *timer) +static bool posix_cpu_timer_rearm(struct k_itimer *timer) { clockid_t clkid = CPUCLOCK_WHICH(timer->it_clock); - struct task_struct *p; struct sighand_struct *sighand; + struct task_struct *p; unsigned long flags; u64 now; - rcu_read_lock(); + guard(rcu)(); p = cpu_timer_task_rcu(timer); if (!p) - goto out; + return true; /* Protect timer list r/w in arm_timer() */ sighand = lock_task_sighand(p, &flags); if (unlikely(sighand == NULL)) - goto out; + return true; /* * Fetch the current sample and update the timer's expiry time. @@ -1045,8 +1048,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer) */ arm_timer(timer, p); unlock_task_sighand(p, &flags); -out: - rcu_read_unlock(); + return true; } /** @@ -1504,6 +1506,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, spin_lock_irq(&timer.it_lock); error = posix_cpu_timer_set(&timer, flags, &it, NULL); if (error) { + posix_cpu_timer_del(&timer); spin_unlock_irq(&timer.it_lock); return error; } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 9331e1614124..436ba794cc0b 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -288,16 +288,18 @@ static inline int timer_overrun_to_int(struct k_itimer *timr) return (int)timr->it_overrun_last; } -static void common_hrtimer_rearm(struct k_itimer *timr) +static bool common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; timr->it_overrun += hrtimer_forward_now(timer, timr->it_interval); - hrtimer_restart(timer); + return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS); } static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_itimer *timr) { + bool queued; + guard(spinlock)(&timr->it_lock); /* @@ -311,12 +313,18 @@ static bool __posixtimer_deliver_signal(struct kernel_siginfo *info, struct k_it if (!timr->it_interval || WARN_ON_ONCE(timr->it_status != 
POSIX_TIMER_REQUEUE_PENDING)) return true; - timr->kclock->timer_rearm(timr); - timr->it_status = POSIX_TIMER_ARMED; + /* timer_rearm() updates timr::it_overrun */ + queued = timr->kclock->timer_rearm(timr); + timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1LL; ++timr->it_signal_seq; info->si_overrun = timer_overrun_to_int(timr); + + if (queued) + timr->it_status = POSIX_TIMER_ARMED; + else + posix_timer_queue_signal(timr); return true; } @@ -795,7 +803,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) return timer_overrun_to_int(scoped_timer); } -static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, +static bool common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none) { struct hrtimer *timer = &timr->it.real.timer; @@ -820,8 +828,11 @@ static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, expires = ktime_add_safe(expires, hrtimer_cb_get_time(timer)); hrtimer_set_expires(timer, expires); - if (!sigev_none) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + /* For sigev_none pretend that the timer is queued */ + if (sigev_none) + return true; + + return hrtimer_start_expires_user(timer, HRTIMER_MODE_ABS); } static int common_hrtimer_try_to_cancel(struct k_itimer *timr) @@ -903,9 +914,13 @@ int common_timer_set(struct k_itimer *timr, int flags, expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; - kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); - if (!sigev_none) - timr->it_status = POSIX_TIMER_ARMED; + if (kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none)) { + if (!sigev_none) + timr->it_status = POSIX_TIMER_ARMED; + } else { + /* Timer was already expired, queue the signal */ + posix_timer_queue_signal(timr); + } return 0; } diff --git a/kernel/time/posix-timers.h b/kernel/time/posix-timers.h index 7f259e845d24..4ea9611dd716 100644 --- a/kernel/time/posix-timers.h +++ 
b/kernel/time/posix-timers.h @@ -27,11 +27,11 @@ struct k_clock { int (*timer_del)(struct k_itimer *timr); void (*timer_get)(struct k_itimer *timr, struct itimerspec64 *cur_setting); - void (*timer_rearm)(struct k_itimer *timr); + bool (*timer_rearm)(struct k_itimer *timr); s64 (*timer_forward)(struct k_itimer *timr, ktime_t now); ktime_t (*timer_remaining)(struct k_itimer *timr, ktime_t now); int (*timer_try_to_cancel)(struct k_itimer *timr); - void (*timer_arm)(struct k_itimer *timr, ktime_t expires, + bool (*timer_arm)(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none); void (*timer_wait_running)(struct k_itimer *timr); }; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cbbb87a0c6e7..3026a301dff7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1407,8 +1407,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ - next_event = min_t(u64, next_event, - hrtimer_next_event_without(&ts->sched_timer)); + next_event = min(next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 04d928c21aba..655a8c6cd84d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1932,7 +1932,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) */ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { - u64 nextevt = hrtimer_get_next_event(); + u64 nextevt = ktime_to_ns(hrtimer_get_next_event()); /* * If high resolution timers are enabled diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 52c15affdbff..806c23cf71fc 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -102,7 +102,7 @@ * active CPU/group information atomic_try_cmpxchg() is used instead and only * the per CPU tmigr_cpu->lock is held. 
* - * During the setup of groups tmigr_level_list is required. It is protected by + * During the setup of groups, hier->level_list is required. It is protected by * @tmigr_mutex. * * When @timer_base->lock as well as tmigr related locks are required, the lock @@ -416,13 +416,12 @@ */ static DEFINE_MUTEX(tmigr_mutex); -static struct list_head *tmigr_level_list __read_mostly; + +static LIST_HEAD(tmigr_hierarchy_list); static unsigned int tmigr_hierarchy_levels __read_mostly; static unsigned int tmigr_crossnode_level __read_mostly; -static struct tmigr_group *tmigr_root; - static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu); /* @@ -1469,6 +1468,34 @@ static long tmigr_trigger_active(void *unused) return 0; } +static unsigned int tmigr_get_capacity(int cpu) +{ + /* + * nohz_full CPUs need to make sure there is always an available (online) + * and never idle migrator to handle all their global timers. That duty + * is served by the timekeeper which then never stops its tick. But the + * timekeeper must then belong to the same hierarchy as all the nohz_full + * CPUs. Simply turn off capacity awareness when nohz_full is running. 
+ */ + if (tick_nohz_full_enabled() || !IS_ENABLED(CONFIG_BROKEN)) + return SCHED_CAPACITY_SCALE; + else + return arch_scale_cpu_capacity(cpu); +} + +static struct tmigr_hierarchy *__tmigr_get_hierarchy(int cpu) +{ + unsigned int capacity = tmigr_get_capacity(cpu); + struct tmigr_hierarchy *iter; + + list_for_each_entry(iter, &tmigr_hierarchy_list, node) { + if (iter->capacity == capacity) + return iter; + } + + return NULL; +} + static int tmigr_clear_cpu_available(unsigned int cpu) { struct tmigr_cpu *tmc = this_cpu_ptr(&tmigr_cpu); @@ -1493,8 +1520,21 @@ static int tmigr_clear_cpu_available(unsigned int cpu) } if (firstexp != KTIME_MAX) { - migrator = cpumask_any(tmigr_available_cpumask); - work_on_cpu(migrator, tmigr_trigger_active, NULL); + struct tmigr_hierarchy *hier = __tmigr_get_hierarchy(cpu); + + if (WARN_ON_ONCE(!hier)) + return -EINVAL; + + migrator = cpumask_any_and(tmigr_available_cpumask, hier->cpumask); + if (migrator < nr_cpu_ids) { + work_on_cpu(migrator, tmigr_trigger_active, NULL); + } else { + /* + * If deactivation returned an expiration, it belongs to an available + * nohz CPU in the hierarchy. + */ + WARN_ONCE(1, "Expected available CPU in the hierarchy\n"); + } } return 0; @@ -1657,14 +1697,14 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, group->groupevt.ignore = true; } -static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) +static struct tmigr_group *tmigr_get_group(struct tmigr_hierarchy *hier, int node, unsigned int lvl) { struct tmigr_group *tmp, *group = NULL; lockdep_assert_held(&tmigr_mutex); /* Try to attach to an existing group first */ - list_for_each_entry(tmp, &tmigr_level_list[lvl], list) { + list_for_each_entry(tmp, &hier->level_list[lvl], list) { /* * If @lvl is below the cross NUMA node level, check whether * this group belongs to the same NUMA node. 
@@ -1698,14 +1738,14 @@ static struct tmigr_group *tmigr_get_group(int node, unsigned int lvl) tmigr_init_group(group, lvl, node); /* Setup successful. Add it to the hierarchy */ - list_add(&group->list, &tmigr_level_list[lvl]); + list_add(&group->list, &hier->level_list[lvl]); trace_tmigr_group_set(group); return group; } -static bool tmigr_init_root(struct tmigr_group *group, bool activate) +static bool tmigr_init_root(struct tmigr_hierarchy *hier, struct tmigr_group *group, bool activate) { - if (!group->parent && group != tmigr_root) { + if (!group->parent && group != hier->root) { /* * This is the new top-level, prepare its groupmask in advance * to avoid accidents where yet another new top-level is @@ -1721,11 +1761,10 @@ static bool tmigr_init_root(struct tmigr_group *group, bool activate) } -static void tmigr_connect_child_parent(struct tmigr_group *child, - struct tmigr_group *parent, - bool activate) +static void tmigr_connect_child_parent(struct tmigr_hierarchy *hier, struct tmigr_group *child, + struct tmigr_group *parent, bool activate) { - if (tmigr_init_root(parent, activate)) { + if (tmigr_init_root(hier, parent, activate)) { /* * The previous top level had prepared its groupmask already, * simply account it in advance as the first child. 
If some groups @@ -1758,13 +1797,13 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, */ smp_store_release(&child->parent, parent); - trace_tmigr_connect_child_parent(child); + trace_tmigr_connect_child_parent(hier, child); } -static int tmigr_setup_groups(unsigned int cpu, unsigned int node, - struct tmigr_group *start, bool activate) +static int tmigr_setup_groups(struct tmigr_hierarchy *hier, unsigned int cpu, + unsigned int node, struct tmigr_group *start, bool activate) { - struct tmigr_group *group, *child, **stack; + struct tmigr_group *root = hier->root, *group, *child, **stack; int i, top = 0, err = 0, start_lvl = 0; bool root_mismatch = false; @@ -1777,11 +1816,11 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, start_lvl = start->level + 1; } - if (tmigr_root) - root_mismatch = tmigr_root->numa_node != node; + if (root) + root_mismatch = root->numa_node != node; for (i = start_lvl; i < tmigr_hierarchy_levels; i++) { - group = tmigr_get_group(node, i); + group = tmigr_get_group(hier, node, i); if (IS_ERR(group)) { err = PTR_ERR(group); i--; @@ -1803,7 +1842,7 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, if (group->parent) break; if ((!root_mismatch || i >= tmigr_crossnode_level) && - list_is_singular(&tmigr_level_list[i])) + list_is_singular(&hier->level_list[i])) break; } @@ -1831,15 +1870,15 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, tmc->tmgroup = group; tmc->groupmask = BIT(group->num_children++); - tmigr_init_root(group, activate); + tmigr_init_root(hier, group, activate); - trace_tmigr_connect_cpu_parent(tmc); + trace_tmigr_connect_cpu_parent(hier, tmc); /* There are no children that need to be connected */ continue; } else { child = stack[i - 1]; - tmigr_connect_child_parent(child, group, activate); + tmigr_connect_child_parent(hier, child, group, activate); } } @@ -1895,18 +1934,23 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node, 
data.childmask = start->groupmask; __walk_groups_from(tmigr_active_up, &data, start, start->parent); } + } else if (start) { + union tmigr_state state; + + /* Remote activation assumes the whole target's hierarchy is inactive */ + state.state = atomic_read(&start->migr_state); + WARN_ON_ONCE(state.active); } /* Root update */ - if (list_is_singular(&tmigr_level_list[top])) { - group = list_first_entry(&tmigr_level_list[top], - typeof(*group), list); + if (list_is_singular(&hier->level_list[top])) { + group = list_first_entry(&hier->level_list[top], typeof(*group), list); WARN_ON_ONCE(group->parent); - if (tmigr_root) { + if (root) { /* Old root should be the same or below */ - WARN_ON_ONCE(tmigr_root->level > top); + WARN_ON_ONCE(root->level > top); } - tmigr_root = group; + hier->root = group; } out: kfree(stack); @@ -1914,34 +1958,123 @@ out: return err; } +static struct tmigr_hierarchy *tmigr_get_hierarchy(int cpu) +{ + struct tmigr_hierarchy *hier; + + hier = __tmigr_get_hierarchy(cpu); + + if (hier) + return hier; + + hier = kzalloc_flex(*hier, level_list, tmigr_hierarchy_levels); + if (!hier) + return ERR_PTR(-ENOMEM); + + hier->cpumask = kzalloc(cpumask_size(), GFP_KERNEL); + if (!hier->cpumask) { + kfree(hier); + return ERR_PTR(-ENOMEM); + } + + for (int i = 0; i < tmigr_hierarchy_levels; i++) + INIT_LIST_HEAD(&hier->level_list[i]); + + hier->capacity = tmigr_get_capacity(cpu); + list_add_tail(&hier->node, &tmigr_hierarchy_list); + + return hier; +} + +static int tmigr_connect_old_root(struct tmigr_hierarchy *hier, int cpu, + struct tmigr_group *old_root, bool activate) +{ + /* + * The target CPU must never do the prepare work, except + * on early boot when the boot CPU is the target. Otherwise + * it may spuriously activate the old top level group inside + * the new one (nevertheless whether old top level group is + * active or not) and/or release an uninitialized childmask. 
+ */ + WARN_ON_ONCE(cpu == smp_processor_id()); + if (activate) { + /* + * The current CPU is expected to be online in the hierarchy, + * otherwise the old root may not be active as expected. + */ + WARN_ON_ONCE(!__this_cpu_read(tmigr_cpu.available)); + } + + return tmigr_setup_groups(hier, -1, old_root->numa_node, old_root, activate); +} + +static long connect_old_root_work(void *arg) +{ + struct tmigr_group *old_root = arg; + struct tmigr_hierarchy *hier; + int cpu = smp_processor_id(); + + hier = __tmigr_get_hierarchy(cpu); + if (WARN_ON_ONCE(!hier)) + return -EINVAL; + + return tmigr_connect_old_root(hier, cpu, old_root, true); +} + static int tmigr_add_cpu(unsigned int cpu) { - struct tmigr_group *old_root = tmigr_root; + struct tmigr_hierarchy *hier; + struct tmigr_group *old_root; int node = cpu_to_node(cpu); int ret; guard(mutex)(&tmigr_mutex); - ret = tmigr_setup_groups(cpu, node, NULL, false); + hier = tmigr_get_hierarchy(cpu); + if (IS_ERR(hier)) + return PTR_ERR(hier); + + old_root = hier->root; + + ret = tmigr_setup_groups(hier, cpu, node, NULL, false); + + if (ret < 0) + return ret; /* Root has changed? Connect the old one to the new */ - if (ret >= 0 && old_root && old_root != tmigr_root) { - /* - * The target CPU must never do the prepare work, except - * on early boot when the boot CPU is the target. Otherwise - * it may spuriously activate the old top level group inside - * the new one (nevertheless whether old top level group is - * active or not) and/or release an uninitialized childmask. - */ - WARN_ON_ONCE(cpu == raw_smp_processor_id()); - /* - * The (likely) current CPU is expected to be online in the hierarchy, - * otherwise the old root may not be active as expected. 
- */ - WARN_ON_ONCE(!per_cpu_ptr(&tmigr_cpu, raw_smp_processor_id())->available); - ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true); + if (old_root && old_root != hier->root) { + guard(migrate)(); + + if (cpumask_test_cpu(smp_processor_id(), hier->cpumask)) { + /* + * If the target belongs to the same hierarchy, the old root is expected + * to be active. Link and propagate to the new root. + */ + ret = tmigr_connect_old_root(hier, cpu, old_root, true); + } else { + int target = cpumask_first_and(hier->cpumask, tmigr_available_cpumask); + + if (target < nr_cpu_ids) { + /* + * If the target doesn't belong to the same hierarchy as the current + * CPU, activate from a relevant one to make sure the old root is + * active. + */ + ret = work_on_cpu(target, connect_old_root_work, old_root); + } else { + /* + * No other available CPUs in the remote hierarchy. Link the + * old root remotely but don't propagate activation since the + * old root is not expected to be active. + */ + ret = tmigr_connect_old_root(hier, cpu, old_root, false); + } + } } + if (ret >= 0) + cpumask_set_cpu(cpu, hier->cpumask); + return ret; } @@ -1974,7 +2107,7 @@ static int tmigr_cpu_prepare(unsigned int cpu) static int __init tmigr_init(void) { - unsigned int cpulvl, nodelvl, cpus_per_node, i; + unsigned int cpulvl, nodelvl, cpus_per_node; unsigned int nnodes = num_possible_nodes(); unsigned int ncpus = num_possible_cpus(); int ret = -ENOMEM; @@ -2021,14 +2154,6 @@ static int __init tmigr_init(void) */ tmigr_crossnode_level = cpulvl; - tmigr_level_list = kzalloc_objs(struct list_head, - tmigr_hierarchy_levels); - if (!tmigr_level_list) - goto err; - - for (i = 0; i < tmigr_hierarchy_levels; i++) - INIT_LIST_HEAD(&tmigr_level_list[i]); - pr_info("Timer migration: %d hierarchy levels; %d children per group;" " %d crossnode level\n", tmigr_hierarchy_levels, TMIGR_CHILDREN_PER_GROUP, diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 
70879cde6fdd..31735dd52327 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -6,6 +6,24 @@ #define TMIGR_CHILDREN_PER_GROUP 8 /** + * struct tmigr_hierarchy - a hierarchy associated to a given CPU capacity. + * Homogeneous systems have only one hierarchy. + * Heterogeneous systems have one hierarchy per CPU capacity. + * @cpumask: CPUs belonging to this hierarchy + * @root: The current root of the hierarchy + * @capacity: CPU capacity associated to this hierarchy + * @node: Node in the global hierarchy list + * @level_list: Per level lists of tmigr groups + */ +struct tmigr_hierarchy { + struct cpumask *cpumask; + struct tmigr_group *root; + unsigned long capacity; + struct list_head node; + struct list_head level_list[]; +}; + +/** * struct tmigr_event - a timer event associated to a CPU * @nextevt: The node to enqueue an event in the parent group queue * @cpu: The CPU to which this event belongs @@ -75,15 +93,17 @@ struct tmigr_group { /** * struct tmigr_cpu - timer migration per CPU group * @lock: Lock protecting the tmigr_cpu group information - * @online: Indicates whether the CPU is online; In deactivate path - * it is required to know whether the migrator in the top - * level group is to be set offline, while a timer is - * pending. Then another online CPU needs to be notified to - * take over the migrator role. Furthermore the information - * is required in CPU hotplug path as the CPU is able to go - * idle before the timer migration hierarchy hotplug AP is - * reached. During this phase, the CPU has to handle the + * @available: Indicates whether the CPU is available for handling + * global timers. In the deactivate path it is required to + * know whether the migrator in the top level group is to + * be set offline, while a timer is pending. Then another + * available CPU needs to be notified to take over the + * migrator role. 
Furthermore the information is required + * in the CPU hotplug path as the CPU is able to go idle + * before the timer migration hierarchy hotplug callback is + * reached. During this phase, the CPU has to handle the * global timers on its own and must not act as a migrator. + * @idle: Indicates whether the CPU is idle in the timer migration * hierarchy * @remote: Is set when timers of the CPU are expired remotely diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 517106165ad2..bfcf2d44e93d 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -115,6 +115,21 @@ static void idletimer_tg_alarmproc(struct alarm *alarm, ktime_t now) schedule_work(&timer->work); } +static void idletimer_start_alarm_ktime(struct idletimer_tg *timer, ktime_t timeout) +{ + /* + * The timer should always be queued as @timeout should be at least one + * second, but handle it correctly in any case. Virt will manage! + */ + if (!alarm_start_timer(&timer->alarm, timeout, true)) + schedule_work(&timer->work); +} + +static void idletimer_start_alarm_sec(struct idletimer_tg *timer, unsigned int seconds) +{ + idletimer_start_alarm_ktime(timer, ktime_set(seconds, 0)); +} + static int idletimer_check_sysfs_name(const char *name, unsigned int size) { int ret; @@ -220,12 +235,10 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) INIT_WORK(&info->timer->work, idletimer_tg_work); if (info->timer->timer_type & XT_IDLETIMER_ALARM) { - ktime_t tout; alarm_init(&info->timer->alarm, ALARM_BOOTTIME, idletimer_tg_alarmproc); info->timer->alarm.data = info->timer; - tout = ktime_set(info->timeout, 0); - alarm_start_relative(&info->timer->alarm, tout); + idletimer_start_alarm_sec(info->timer, info->timeout); } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, @@ -271,8 +284,7 @@ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, info->label, info->timeout); if 
(info->timer->timer_type & XT_IDLETIMER_ALARM) { - ktime_t tout = ktime_set(info->timeout, 0); - alarm_start_relative(&info->timer->alarm, tout); + idletimer_start_alarm_sec(info->timer, info->timeout); } else { mod_timer(&info->timer->timer, secs_to_jiffies(info->timeout) + jiffies); @@ -384,7 +396,7 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) if (ktimespec.tv_sec > 0) { pr_debug("time_expiry_remaining %lld\n", ktimespec.tv_sec); - alarm_start_relative(&info->timer->alarm, tout); + idletimer_start_alarm_ktime(info->timer, tout); } } else { mod_timer(&info->timer->timer, diff --git a/scripts/timer_migration_tree.py b/scripts/timer_migration_tree.py new file mode 100755 index 000000000000..faac9de854bd --- /dev/null +++ b/scripts/timer_migration_tree.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: GPL-2.0 + +""" +Draw the timer migration tree. + +1) Boot with trace_event=tmigr_connect_cpu_parent,tmigr_connect_child_parent +2) ./timer_migration_tree.py < /sys/kernel/tracing/trace +""" + +import re, sys +from ete3 import Tree + +class Node: + def __init__(self, group): + self.group = group + self.children = [] + self.parent = None + self.num_children = 0 + self.groupmask = 0 + self.lvl = -1 + + def set_groupmask(self, groupmask): + self.groupmask = groupmask + + def set_parent(self, parent): + self.parent = parent + + def add_child(self, child): + self.children.append(child) + + def set_lvl(self, lvl): + self.lvl = lvl + + def set_numa(self, numa): + self.numa = numa + + def set_num_children(self, num_children): + self.num_children = num_children + + def __repr__(self): + if self.parent: + parent_grp = self.parent.group + else: + parent_grp = "-" + return "Group: %s mask: %s parent: %s lvl: %d numa: %d num_children: %d" % (self.group, self.groupmask, parent_grp, self.lvl, self.numa, self.num_children) + +hierarchies = { } + +def get_hierarchy(capacity): + if capacity not in hierarchies: + hierarchies[capacity] = 
{} + return hierarchies[capacity] + +def get_node(capacity, group): + hier = get_hierarchy(capacity) + if group in hier: + return hier[group] + else: + n = Node(group) + hier[group] = n + return n + +def tmigr_connect_cpu_parent(ts, line): + s = re.search("tmigr_connect_cpu_parent: cpu=([0-9]+) groupmask=([0-9a-zA-Z]+) parent=([0-9a-zA-Z]+) lvl=([0-9]+) numa=([-]?[0-9]+) capacity=([-]?[0-9]+) num_children=([0-9]+)", line) + if s is None: + return False + (cpu, groupmask, parent, lvl, numa, capacity, num_children) = (int(s.group(1)), s.group(2), s.group(3), int(s.group(4)), int(s.group(5)), int(s.group(6)), int(s.group(7))) + n = get_node(capacity, cpu) + p = get_node(capacity, parent) + n.set_parent(p) + n.set_groupmask(groupmask) + n.set_lvl(-1) + p.set_lvl(lvl) + p.set_numa(numa) + n.set_numa(numa) + p.set_num_children(num_children) + p.add_child(n) + +def tmigr_connect_child_parent(ts, line): + s = re.search("tmigr_connect_child_parent: group=([0-9a-zA-Z]+) groupmask=([0-9a-zA-Z]+) parent=([0-9a-zA-Z]+) lvl=([0-9]+) numa=([-]?[0-9]+) capacity=([-]?[0-9]+) num_children=([0-9]+)", line) + if s is None: + return False + (group, groupmask, parent, lvl, numa, capacity, num_children) = (s.group(1), s.group(2), s.group(3), int(s.group(4)), int(s.group(5)), int(s.group(6)), int(s.group(7))) + n = get_node(capacity, group) + p = get_node(capacity, parent) + n.set_parent(p) + n.set_groupmask(groupmask) + p.set_lvl(lvl) + p.set_numa(numa) + p.set_num_children(num_children) + p.add_child(n) + +def populate(enode, node): + enode = enode.add_child(name = node.group) + enode.add_feature("groupmask", "m:%s" % node.groupmask) + enode.add_feature("lvl", "lvl:%d" % node.lvl) + enode.add_feature("numa", "node %d" % node.numa) + enode.add_feature("num_children", "c=%d" % node.num_children) + for child in node.children: + populate(enode, child) + +if __name__ == "__main__": + for line in sys.stdin: + s = re.search("([0-9]+[.][0-9]{6}): (.+?)$", line, re.S) + if s is not None: + if 
tmigr_connect_cpu_parent(float(s.group(1)), s.group(2)): + continue + if tmigr_connect_child_parent(float(s.group(1)), s.group(2)): + continue + + for cap in hierarchies: + h = hierarchies[cap] + print("Tree for capacity %d" % cap) + for k in h: + n = h[k] + while n.parent != None: + n = n.parent + root = Tree() + populate(root, n) + print(root.get_ascii(show_internal=True, attributes=["name", "numa", "lvl"])) + break diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 38512623622a..2f3bac9fc6e8 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c @@ -78,19 +78,25 @@ static void sig_handler(int nr) done = 1; } +static inline int64_t calcdiff_ns(struct timespec t1, struct timespec t2) +{ + int64_t diff; + + diff = NSEC_PER_SEC * (int64_t)((int) t1.tv_sec - (int) t2.tv_sec); + diff += ((int) t1.tv_nsec - (int) t2.tv_nsec); + return diff; +} + /* * Check the expected timer expiration matches the GTOD elapsed delta since * we armed the timer. Keep a 0.5 sec error margin due to various jitter. 
*/ -static int check_diff(struct timeval start, struct timeval end) +static int check_diff(struct timespec start, struct timespec end) { - long long diff; - - diff = end.tv_usec - start.tv_usec; - diff += (end.tv_sec - start.tv_sec) * USEC_PER_SEC; + long long diff = calcdiff_ns(end, start); - if (llabs(diff - DELAY * USEC_PER_SEC) > USEC_PER_SEC / 2) { - printf("Diff too high: %lld..", diff); + if (llabs(diff - DELAY * NSEC_PER_SEC) > NSEC_PER_SEC / 2) { + printf("Diff too high: %lld ns..", diff); return -1; } @@ -99,22 +105,25 @@ static int check_diff(struct timeval start, struct timeval end) static void check_itimer(int which, const char *name) { - struct timeval start, end; + struct timespec start, end; struct itimerval val = { .it_value.tv_sec = DELAY, }; + int clock_id = CLOCK_REALTIME; done = 0; if (which == ITIMER_VIRTUAL) signal(SIGVTALRM, sig_handler); - else if (which == ITIMER_PROF) + else if (which == ITIMER_PROF) { + clock_id = CLOCK_THREAD_CPUTIME_ID; signal(SIGPROF, sig_handler); + } else if (which == ITIMER_REAL) signal(SIGALRM, sig_handler); - if (gettimeofday(&start, NULL) < 0) - fatal_error(name, "gettimeofday()"); + if (clock_gettime(clock_id, &start)) + fatal_error(name, "clock_gettime()"); if (setitimer(which, &val, NULL) < 0) fatal_error(name, "setitimer()"); @@ -126,18 +135,19 @@ static void check_itimer(int which, const char *name) else if (which == ITIMER_REAL) idle_loop(); - if (gettimeofday(&end, NULL) < 0) - fatal_error(name, "gettimeofday()"); + if (clock_gettime(clock_id, &end)) + fatal_error(name, "clock_gettime()"); ksft_test_result(check_diff(start, end) == 0, "%s\n", name); } static void check_timer_create(int which, const char *name) { - struct timeval start, end; + struct timespec start, end; struct itimerspec val = { .it_value.tv_sec = DELAY, }; + int clock_id = CLOCK_REALTIME; timer_t id; done = 0; @@ -148,16 +158,16 @@ static void check_timer_create(int which, const char *name) if (signal(SIGALRM, sig_handler) == SIG_ERR) 
fatal_error(name, "signal()"); - if (gettimeofday(&start, NULL) < 0) - fatal_error(name, "gettimeofday()"); + if (clock_gettime(clock_id, &start)) + fatal_error(name, "clock_gettime()"); if (timer_settime(id, 0, &val, NULL) < 0) fatal_error(name, "timer_settime()"); user_loop(); - if (gettimeofday(&end, NULL) < 0) - fatal_error(name, "gettimeofday()"); + if (clock_gettime(clock_id, &end)) + fatal_error(name, "clock_gettime()"); ksft_test_result(check_diff(start, end) == 0, "timer_create() per %s\n", name); @@ -445,15 +455,6 @@ static void check_delete(void) ksft_test_result(!tsig.signals, "check_delete\n"); } -static inline int64_t calcdiff_ns(struct timespec t1, struct timespec t2) -{ - int64_t diff; - - diff = NSEC_PER_SEC * (int64_t)((int) t1.tv_sec - (int) t2.tv_sec); - diff += ((int) t1.tv_nsec - (int) t2.tv_nsec); - return diff; -} - static void check_sigev_none(int which, const char *name) { struct timespec start, now; |
