From 1c20091e77fc5a9b7d7d905176443b4822a23cdb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 10 Aug 2011 23:21:01 +0200 Subject: nohz: Wake up full dynticks CPUs when a timer gets enqueued Wake up a CPU when a timer list timer is enqueued there and the target is part of the full dynticks range. Sending an IPI to it makes it reconsidering the next timer to program on top of recent updates. This may later be improved by checking if the tick is really stopped on the target. This would need some careful synchronization though. So deal with such optimization later and start simple. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/sched/core.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 849deb96e61e..e91ee589f793 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -587,7 +587,7 @@ unlock: * account when the CPU goes back to idle and evaluates the timer * wheel for the next timer event. */ -void wake_up_idle_cpu(int cpu) +static void wake_up_idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -617,6 +617,24 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } +static bool wake_up_extended_nohz_cpu(int cpu) +{ + if (tick_nohz_extended_cpu(cpu)) { + if (cpu != smp_processor_id() || + tick_nohz_tick_stopped()) + smp_send_reschedule(cpu); + return true; + } + + return false; +} + +void wake_up_nohz_cpu(int cpu) +{ + if (!wake_up_extended_nohz_cpu(cpu)) + wake_up_idle_cpu(cpu); +} + static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); -- cgit v1.2.3 From 3451d0243c3cdfd729b36f9684a14659d4895ca3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 10 Aug 2011 23:21:01 +0200 Subject: nohz: Rename CONFIG_NO_HZ to CONFIG_NO_HZ_COMMON We are planning to convert the dynticks Kconfig options layout into a choice menu. The user must be able to easily pick any of the following implementations: constant periodic tick, idle dynticks, full dynticks. As this implies a mutual exclusion, the two dynticks implementions need to converge on the selection of a common Kconfig option in order to ease the sharing of a common infrastructure. It would thus seem pretty natural to reuse CONFIG_NO_HZ to that end. It already implements all the idle dynticks code and the full dynticks depends on all that code for now. So ideally the choice menu would propose CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED then both would select CONFIG_NO_HZ. On the other hand we want to stay backward compatible: if CONFIG_NO_HZ is set in an older config file, we want to enable CONFIG_NO_HZ_IDLE by default. But we can't afford both at the same time or we run into a circular dependency: 1) CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED both select CONFIG_NO_HZ 2) If CONFIG_NO_HZ is set, we default to CONFIG_NO_HZ_IDLE We might be able to support that from Kconfig/Kbuild but it may not be wise to introduce such a confusing behaviour. So to solve this, create a new CONFIG_NO_HZ_COMMON option which gathers the common code between idle and full dynticks (that common code for now is simply the idle dynticks code) and select it from their referring Kconfig. Then we'll later create CONFIG_NO_HZ_IDLE and map CONFIG_NO_HZ to it for backward compatibility. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- Documentation/RCU/stallwarn.txt | 2 +- Documentation/cpu-freq/governors.txt | 4 ++-- arch/um/include/shared/common-offsets.h | 4 ++-- arch/um/os-Linux/time.c | 2 +- include/linux/sched.h | 8 ++++---- include/linux/tick.h | 8 ++++---- init/Kconfig | 2 +- kernel/hrtimer.c | 4 ++-- kernel/sched/core.c | 18 +++++++++--------- kernel/sched/fair.c | 10 +++++----- kernel/sched/sched.h | 4 ++-- kernel/softirq.c | 2 +- kernel/time/Kconfig | 13 +++++++++---- kernel/time/tick-sched.c | 12 ++++++------ kernel/timer.c | 4 ++-- 15 files changed, 51 insertions(+), 46 deletions(-) (limited to 'kernel/sched') diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 1927151b386b..b336755b71ed 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -176,7 +176,7 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that o A hardware or software issue shuts off the scheduler-clock interrupt on a CPU that is not in dyntick-idle mode. This problem really has happened, and seems to be most likely to - result in RCU CPU stall warnings for CONFIG_NO_HZ=n kernels. + result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. o A bug in the RCU implementation. diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt index c7a2eb8450c2..e3e5d9ae50cd 100644 --- a/Documentation/cpu-freq/governors.txt +++ b/Documentation/cpu-freq/governors.txt @@ -131,8 +131,8 @@ sampling_rate_min: The sampling rate is limited by the HW transition latency: transition_latency * 100 Or by kernel restrictions: -If CONFIG_NO_HZ is set, the limit is 10ms fixed. -If CONFIG_NO_HZ is not set or nohz=off boot parameter is used, the +If CONFIG_NO_HZ_COMMON is set, the limit is 10ms fixed. +If CONFIG_NO_HZ_COMMON is not set or nohz=off boot parameter is used, the limits depend on the CONFIG_HZ option: HZ=1000: min=20000us (20ms) HZ=250: min=80000us (80ms) diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 2df313b6a586..c92306809029 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -30,8 +30,8 @@ DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC); #ifdef CONFIG_PRINTK DEFINE(UML_CONFIG_PRINTK, CONFIG_PRINTK); #endif -#ifdef CONFIG_NO_HZ -DEFINE(UML_CONFIG_NO_HZ, CONFIG_NO_HZ); +#ifdef CONFIG_NO_HZ_COMMON +DEFINE(UML_CONFIG_NO_HZ_COMMON, CONFIG_NO_HZ_COMMON); #endif #ifdef CONFIG_UML_X86 DEFINE(UML_CONFIG_UML_X86, CONFIG_UML_X86); diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c index fac388cb464f..e9824d5dd7d5 100644 --- a/arch/um/os-Linux/time.c +++ b/arch/um/os-Linux/time.c @@ -79,7 +79,7 @@ long long os_nsecs(void) return timeval_to_ns(&tv); } -#ifdef UML_CONFIG_NO_HZ +#ifdef UML_CONFIG_NO_HZ_COMMON static int after_sleep_interval(struct timespec *ts) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index 10626e2ee688..1ff9e0a5de27 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -230,7 +230,7 @@ extern void init_idle_bootup_task(struct task_struct *idle); extern int runqueue_is_locked(int cpu); -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); extern int get_nohz_timer_target(void); @@ -1758,13 +1758,13 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, } #endif -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON void calc_load_enter_idle(void); void calc_load_exit_idle(void); #else static inline void calc_load_enter_idle(void) { } static inline void calc_load_exit_idle(void) { } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ #ifndef CONFIG_CPUMASK_OFFSTACK static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) @@ -1850,7 +1850,7 @@ extern void idle_task_exit(void); static inline void idle_task_exit(void) {} #endif -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) extern void wake_up_nohz_cpu(int cpu); #else static inline void wake_up_nohz_cpu(int cpu) { } diff --git a/include/linux/tick.h b/include/linux/tick.h index 44bfa8aa439f..5e403339ee14 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -82,7 +82,7 @@ extern int tick_program_event(ktime_t expires, int force); extern void tick_setup_sched_timer(void); # endif -# if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS +# if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS extern void tick_cancel_sched_timer(int cpu); # else static inline void tick_cancel_sched_timer(int cpu) { } @@ -123,7 +123,7 @@ static inline void tick_check_idle(int cpu) { } static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ -# ifdef CONFIG_NO_HZ +# ifdef CONFIG_NO_HZ_COMMON DECLARE_PER_CPU(struct tick_sched, tick_cpu_sched); static inline int tick_nohz_tick_stopped(void) @@ -138,7 +138,7 @@ extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); -# else /* !CONFIG_NO_HZ */ +# else /* !CONFIG_NO_HZ_COMMON */ static inline int tick_nohz_tick_stopped(void) { return 0; @@ -155,7 +155,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void) } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } -# endif /* !NO_HZ */ +# endif /* !CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_EXTENDED extern int tick_nohz_extended_cpu(int cpu); diff --git a/init/Kconfig b/init/Kconfig index 8a1dac2f80a9..edc8132584f1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -580,7 +580,7 @@ config RCU_FANOUT_EXACT config RCU_FAST_NO_HZ bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ && SMP + depends on NO_HZ_COMMON && SMP default n help This option causes RCU to attempt to accelerate grace periods in diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812d3feb..ec60482d8b03 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -160,7 +160,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, */ static int hrtimer_get_target(int this_cpu, int pinned) { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) return get_nohz_timer_target(); #endif @@ -1106,7 +1106,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e91ee589f793..9bb397da63d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -549,7 +549,7 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers * from an idle cpu. This is good for power-savings. @@ -641,14 +641,14 @@ static inline bool got_nohz_idle_kick(void) return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); } -#else /* CONFIG_NO_HZ */ +#else /* CONFIG_NO_HZ_COMMON */ static inline bool got_nohz_idle_kick(void) { return false; } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ void sched_avg_update(struct rq *rq) { @@ -2139,7 +2139,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) return load >> FSHIFT; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. * @@ -2365,12 +2365,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ static inline long calc_load_fold_idle(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2530,7 +2530,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -2590,7 +2590,7 @@ void update_cpu_load_nohz(void) } raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from scheduler_tick() @@ -7023,7 +7023,7 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON rq->nohz_flags = 0; #endif #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 539760ef00c4..5c97fca091a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5331,7 +5331,7 @@ out_unlock: return 0; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details * - When one of the busy CPUs notice that there may be an idle rebalancing @@ -5541,9 +5541,9 @@ out: rq->next_balance = next_balance; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* - * In CONFIG_NO_HZ case, the idle balance kickee will do the + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) @@ -5686,7 +5686,7 @@ void trigger_load_balance(struct rq *rq, int cpu) if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) nohz_balancer_kick(cpu); #endif @@ -6156,7 +6156,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_SMP open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3bd15a43eebc..889904dd6d77 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -404,7 +404,7 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; #endif @@ -1333,7 +1333,7 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); extern void account_cfs_bandwidth_used(int enabled, int was_enabled); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON enum rq_nohz_flag_bits { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, diff --git a/kernel/softirq.c b/kernel/softirq.c index b4d252fd195b..de15813f2a66 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -348,7 +348,7 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* Make sure that timer wheel updates are propagated */ if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) tick_nohz_irq_exit(); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 726c33e00da2..c88fc43494c9 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -64,16 +64,21 @@ config GENERIC_CMOS_UPDATE if GENERIC_CLOCKEVENTS menu "Timers subsystem" -# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is +# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independ of this. config TICK_ONESHOT bool +config NO_HZ_COMMON + bool + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + config NO_HZ bool "Tickless System (Dynamic Ticks)" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - select TICK_ONESHOT + select NO_HZ_COMMON help This option enables a tickless system: timer interrupts will only trigger on an as-needed basis both when the system is @@ -81,14 +86,14 @@ config NO_HZ config NO_HZ_EXTENDED bool "Full dynticks system" - # NO_HZ dependency + # NO_HZ_COMMON dependency depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # RCU_USER_QS depends on HAVE_CONTEXT_TRACKING && SMP # RCU_NOCB_CPU dependency depends on TREE_RCU || TREE_PREEMPT_RCU depends on VIRT_CPU_ACCOUNTING_GEN - select NO_HZ + select NO_HZ_COMMON select RCU_USER_QS select RCU_NOCB_CPU select CONTEXT_TRACKING_FORCE diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 57bb3fe5aaa3..ccfc2086cd4b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -104,7 +104,7 @@ static void tick_sched_do_timer(ktime_t now) { int cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the cpu in charge went @@ -124,7 +124,7 @@ static void tick_sched_do_timer(ktime_t now) static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long @@ -235,7 +235,7 @@ core_initcall(init_tick_nohz_extended); /* * NOHZ - aka dynamic tick functionality */ -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? */ @@ -907,7 +907,7 @@ static inline void tick_check_nohz(int cpu) static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_check_nohz(int cpu) { } -#endif /* NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() @@ -992,14 +992,14 @@ void tick_setup_sched_timer(void) now = ktime_get(); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (tick_nohz_enabled) ts->nohz_mode = NOHZ_MODE_HIGHRES; #endif } #endif /* HIGH_RES_TIMERS */ -#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); diff --git a/kernel/timer.c b/kernel/timer.c index 4e3040b40d16..1b7489fdea41 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -738,7 +738,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) cpu = get_nohz_timer_target(); #endif @@ -1188,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base) spin_unlock_irq(&base->lock); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Find out when the next timer event is due to happen. This * is used on S/390 to stop all activity when a CPU is idle. -- cgit v1.2.3 From c5bfece2d6129131b4ade985e63bc35ddf5868d4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Apr 2013 16:45:34 +0200 Subject: nohz: Switch from "extended nohz" to "full nohz" based naming "Extended nohz" was used as a naming base for the full dynticks API and Kconfig symbols. It reflects the fact the system tries to stop the tick in more places than just idle. But that "extended" name is a bit opaque and vague. Rename it to "full" makes it clearer what the system tries to do under this config: try to shutdown the tick anytime it can. The various constraints that prevent that to happen shouldn't be considered as fundamental properties of this feature but rather technical issues that may be solved in the future. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- Documentation/kernel-parameters.txt | 4 +-- include/linux/tick.h | 6 ++--- kernel/sched/core.c | 6 ++--- kernel/time/Kconfig | 4 +-- kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 2 +- kernel/time/tick-sched.c | 54 ++++++++++++++++++------------------- 7 files changed, 39 insertions(+), 39 deletions(-) (limited to 'kernel/sched') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 231698feaddc..82365dde00a8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1913,8 +1913,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Valid arguments: on, off Default: on - nohz_extended= [KNL,BOOT] - In kernels built with CONFIG_NO_HZ_EXTENDED=y, set + nohz_full= [KNL,BOOT] + In kernels built with CONFIG_NO_HZ_FULL=y, set the specified list of CPUs whose tick will be stopped whenever possible. You need to keep at least one online CPU outside the range to maintain the timekeeping. diff --git a/include/linux/tick.h b/include/linux/tick.h index 5e403339ee14..b4e3b0c9639e 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -157,10 +157,10 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !CONFIG_NO_HZ_COMMON */ -#ifdef CONFIG_NO_HZ_EXTENDED -extern int tick_nohz_extended_cpu(int cpu); +#ifdef CONFIG_NO_HZ_FULL +extern int tick_nohz_full_cpu(int cpu); #else -static inline int tick_nohz_extended_cpu(int cpu) { return 0; } +static inline int tick_nohz_full_cpu(int cpu) { return 0; } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9bb397da63d6..0f0a5b3fd62c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -617,9 +617,9 @@ static void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } -static bool wake_up_extended_nohz_cpu(int cpu) +static bool wake_up_full_nohz_cpu(int cpu) { - if (tick_nohz_extended_cpu(cpu)) { + if (tick_nohz_full_cpu(cpu)) { if (cpu != smp_processor_id() || tick_nohz_tick_stopped()) smp_send_reschedule(cpu); @@ -631,7 +631,7 @@ static bool wake_up_extended_nohz_cpu(int cpu) void wake_up_nohz_cpu(int cpu) { - if (!wake_up_extended_nohz_cpu(cpu)) + if (!wake_up_full_nohz_cpu(cpu)) wake_up_idle_cpu(cpu); } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index cbe64be17d1f..4a17b5069466 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -96,7 +96,7 @@ config NO_HZ_IDLE Most of the time you want to say Y here. -config NO_HZ_EXTENDED +config NO_HZ_FULL bool "Full dynticks system (tickless single task)" # NO_HZ_COMMON dependency depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS @@ -115,7 +115,7 @@ config NO_HZ_EXTENDED task on the CPU. Chances for running tickless are maximized when the task mostly runs in userspace and has few kernel activity. - You need to fill up the nohz_extended boot parameter with the + You need to fill up the nohz_full boot parameter with the desired range of dynticks CPUs. This is implemented at the expense of some overhead in user <-> kernel diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 8a6875cc1879..a3a3123f6272 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -573,7 +573,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) bc->event_handler = tick_handle_oneshot_broadcast; /* Take the do_timer update */ - if (!tick_nohz_extended_cpu(cpu)) + if (!tick_nohz_full_cpu(cpu)) tick_do_timer_cpu = cpu; /* diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index b7dc0cbdb59b..83f2bd967161 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -163,7 +163,7 @@ static void tick_setup_device(struct tick_device *td, * this cpu: */ if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { - if (!tick_nohz_extended_cpu(cpu)) + if (!tick_nohz_full_cpu(cpu)) tick_do_timer_cpu = cpu; else tick_do_timer_cpu = TICK_DO_TIMER_NONE; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e057d338daa4..369b5769fc97 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -113,7 +113,7 @@ static void tick_sched_do_timer(ktime_t now) * jiffies_lock. */ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE) - && !tick_nohz_extended_cpu(cpu)) + && !tick_nohz_full_cpu(cpu)) tick_do_timer_cpu = cpu; #endif @@ -143,29 +143,29 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) profile_tick(CPU_PROFILING); } -#ifdef CONFIG_NO_HZ_EXTENDED -static cpumask_var_t nohz_extended_mask; -bool have_nohz_extended_mask; +#ifdef CONFIG_NO_HZ_FULL +static cpumask_var_t nohz_full_mask; +bool have_nohz_full_mask; -int tick_nohz_extended_cpu(int cpu) +int tick_nohz_full_cpu(int cpu) { - if (!have_nohz_extended_mask) + if (!have_nohz_full_mask) return 0; - return cpumask_test_cpu(cpu, nohz_extended_mask); + return cpumask_test_cpu(cpu, nohz_full_mask); } /* Parse the boot-time nohz CPU list from the kernel parameters. */ -static int __init tick_nohz_extended_setup(char *str) +static int __init tick_nohz_full_setup(char *str) { - alloc_bootmem_cpumask_var(&nohz_extended_mask); - if (cpulist_parse(str, nohz_extended_mask) < 0) - pr_warning("NOHZ: Incorrect nohz_extended cpumask\n"); + alloc_bootmem_cpumask_var(&nohz_full_mask); + if (cpulist_parse(str, nohz_full_mask) < 0) + pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); else - have_nohz_extended_mask = true; + have_nohz_full_mask = true; return 1; } -__setup("nohz_extended=", tick_nohz_extended_setup); +__setup("nohz_full=", tick_nohz_full_setup); static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, unsigned long action, @@ -179,7 +179,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, * If we handle the timekeeping duty for full dynticks CPUs, * we can't safely shutdown that CPU. */ - if (have_nohz_extended_mask && tick_do_timer_cpu == cpu) + if (have_nohz_full_mask && tick_do_timer_cpu == cpu) return -EINVAL; break; } @@ -191,20 +191,20 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, * separations: 0,2,4,6,... * This is NR_CPUS + sizeof('\0') */ -static char __initdata nohz_ext_buf[NR_CPUS + 1]; +static char __initdata nohz_full_buf[NR_CPUS + 1]; -static int __init init_tick_nohz_extended(void) +static int __init init_tick_nohz_full(void) { cpumask_var_t online_nohz; int cpu; - if (!have_nohz_extended_mask) + if (!have_nohz_full_mask) return 0; cpu_notifier(tick_nohz_cpu_down_callback, 0); if (!zalloc_cpumask_var(&online_nohz, GFP_KERNEL)) { - pr_warning("NO_HZ: Not enough memory to check extended nohz mask\n"); + pr_warning("NO_HZ: Not enough memory to check full nohz mask\n"); return -ENOMEM; } @@ -215,31 +215,31 @@ static int __init init_tick_nohz_extended(void) get_online_cpus(); /* Ensure we keep a CPU outside the dynticks range for timekeeping */ - cpumask_and(online_nohz, cpu_online_mask, nohz_extended_mask); + cpumask_and(online_nohz, cpu_online_mask, nohz_full_mask); if (cpumask_equal(online_nohz, cpu_online_mask)) { pr_warning("NO_HZ: Must keep at least one online CPU " - "out of nohz_extended range\n"); + "out of nohz_full range\n"); /* * We know the current CPU doesn't have its tick stopped. * Let's use it for the timekeeping duty. */ preempt_disable(); cpu = smp_processor_id(); - pr_warning("NO_HZ: Clearing %d from nohz_extended range\n", cpu); - cpumask_clear_cpu(cpu, nohz_extended_mask); + pr_warning("NO_HZ: Clearing %d from nohz_full range\n", cpu); + cpumask_clear_cpu(cpu, nohz_full_mask); preempt_enable(); } put_online_cpus(); free_cpumask_var(online_nohz); - cpulist_scnprintf(nohz_ext_buf, sizeof(nohz_ext_buf), nohz_extended_mask); - pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_ext_buf); + cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); + pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); return 0; } -core_initcall(init_tick_nohz_extended); +core_initcall(init_tick_nohz_full); #else -#define have_nohz_extended_mask (0) +#define have_nohz_full_mask (0) #endif /* @@ -589,7 +589,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (have_nohz_extended_mask) { + if (have_nohz_full_mask) { /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around -- cgit v1.2.3 From 9f3660c2c1a221c886474587103c69f6034d3e4f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 14:35:09 +0200 Subject: sched: Kick full dynticks CPU that have more than one task enqueued. Kick the tick on full dynticks CPUs when they get more than one task running on their queue. This makes sure that local fairness is maintained by the tick on the destination. This is done regardless of these tasks' class. We should be able to be more clever in the future depending on these. eg: a CPU that runs a SCHED_FIFO task doesn't need to maintain fairness against local pending tasks of the fair class. But keep things simple for now. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- kernel/sched/sched.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel/sched') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 889904dd6d77..eb363aa5d83c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #include #include #include +#include #include "cpupri.h" @@ -1106,6 +1107,16 @@ static inline u64 steal_ticks(u64 steal) static inline void inc_nr_running(struct rq *rq) { rq->nr_running++; + +#ifdef CONFIG_NO_HZ_FULL + if (rq->nr_running == 2) { + if (tick_nohz_full_cpu(rq->cpu)) { + /* Order rq->nr_running write against the IPI */ + smp_wmb(); + smp_send_reschedule(rq->cpu); + } + } +#endif } static inline void dec_nr_running(struct rq *rq) -- cgit v1.2.3 From ce831b38ca4920739a7a5b0c73b921da41f03718 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 15:15:35 +0200 Subject: sched: New helper to prevent from stopping the tick in full dynticks Provide a new helper to be called from the full dynticks engine before stopping the tick in order to make sure we don't stop it when there is more than one task running on the CPU. This way we make sure that the tick stays alive to maintain fairness. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/linux/sched.h | 6 ++++++ kernel/sched/core.c | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'kernel/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1ff9e0a5de27..a74adedcdd10 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1856,6 +1856,12 @@ extern void wake_up_nohz_cpu(int cpu); static inline void wake_up_nohz_cpu(int cpu) { } #endif +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(void); +#else +static inline bool sched_can_stop_tick(void) { return false; } +#endif + #ifdef CONFIG_SCHED_AUTOGROUP extern void sched_autogroup_create_attach(struct task_struct *p); extern void sched_autogroup_detach(struct task_struct *p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0f0a5b3fd62c..69f71335984f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -650,6 +650,24 @@ static inline bool got_nohz_idle_kick(void) #endif /* CONFIG_NO_HZ_COMMON */ +#ifdef CONFIG_NO_HZ_FULL +bool sched_can_stop_tick(void) +{ + struct rq *rq; + + rq = this_rq(); + + /* Make sure rq->nr_running update is visible after the IPI */ + smp_rmb(); + + /* More than one running task need preemption */ + if (rq->nr_running > 1) + return false; + + return true; +} +#endif /* CONFIG_NO_HZ_FULL */ + void sched_avg_update(struct rq *rq) { s64 period = sched_avg_period(); -- cgit v1.2.3 From ff442c51f6543378cf23107c75b7949dc64a9119 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 15:27:08 +0200 Subject: nohz: Re-evaluate the tick from the scheduler IPI The scheduler IPI is used by the scheduler to kick full dynticks CPUs asynchronously when more than one task are running or when a new timer list timer is enqueued. This way the destination CPU can decide to restart the tick to handle this new situation. Now let's call that kick in the scheduler IPI. (Reusing the scheduler IPI rather than implementing a new IPI was suggested by Peter Zijlstra a while ago) Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/linux/tick.h | 2 ++ kernel/sched/core.c | 4 +++- kernel/time/tick-sched.c | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/tick.h b/include/linux/tick.h index d290168335bc..e31e67623ea1 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -160,11 +160,13 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } #ifdef CONFIG_NO_HZ_FULL extern void tick_nohz_init(void); extern int tick_nohz_full_cpu(int cpu); +extern void tick_nohz_full_check(void); extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_all(void); #else static inline void tick_nohz_init(void) { } static inline int tick_nohz_full_cpu(int cpu) { return 0; } +static inline void tick_nohz_full_check(void) { } static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 69f71335984f..9ad35005f1cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1398,7 +1398,8 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) + if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() + && !tick_nohz_full_cpu(smp_processor_id())) return; /* @@ -1415,6 +1416,7 @@ void scheduler_ipi(void) * somewhat pessimize the simple resched case. */ irq_enter(); + tick_nohz_full_check(); sched_ttwu_pending(); /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 884a9f302a06..4d74a68b2c34 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -151,7 +151,7 @@ bool have_nohz_full_mask; * Re-evaluate the need for the tick on the current CPU * and restart it if necessary. */ -static void tick_nohz_full_check(void) +void tick_nohz_full_check(void) { /* * STUB for now, will be filled with the full tick stop/restart -- cgit v1.2.3 From 99e5ada9407cc19d7c4c05ce2165f20dc46fc093 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 20 Apr 2013 17:11:50 +0200 Subject: nohz: Re-evaluate the tick for the new task after a context switch When a task is scheduled in, it may have some properties of its own that could make the CPU reconsider the need for the tick: posix cpu timers, perf events, ... So notify the full dynticks subsystem when a task gets scheduled in and re-check the tick dependency at this stage. This is done through a self IPI to avoid messing up with any current lock scenario. Signed-off-by: Frederic Weisbecker Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/linux/tick.h | 2 ++ kernel/sched/core.c | 2 ++ kernel/time/tick-sched.c | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+) (limited to 'kernel/sched') diff --git a/include/linux/tick.h b/include/linux/tick.h index e31e67623ea1..9180f4b85e6d 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -163,12 +163,14 @@ extern int tick_nohz_full_cpu(int cpu); extern void tick_nohz_full_check(void); extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_all(void); +extern void tick_nohz_task_switch(struct task_struct *tsk); #else static inline void tick_nohz_init(void) { } static inline int tick_nohz_full_cpu(int cpu) { return 0; } static inline void tick_nohz_full_check(void) { } static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } +static inline void tick_nohz_task_switch(struct task_struct *tsk) { } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9ad35005f1cb..dd09def88567 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1896,6 +1896,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } + + tick_nohz_task_switch(current); } #ifdef CONFIG_SMP diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d0ed1905a85c..12a900dbb819 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -232,6 +232,26 @@ void tick_nohz_full_kick_all(void) preempt_enable(); } +/* + * Re-evaluate the need for the tick as we switch the current task. + * It might need the tick due to per task/process properties: + * perf events, posix cpu timers, ... + */ +void tick_nohz_task_switch(struct task_struct *tsk) +{ + unsigned long flags; + + if (!tick_nohz_full_cpu(smp_processor_id())) + return; + + local_irq_save(flags); + + if (tick_nohz_tick_stopped() && !can_stop_full_tick()) + tick_nohz_full_kick(); + + local_irq_restore(flags); +} + int tick_nohz_full_cpu(int cpu) { if (!have_nohz_full_mask) -- cgit v1.2.3 From 265f22a975c1e4cc3a4d1f94a3ec53ffbb6f5b9f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 3 May 2013 03:39:05 +0200 Subject: sched: Keep at least 1 tick per second for active dynticks tasks The scheduler doesn't yet fully support environments with a single task running without a periodic tick. In order to ensure we still maintain the duties of scheduler_tick(), keep at least 1 tick per second. This makes sure that we keep the progression of various scheduler accounting and background maintainance even with a very low granularity. Examples include cpu load, sched average, CFS entity vruntime, avenrun and events such as load balancing, amongst other details handled in sched_class::task_tick(). This limitation will be removed in the future once we get these individual items to work in full dynticks CPUs. Suggested-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Christoph Lameter Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/linux/sched.h | 1 + kernel/sched/core.c | 30 ++++++++++++++++++++++++++++++ kernel/sched/idle_task.c | 1 + kernel/sched/sched.h | 10 ++++++++++ kernel/time/tick-sched.c | 7 +++++++ 5 files changed, 49 insertions(+) (limited to 'kernel/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index ebf7095158a9..af008d7bad57 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1862,6 +1862,7 @@ static inline void wake_up_nohz_cpu(int cpu) { } #ifdef CONFIG_NO_HZ_FULL extern bool sched_can_stop_tick(void); +extern u64 scheduler_tick_max_deferment(void); #else static inline bool sched_can_stop_tick(void) { return false; } #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e94842d4400c..3bdf986a091a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2736,8 +2736,35 @@ void scheduler_tick(void) rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif + rq_last_tick_reset(rq); } +#ifdef CONFIG_NO_HZ_FULL +/** + * scheduler_tick_max_deferment + * + * Keep at least one tick per second when a single + * active task is running because the scheduler doesn't + * yet completely support full dynticks environment. + * + * This makes sure that uptime, CFS vruntime, load + * balancing, etc... continue to move forward, even + * with a very low granularity. + */ +u64 scheduler_tick_max_deferment(void) +{ + struct rq *rq = this_rq(); + unsigned long next, now = ACCESS_ONCE(jiffies); + + next = rq->last_sched_tick + HZ; + + if (time_before_eq(next, now)) + return 0; + + return jiffies_to_usecs(next - now) * NSEC_PER_USEC; +} +#endif + notrace unsigned long get_parent_ip(unsigned long addr) { if (in_lock_functions(addr)) { @@ -6993,6 +7020,9 @@ void __init sched_init(void) #ifdef CONFIG_NO_HZ_COMMON rq->nohz_flags = 0; #endif +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = 0; +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b8ce77328341..d8da01008d39 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -17,6 +17,7 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) { idle_exit_fair(rq); + rq_last_tick_reset(rq); } static void post_schedule_idle(struct rq *rq) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 24dc29897749..ce39224d6155 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -409,6 +409,9 @@ struct rq { #ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; +#endif +#ifdef CONFIG_NO_HZ_FULL + unsigned long last_sched_tick; #endif int skip_clock_update; @@ -1090,6 +1093,13 @@ static inline void dec_nr_running(struct rq *rq) rq->nr_running--; } +static inline void rq_last_tick_reset(struct rq *rq) +{ +#ifdef CONFIG_NO_HZ_FULL + rq->last_sched_tick = jiffies; +#endif +} + extern void update_rq_clock(struct rq *rq); extern void activate_task(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1c9f53b2ddb7..07929c633570 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -600,6 +600,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, time_delta = KTIME_MAX; } +#ifdef CONFIG_NO_HZ_FULL + if (!ts->inidle) { + time_delta = min(time_delta, + scheduler_tick_max_deferment()); + } +#endif + /* * calculate the expiry time for the next timer wheel * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals -- cgit v1.2.3