From 3dc167ba5729ddd2d8e3fa1841653792c295d3f1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 19 May 2020 19:25:06 +0200
Subject: sched/cputime: Improve cputime_adjust()

People report that utime and stime from /proc/<pid>/stat become very
wrong when the numbers are big enough, especially if you watch these
counters incrementally.

Specifically, the current implementation of stime*rtime/total results
in a saw-tooth function on top of the desired line, where the teeth
grow in size the larger the values become. IOW, it has a relative
error.

The result is that, when watching incrementally as time progresses
(for large values), we'll see periods of pure stime or utime increase,
irrespective of the actual ratio we're striving for.

Replace scale_stime() with a math64.h helper: mul_u64_u64_div_u64()
that is far more accurate. This also allows architectures to override
the implementation -- for instance they can opt for the old algorithm
if this new one turns out to be too expensive for them.

Signed-off-by: Oleg Nesterov
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20200519172506.GA317395@hirez.programming.kicks-ass.net
---
 arch/x86/include/asm/div64.h | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
index 9b8cb50768c2..b8f1dc0761e4 100644
--- a/arch/x86/include/asm/div64.h
+++ b/arch/x86/include/asm/div64.h
@@ -74,16 +74,26 @@ static inline u64 mul_u32_u32(u32 a, u32 b)
 #else
 # include <asm-generic/div64.h>
 
-static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+/*
+ * Will generate an #DE when the result doesn't fit u64, could fix with an
+ * __ex_table[] entry when it becomes an issue.
+ */
+static inline u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div)
 {
 	u64 q;
 
 	asm ("mulq %2; divq %3" : "=a" (q)
-				: "a" (a), "rm" ((u64)mul), "rm" ((u64)div)
+				: "a" (a), "rm" (mul), "rm" (div)
 				: "rdx");
 
 	return q;
 }
+#define mul_u64_u64_div_u64 mul_u64_u64_div_u64
+
+static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
+{
+	return mul_u64_u64_div_u64(a, mul, div);
+}
 #define mul_u64_u32_div mul_u64_u32_div
 
 #endif /* CONFIG_X86_32 */
--
cgit v1.2.3
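The precision problem is easy to reproduce outside the kernel. Below is a
minimal user-space sketch (an illustration only, not the kernel code: the
x86-64 helper above uses mulq/divq, which this sketch stands in for with
the GCC/Clang unsigned __int128 extension) contrasting the old 64-bit
scaling with a full-width multiply-then-divide:

  #include <stdint.h>
  #include <stdio.h>

  /* Old scheme: the 64-bit intermediate stime * rtime wraps for large
   * inputs -- the source of the saw-tooth. */
  static uint64_t scale_naive(uint64_t stime, uint64_t rtime, uint64_t total)
  {
          return stime * rtime / total;
  }

  /* New scheme: keep the full 128-bit product before dividing, as
   * mul_u64_u64_div_u64() does. */
  static uint64_t scale_accurate(uint64_t stime, uint64_t rtime, uint64_t total)
  {
          return (uint64_t)(((unsigned __int128)stime * rtime) / total);
  }

  int main(void)
  {
          uint64_t stime = 3ULL << 40, rtime = 1ULL << 42, total = 1ULL << 42;

          printf("naive:    %llu\n",
                 (unsigned long long)scale_naive(stime, rtime, total));
          printf("accurate: %llu\n",
                 (unsigned long long)scale_accurate(stime, rtime, total));
          return 0;
  }

With these inputs the 64-bit product happens to be a multiple of 2^64, so
the naive version wraps to 0 while the accurate version returns stime, as
total == rtime demands.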
From e2b0d619b400ae326f954a018a1d65d736c237c5 Mon Sep 17 00:00:00 2001
From: Giovanni Gherdovich
Date: Sun, 31 May 2020 20:24:51 +0200
Subject: x86, sched: check for counters overflow in frequency invariant
 accounting

The product mcnt * arch_max_freq_ratio can overflow u64.

For context, a large value for arch_max_freq_ratio would be 5000,
corresponding to a turbo_freq/base_freq ratio of 5 (normally it's more
like 1500-2000). A large increment frequency for the MPERF counter
would be 5GHz (the base clock of all CPUs on the market today is less
than that). With these figures, a CPU would need to go without a
scheduler tick for around 8 days for the u64 overflow to happen. It is
unlikely, but the check is warranted.

Under similar conditions, the difference acnt of two consecutive APERF
readings can overflow as well.

In these circumstances it is appropriate to disable frequency invariant
accounting: the feature relies on measures of the clock frequency done
at every scheduler tick, which need to be "fresh" to be at all
meaningful.

A note on i386: prior to version 5.1, the GCC compiler didn't have the
builtin function __builtin_mul_overflow. In these GCC versions the
macro check_mul_overflow needs __udivdi3() to do (u64)a/b, which the
kernel doesn't provide. For this reason this change fails to build on
i386 with GCC < 5.1, and we protect the entire frequency invariant code
behind CONFIG_X86_64 (special thanks to the "kbuild test robot").

Fixes: 1567c3e3467c ("x86, sched: Add support for frequency invariance")
Signed-off-by: Giovanni Gherdovich
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Rafael J. Wysocki
Link: https://lkml.kernel.org/r/20200531182453.15254-2-ggherdovich@suse.cz
---
 arch/x86/include/asm/topology.h |  2 +-
 arch/x86/kernel/smpboot.c       | 33 ++++++++++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 6 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 79d8d5496330..f4234575f3fd 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -193,7 +193,7 @@ static inline void sched_clear_itmt_support(void)
 }
 #endif /* CONFIG_SCHED_MC_PRIO */
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_64)
 #include
 
 DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index ffbd9a3d78d8..18d292fc466c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -56,6 +56,7 @@
 #include
 #include
 #include
+#include <linux/overflow.h>
 #include
 #include
@@ -1777,6 +1778,7 @@ void native_play_dead(void)
 
 #endif
 
+#ifdef CONFIG_X86_64
 /*
  * APERF/MPERF frequency ratio computation.
  *
@@ -2048,11 +2050,19 @@ static void init_freq_invariance(bool secondary)
 	}
 }
 
+static void disable_freq_invariance_workfn(struct work_struct *work)
+{
+	static_branch_disable(&arch_scale_freq_key);
+}
+
+static DECLARE_WORK(disable_freq_invariance_work,
+		    disable_freq_invariance_workfn);
+
 DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
 
 void arch_scale_freq_tick(void)
 {
-	u64 freq_scale;
+	u64 freq_scale = SCHED_CAPACITY_SCALE;
 	u64 aperf, mperf;
 	u64 acnt, mcnt;
 
@@ -2064,19 +2074,32 @@ void arch_scale_freq_tick(void)
 	acnt = aperf - this_cpu_read(arch_prev_aperf);
 	mcnt = mperf - this_cpu_read(arch_prev_mperf);
-	if (!mcnt)
-		return;
 
 	this_cpu_write(arch_prev_aperf, aperf);
 	this_cpu_write(arch_prev_mperf, mperf);
 
-	acnt <<= 2*SCHED_CAPACITY_SHIFT;
-	mcnt *= arch_max_freq_ratio;
+	if (check_shl_overflow(acnt, 2*SCHED_CAPACITY_SHIFT, &acnt))
+		goto error;
+
+	if (check_mul_overflow(mcnt, arch_max_freq_ratio, &mcnt) || !mcnt)
+		goto error;
 
 	freq_scale = div64_u64(acnt, mcnt);
+	if (!freq_scale)
+		goto error;
 
 	if (freq_scale > SCHED_CAPACITY_SCALE)
 		freq_scale = SCHED_CAPACITY_SCALE;
 
 	this_cpu_write(arch_freq_scale, freq_scale);
+	return;
+
+error:
+	pr_warn("Scheduler frequency invariance went wobbly, disabling!\n");
+	schedule_work(&disable_freq_invariance_work);
+}
+#else
+static inline void init_freq_invariance(bool secondary)
+{ }
+#endif /* CONFIG_X86_64 */
--
cgit v1.2.3
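The shape of the new error handling can be sketched in user space (an
illustration under stated assumptions, not kernel code: check_shl_overflow()
is modeled by a manual high-bits test, check_mul_overflow() by
__builtin_mul_overflow(), the GCC >= 5.1 builtin from the i386 note above):

  #include <stdint.h>
  #include <stdio.h>

  #define SCHED_CAPACITY_SHIFT 10 /* as in the kernel */

  /* Returns -1 on overflow, mirroring the patch's goto error paths. */
  static int compute_freq_scale(uint64_t acnt, uint64_t mcnt,
                                uint64_t max_freq_ratio, uint64_t *scale)
  {
          uint64_t num, den;

          /* like check_shl_overflow(): high bits shifted out mean overflow */
          if (acnt >> (64 - 2 * SCHED_CAPACITY_SHIFT))
                  return -1;
          num = acnt << (2 * SCHED_CAPACITY_SHIFT);

          /* like check_mul_overflow(), plus the !mcnt divide-by-zero guard */
          if (__builtin_mul_overflow(mcnt, max_freq_ratio, &den) || !den)
                  return -1;

          *scale = num / den;
          return 0;
  }

  int main(void)
  {
          uint64_t scale;

          /* ~10 days of uninterrupted 5 GHz MPERF counting (~2^52 ticks)
           * times a ratio of 5000 exceeds 2^64 */
          if (compute_freq_scale(1ULL << 40, 1ULL << 52, 5000, &scale))
                  puts("overflow: would disable frequency invariance");
          return 0;
  }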
From 51beea8862a3095559862df39554f05042e1195b Mon Sep 17 00:00:00 2001
From: Giovanni Gherdovich
Date: Sun, 31 May 2020 20:24:52 +0200
Subject: x86, sched: Bail out of frequency invariance if turbo frequency
 is unknown

There may be CPUs that support turbo boost but don't declare any turbo
ratio, i.e. their MSR_TURBO_RATIO_LIMIT is all zeroes. In that
condition scale-invariant calculations can't be performed.

Fixes: 1567c3e3467c ("x86, sched: Add support for frequency invariance")
Suggested-by: Ricardo Neri
Signed-off-by: Giovanni Gherdovich
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Rafael J. Wysocki
Tested-by: Ricardo Neri
Link: https://lkml.kernel.org/r/20200531182453.15254-3-ggherdovich@suse.cz
---
 arch/x86/kernel/smpboot.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 18d292fc466c..20e1cea262e4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -2002,9 +2002,11 @@ out:
 	/*
 	 * Some hypervisors advertise X86_FEATURE_APERFMPERF
 	 * but then fill all MSR's with zeroes.
+	 * Some CPUs have turbo boost but don't declare any turbo ratio
+	 * in MSR_TURBO_RATIO_LIMIT.
 	 */
-	if (!base_freq) {
-		pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n");
+	if (!base_freq || !turbo_freq) {
+		pr_debug("Couldn't determine cpu base or turbo frequency, necessary for scale-invariant accounting.\n");
 		return false;
 	}
--
cgit v1.2.3

From f4291df103315a696f0b8c4f45ca8ae773c17441 Mon Sep 17 00:00:00 2001
From: Giovanni Gherdovich
Date: Sun, 31 May 2020 20:24:53 +0200
Subject: x86, sched: Bail out of frequency invariance if turbo_freq/base_freq
 gives 0

Be defensive against the case where the processor reports a base_freq
larger than turbo_freq (the ratio would be zero).

Fixes: 1567c3e3467c ("x86, sched: Add support for frequency invariance")
Signed-off-by: Giovanni Gherdovich
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Rafael J. Wysocki
Link: https://lkml.kernel.org/r/20200531182453.15254-4-ggherdovich@suse.cz
---
 arch/x86/kernel/smpboot.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 20e1cea262e4..518ac6bf752e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1977,6 +1977,7 @@ static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
 static bool intel_set_max_freq_ratio(void)
 {
 	u64 base_freq, turbo_freq;
+	u64 turbo_ratio;
 
 	if (slv_set_max_freq_ratio(&base_freq, &turbo_freq))
 		goto out;
@@ -2010,9 +2011,15 @@ out:
 		return false;
 	}
 
-	arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
-					base_freq);
+	turbo_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, base_freq);
+	if (!turbo_ratio) {
+		pr_debug("Non-zero turbo and base frequencies led to a 0 ratio.\n");
+		return false;
+	}
+
+	arch_turbo_freq_ratio = turbo_ratio;
 	arch_set_max_freq_ratio(turbo_disabled());
+
 	return true;
 }
--
cgit v1.2.3
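Both fixes guard the same computation: arch_turbo_freq_ratio =
turbo_freq * SCHED_CAPACITY_SCALE / base_freq, with SCHED_CAPACITY_SCALE
being 1024. A stand-alone sketch of the two bail-out conditions
(illustrative only; set_max_freq_ratio() here is a simplified stand-in for
intel_set_max_freq_ratio()):

  #include <stdint.h>
  #include <stdio.h>

  #define SCHED_CAPACITY_SCALE 1024ULL

  /* Returns -1 when no usable ratio can be derived. */
  static int set_max_freq_ratio(uint64_t base_freq, uint64_t turbo_freq,
                                uint64_t *ratio)
  {
          /* zeroed MSRs: hypervisor quirk, or no declared turbo ratio */
          if (!base_freq || !turbo_freq)
                  return -1;

          *ratio = turbo_freq * SCHED_CAPACITY_SCALE / base_freq;

          /* integer division truncates the ratio to 0 when base_freq
           * exceeds 1024 * turbo_freq */
          if (!*ratio)
                  return -1;

          return 0;
  }

  int main(void)
  {
          uint64_t ratio;

          /* e.g. base 3000 MHz, turbo 4500 MHz: 4500 * 1024 / 3000 = 1536 */
          if (!set_max_freq_ratio(3000, 4500, &ratio))
                  printf("arch_turbo_freq_ratio = %llu\n",
                         (unsigned long long)ratio);
          return 0;
  }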
From 25980c7a79af42f2daa73e2f475ebf4cbac8253e Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Sun, 12 Jul 2020 17:59:15 +0100
Subject: arch_topology, sched/core: Cleanup thermal pressure definition

The following commit:

  14533a16c46d ("thermal/cpu-cooling, sched/core: Move the arch_set_thermal_pressure() API to generic scheduler code")

moved the definition of arch_set_thermal_pressure() to sched/core.c, but
kept its declaration in linux/arch_topology.h. When building e.g. an x86
kernel with CONFIG_SCHED_THERMAL_PRESSURE=y, cpufreq_cooling.c ends up
getting the declaration of arch_set_thermal_pressure() from
include/linux/arch_topology.h, which is somewhat awkward.

On top of this, sched/core.c unconditionally defines

  o the thermal_pressure percpu variable
  o arch_set_thermal_pressure()

while arch_scale_thermal_pressure() does nothing unless redefined by the
architecture.

arch_*() functions are meant to be defined by architectures, so revert
the aforementioned commit and re-implement it in a way that keeps
arch_set_thermal_pressure() architecture-definable, and doesn't define
the thermal pressure percpu variable for kernels that don't need it
(CONFIG_SCHED_THERMAL_PRESSURE=n).

Signed-off-by: Valentin Schneider
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20200712165917.9168-2-valentin.schneider@arm.com
---
 arch/arm/include/asm/topology.h   |  3 ++-
 arch/arm64/include/asm/topology.h |  3 ++-
 drivers/base/arch_topology.c      | 11 +++++++++++
 include/linux/arch_topology.h     |  4 ++--
 include/linux/sched/topology.h    |  7 +++++++
 kernel/sched/core.c               | 11 -----------
 6 files changed, 24 insertions(+), 15 deletions(-)

(limited to 'arch')

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 435aba289fc5..e0593cf095d0 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -16,8 +16,9 @@
 /* Enable topology flag updates */
 #define arch_update_cpu_topology topology_update_cpu_topology
 
-/* Replace task scheduler's default thermal pressure retrieve API */
+/* Replace task scheduler's default thermal pressure API */
 #define arch_scale_thermal_pressure topology_get_thermal_pressure
+#define arch_set_thermal_pressure topology_set_thermal_pressure
 
 #else
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index 0cc835ddfcd1..e042f6527981 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -34,8 +34,9 @@ void topology_scale_freq_tick(void);
 /* Enable topology flag updates */
 #define arch_update_cpu_topology topology_update_cpu_topology
 
-/* Replace task scheduler's default thermal pressure retrieve API */
+/* Replace task scheduler's default thermal pressure API */
 #define arch_scale_thermal_pressure topology_get_thermal_pressure
+#define arch_set_thermal_pressure topology_set_thermal_pressure
 
 #include
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 4d0a0038b476..75f72d684294 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -54,6 +54,17 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
 	per_cpu(cpu_scale, cpu) = capacity;
 }
 
+DEFINE_PER_CPU(unsigned long, thermal_pressure);
+
+void topology_set_thermal_pressure(const struct cpumask *cpus,
+				   unsigned long th_pressure)
+{
+	int cpu;
+
+	for_each_cpu(cpu, cpus)
+		WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
+}
+
 static ssize_t cpu_capacity_show(struct device *dev,
 				 struct device_attribute *attr,
 				 char *buf)
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 0566cb3314ef..69b1dabe39dc 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -39,8 +39,8 @@ static inline unsigned long topology_get_thermal_pressure(int cpu)
 	return per_cpu(thermal_pressure, cpu);
 }
 
-void arch_set_thermal_pressure(struct cpumask *cpus,
-			       unsigned long th_pressure);
+void topology_set_thermal_pressure(const struct cpumask *cpus,
+				   unsigned long th_pressure);
 
 struct cpu_topology {
 	int thread_id;
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index fb11091129b3..764222d637b7 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -232,6 +232,13 @@ unsigned long arch_scale_thermal_pressure(int cpu)
 }
 #endif
 
+#ifndef arch_set_thermal_pressure
+static __always_inline
+void arch_set_thermal_pressure(const struct cpumask *cpus,
+			       unsigned long th_pressure)
+{ }
+#endif
+
 static inline int task_node(const struct task_struct *p)
 {
 	return cpu_to_node(task_cpu(p));
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 12db8fbd9c97..bd8e5211d31f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3869,17 +3869,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
-DEFINE_PER_CPU(unsigned long, thermal_pressure);
-
-void arch_set_thermal_pressure(struct cpumask *cpus,
-			       unsigned long th_pressure)
-{
-	int cpu;
-
-	for_each_cpu(cpu, cpus)
-		WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
-}
-
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
--
cgit v1.2.3
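The revert leans on a standard kernel idiom: a generic header supplies a
no-op stub only if the architecture hasn't already #defined the arch_*()
name to its own implementation, as arm/arm64 do above for
arch_set_thermal_pressure(). A self-contained sketch of the idiom (the
names arch_set_pressure/topology_set_pressure are simplified hypothetical
stand-ins):

  #include <stdio.h>

  /* "Arch header": provide an implementation and announce it by
   * #defining the arch_*() name to it. */
  static void topology_set_pressure(unsigned long p)
  {
          printf("arch implementation called: pressure=%lu\n", p);
  }
  #define arch_set_pressure topology_set_pressure

  /* "Generic header": supply the empty stub only when no architecture
   * implementation was #defined before this point. */
  #ifndef arch_set_pressure
  static inline void arch_set_pressure(unsigned long p) { }
  #endif

  int main(void)
  {
          arch_set_pressure(42); /* resolves to topology_set_pressure() */
          return 0;
  }

Because the override is a macro rather than a weak symbol, the generic
stub compiles away entirely on kernels that don't provide an
implementation.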
From e17ae7fea8714caa743aa0d5e446a25a999ad726 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Sun, 12 Jul 2020 17:59:17 +0100
Subject: arm, arm64: Select CONFIG_SCHED_THERMAL_PRESSURE

This option now correctly depends on CPU_FREQ_THERMAL, so select it on
the architectures that implement the required functions,
arch_set_thermal_pressure() and arch_get_thermal_pressure().

Signed-off-by: Valentin Schneider
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Catalin Marinas
Link: https://lkml.kernel.org/r/20200712165917.9168-4-valentin.schneider@arm.com
---
 arch/arm/Kconfig   | 1 +
 arch/arm64/Kconfig | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch')

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2ac74904a3ce..939c4d6bbc2e 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -46,6 +46,7 @@ config ARM
 	select EDAC_ATOMIC_SCRUB
 	select GENERIC_ALLOCATOR
 	select GENERIC_ARCH_TOPOLOGY if ARM_CPU_TOPOLOGY
+	select SCHED_THERMAL_PRESSURE if ARM_CPU_TOPOLOGY
 	select GENERIC_ATOMIC64 if CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI
 	select GENERIC_CLOCKEVENTS_BROADCAST if SMP
 	select GENERIC_CPU_AUTOPROBE
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 66dc41fd49f2..c403e6f5db86 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -192,6 +192,7 @@ config ARM64
 	select PCI_SYSCALL if PCI
 	select POWER_RESET
 	select POWER_SUPPLY
+	select SCHED_THERMAL_PRESSURE
 	select SPARSE_IRQ
 	select SWIOTLB
 	select SYSCTL_EXCEPTION_TRACE
--
cgit v1.2.3

From fcd7c9c3c35aed58aba2eea6d375f0e5b03bd6d6 Mon Sep 17 00:00:00 2001
From: Valentin Schneider
Date: Wed, 29 Jul 2020 14:57:18 +0100
Subject: arm, arm64: Fix selection of CONFIG_SCHED_THERMAL_PRESSURE

Qian reported that the current setup forgoes the Kconfig dependencies
and results in warnings such as:

  WARNING: unmet direct dependencies detected for SCHED_THERMAL_PRESSURE
    Depends on [n]: SMP [=y] && CPU_FREQ_THERMAL [=n]
    Selected by [y]:
    - ARM64 [=y]

Revert commit

  e17ae7fea871 ("arm, arm64: Select CONFIG_SCHED_THERMAL_PRESSURE")

and re-implement it by making the option default to 'y' for arm64 and
arm, which respects Kconfig dependencies (i.e. will remain 'n' if
CPU_FREQ_THERMAL=n).
Fixes: e17ae7fea871 ("arm, arm64: Select CONFIG_SCHED_THERMAL_PRESSURE")
Reported-by: Qian Cai
Signed-off-by: Valentin Schneider
Signed-off-by: Peter Zijlstra (Intel)
Link: https://lkml.kernel.org/r/20200729135718.1871-1-valentin.schneider@arm.com
---
 arch/arm/Kconfig   | 1 -
 arch/arm64/Kconfig | 1 -
 init/Kconfig       | 2 ++
 3 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch')

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 939c4d6bbc2e..2ac74904a3ce 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -46,7 +46,6 @@ config ARM
 	select EDAC_ATOMIC_SCRUB
 	select GENERIC_ALLOCATOR
 	select GENERIC_ARCH_TOPOLOGY if ARM_CPU_TOPOLOGY
-	select SCHED_THERMAL_PRESSURE if ARM_CPU_TOPOLOGY
 	select GENERIC_ATOMIC64 if CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI
 	select GENERIC_CLOCKEVENTS_BROADCAST if SMP
 	select GENERIC_CPU_AUTOPROBE
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c403e6f5db86..66dc41fd49f2 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -192,7 +192,6 @@ config ARM64
 	select PCI_SYSCALL if PCI
 	select POWER_RESET
 	select POWER_SUPPLY
-	select SCHED_THERMAL_PRESSURE
 	select SPARSE_IRQ
 	select SWIOTLB
 	select SYSCTL_EXCEPTION_TRACE
diff --git a/init/Kconfig b/init/Kconfig
index 0a97d85568b2..9f7f249dab43 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -493,6 +493,8 @@ config HAVE_SCHED_AVG_IRQ
 
 config SCHED_THERMAL_PRESSURE
 	bool
+	default y if ARM && ARM_CPU_TOPOLOGY
+	default y if ARM64
 	depends on SMP
 	depends on CPU_FREQ_THERMAL
 	help
--
cgit v1.2.3
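The distinction the fix relies on: 'select' forces a symbol to 'y' while
ignoring that symbol's 'depends on', whereas 'default y' is evaluated only
once the dependencies are satisfied. An annotated Kconfig sketch
(illustrative, not the literal tree contents):

  config SCHED_THERMAL_PRESSURE
  	bool
  	# 'select SCHED_THERMAL_PRESSURE' from ARM64 forced this to 'y'
  	# even with CPU_FREQ_THERMAL=n, producing the warning above.
  	# 'default y' only takes effect when both dependencies hold:
  	default y if ARM64
  	depends on SMP
  	depends on CPU_FREQ_THERMAL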