diff options
-rw-r--r-- | arch/x86/include/asm/topology.h | 20 | ||||
-rw-r--r-- | arch/x86/kernel/smpboot.c | 183 | ||||
-rw-r--r-- | kernel/sched/core.c | 1 | ||||
-rw-r--r-- | kernel/sched/sched.h | 7 |
4 files changed, 210 insertions, 1 deletions
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 4b14d2318251..2ebf7b7b2126 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -193,4 +193,24 @@ static inline void sched_clear_itmt_support(void) } #endif /* CONFIG_SCHED_MC_PRIO */ +#ifdef CONFIG_SMP +#include <asm/cpufeature.h> + +DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key); + +#define arch_scale_freq_invariant() static_branch_likely(&arch_scale_freq_key) + +DECLARE_PER_CPU(unsigned long, arch_freq_scale); + +static inline long arch_scale_freq_capacity(int cpu) +{ + return per_cpu(arch_freq_scale, cpu); +} +#define arch_scale_freq_capacity arch_scale_freq_capacity + +extern void arch_scale_freq_tick(void); +#define arch_scale_freq_tick arch_scale_freq_tick + +#endif + #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 69881b2d446c..28696bccf912 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -147,6 +147,8 @@ static inline void smpboot_restore_warm_reset_vector(void) *((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0; } +static void init_freq_invariance(void); + /* * Report back to the Boot Processor during boot time or to the caller processor * during CPU online. @@ -183,6 +185,8 @@ static void smp_callin(void) */ set_cpu_sibling_map(raw_smp_processor_id()); + init_freq_invariance(); + /* * Get our bogomips. * Update loops_per_jiffy in cpu_data. Previous call to @@ -1337,7 +1341,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_sched_topology(x86_topology); set_cpu_sibling_map(0); - + init_freq_invariance(); smp_sanity_check(); switch (apic_intr_mode) { @@ -1764,3 +1768,180 @@ void native_play_dead(void) } #endif + +/* + * APERF/MPERF frequency ratio computation. + * + * The scheduler wants to do frequency invariant accounting and needs a <1 + * ratio to account for the 'current' frequency, corresponding to + * freq_curr / freq_max. + * + * Since the frequency freq_curr on x86 is controlled by micro-controller and + * our P-state setting is little more than a request/hint, we need to observe + * the effective frequency 'BusyMHz', i.e. the average frequency over a time + * interval after discarding idle time. This is given by: + * + * BusyMHz = delta_APERF / delta_MPERF * freq_base + * + * where freq_base is the max non-turbo P-state. + * + * The freq_max term has to be set to a somewhat arbitrary value, because we + * can't know which turbo states will be available at a given point in time: + * it all depends on the thermal headroom of the entire package. We set it to + * the turbo level with 4 cores active. + * + * Benchmarks show that's a good compromise between the 1C turbo ratio + * (freq_curr/freq_max would rarely reach 1) and something close to freq_base, + * which would ignore the entire turbo range (a conspicuous part, making + * freq_curr/freq_max always maxed out). + * + * Setting freq_max to anything less than the 1C turbo ratio makes the ratio + * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1. + */ + +DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key); + +static DEFINE_PER_CPU(u64, arch_prev_aperf); +static DEFINE_PER_CPU(u64, arch_prev_mperf); +static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE; + +static bool turbo_disabled(void) +{ + u64 misc_en; + int err; + + err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + if (err) + return false; + + return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE); +} + +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> + +#define ICPU(model) \ + {X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0} + +static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = { + ICPU(INTEL_FAM6_XEON_PHI_KNL), + ICPU(INTEL_FAM6_XEON_PHI_KNM), + {} +}; + +static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = { + ICPU(INTEL_FAM6_SKYLAKE_X), + {} +}; + +static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = { + ICPU(INTEL_FAM6_ATOM_GOLDMONT), + ICPU(INTEL_FAM6_ATOM_GOLDMONT_D), + ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS), + {} +}; + +static bool core_set_max_freq_ratio(void) +{ + u64 base_freq, turbo_freq; + int err; + + err = rdmsrl_safe(MSR_PLATFORM_INFO, &base_freq); + if (err) + return false; + + err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &turbo_freq); + if (err) + return false; + + base_freq = (base_freq >> 8) & 0xFF; /* max P state */ + turbo_freq = (turbo_freq >> 24) & 0xFF; /* 4C turbo */ + + arch_max_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE, + base_freq); + return true; +} + +static bool intel_set_max_freq_ratio(void) +{ + /* + * TODO: add support for: + * + * - Xeon Gold/Platinum + * - Xeon Phi (KNM, KNL) + * - Atom Goldmont + * - Atom Silvermont + */ + + if (x86_match_cpu(has_skx_turbo_ratio_limits) || + x86_match_cpu(has_knl_turbo_ratio_limits) || + x86_match_cpu(has_glm_turbo_ratio_limits)) + return false; + + if (turbo_disabled() || core_set_max_freq_ratio()) + return true; + + return false; +} + +static void init_counter_refs(void *arg) +{ + u64 aperf, mperf; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + this_cpu_write(arch_prev_aperf, aperf); + this_cpu_write(arch_prev_mperf, mperf); +} + +static void init_freq_invariance(void) +{ + bool ret = false; + + if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF)) + return; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + ret = intel_set_max_freq_ratio(); + + if (ret) { + on_each_cpu(init_counter_refs, NULL, 1); + static_branch_enable(&arch_scale_freq_key); + } else { + pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n"); + } +} + +DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE; + +void arch_scale_freq_tick(void) +{ + u64 freq_scale; + u64 aperf, mperf; + u64 acnt, mcnt; + + if (!arch_scale_freq_invariant()) + return; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + acnt = aperf - this_cpu_read(arch_prev_aperf); + mcnt = mperf - this_cpu_read(arch_prev_mperf); + if (!mcnt) + return; + + this_cpu_write(arch_prev_aperf, aperf); + this_cpu_write(arch_prev_mperf, mperf); + + acnt <<= 2*SCHED_CAPACITY_SHIFT; + mcnt *= arch_max_freq_ratio; + + freq_scale = div64_u64(acnt, mcnt); + + if (freq_scale > SCHED_CAPACITY_SCALE) + freq_scale = SCHED_CAPACITY_SCALE; + + this_cpu_write(arch_freq_scale, freq_scale); +} diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89e54f3ed571..45f79bcc3146 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3600,6 +3600,7 @@ void scheduler_tick(void) struct task_struct *curr = rq->curr; struct rq_flags rf; + arch_scale_freq_tick(); sched_clock_tick(); rq_lock(rq, &rf); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1a88dc8ad11b..0844e81964e5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1968,6 +1968,13 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ +#ifndef arch_scale_freq_tick +static __always_inline +void arch_scale_freq_tick(void) +{ +} +#endif + #ifndef arch_scale_freq_capacity static __always_inline unsigned long arch_scale_freq_capacity(int cpu) |