summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/x86/include/asm/topology.h20
-rw-r--r--arch/x86/kernel/smpboot.c183
-rw-r--r--kernel/sched/core.c1
-rw-r--r--kernel/sched/sched.h7
4 files changed, 210 insertions, 1 deletions
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 4b14d2318251..2ebf7b7b2126 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -193,4 +193,24 @@ static inline void sched_clear_itmt_support(void)
}
#endif /* CONFIG_SCHED_MC_PRIO */
+#ifdef CONFIG_SMP
+#include <asm/cpufeature.h>
+
+DECLARE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+#define arch_scale_freq_invariant() static_branch_likely(&arch_scale_freq_key)
+
+DECLARE_PER_CPU(unsigned long, arch_freq_scale);
+
+static inline long arch_scale_freq_capacity(int cpu)
+{
+ return per_cpu(arch_freq_scale, cpu);
+}
+#define arch_scale_freq_capacity arch_scale_freq_capacity
+
+extern void arch_scale_freq_tick(void);
+#define arch_scale_freq_tick arch_scale_freq_tick
+
+#endif
+
#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 69881b2d446c..28696bccf912 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -147,6 +147,8 @@ static inline void smpboot_restore_warm_reset_vector(void)
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
+static void init_freq_invariance(void);
+
/*
* Report back to the Boot Processor during boot time or to the caller processor
* during CPU online.
@@ -183,6 +185,8 @@ static void smp_callin(void)
*/
set_cpu_sibling_map(raw_smp_processor_id());
+ init_freq_invariance();
+
/*
* Get our bogomips.
* Update loops_per_jiffy in cpu_data. Previous call to
@@ -1337,7 +1341,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
-
+ init_freq_invariance();
smp_sanity_check();
switch (apic_intr_mode) {
@@ -1764,3 +1768,180 @@ void native_play_dead(void)
}
#endif
+
+/*
+ * APERF/MPERF frequency ratio computation.
+ *
+ * The scheduler wants to do frequency invariant accounting and needs a <1
+ * ratio to account for the 'current' frequency, corresponding to
+ * freq_curr / freq_max.
+ *
+ * Since the frequency freq_curr on x86 is controlled by micro-controller and
+ * our P-state setting is little more than a request/hint, we need to observe
+ * the effective frequency 'BusyMHz', i.e. the average frequency over a time
+ * interval after discarding idle time. This is given by:
+ *
+ * BusyMHz = delta_APERF / delta_MPERF * freq_base
+ *
+ * where freq_base is the max non-turbo P-state.
+ *
+ * The freq_max term has to be set to a somewhat arbitrary value, because we
+ * can't know which turbo states will be available at a given point in time:
+ * it all depends on the thermal headroom of the entire package. We set it to
+ * the turbo level with 4 cores active.
+ *
+ * Benchmarks show that's a good compromise between the 1C turbo ratio
+ * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
+ * which would ignore the entire turbo range (a conspicuous part, making
+ * freq_curr/freq_max always maxed out).
+ *
+ * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
+ * freq_curr / freq_max to eventually grow >1, in which case we clip it to 1.
+ */
+
+DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);
+
+static DEFINE_PER_CPU(u64, arch_prev_aperf);
+static DEFINE_PER_CPU(u64, arch_prev_mperf);
+static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;
+
+static bool turbo_disabled(void)
+{
+ u64 misc_en;
+ int err;
+
+ err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en);
+ if (err)
+ return false;
+
+ return (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE);
+}
+
+#include <asm/cpu_device_id.h>
+#include <asm/intel-family.h>
+
+#define ICPU(model) \
+ {X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0}
+
+static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
+ ICPU(INTEL_FAM6_XEON_PHI_KNL),
+ ICPU(INTEL_FAM6_XEON_PHI_KNM),
+ {}
+};
+
+static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
+ ICPU(INTEL_FAM6_SKYLAKE_X),
+ {}
+};
+
+static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
+ ICPU(INTEL_FAM6_ATOM_GOLDMONT),
+ ICPU(INTEL_FAM6_ATOM_GOLDMONT_D),
+ ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS),
+ {}
+};
+
+static bool core_set_max_freq_ratio(void)
+{
+ u64 base_freq, turbo_freq;
+ int err;
+
+ err = rdmsrl_safe(MSR_PLATFORM_INFO, &base_freq);
+ if (err)
+ return false;
+
+ err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &turbo_freq);
+ if (err)
+ return false;
+
+ base_freq = (base_freq >> 8) & 0xFF; /* max P state */
+ turbo_freq = (turbo_freq >> 24) & 0xFF; /* 4C turbo */
+
+ arch_max_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
+ base_freq);
+ return true;
+}
+
+static bool intel_set_max_freq_ratio(void)
+{
+ /*
+ * TODO: add support for:
+ *
+ * - Xeon Gold/Platinum
+ * - Xeon Phi (KNM, KNL)
+ * - Atom Goldmont
+ * - Atom Silvermont
+ */
+
+ if (x86_match_cpu(has_skx_turbo_ratio_limits) ||
+ x86_match_cpu(has_knl_turbo_ratio_limits) ||
+ x86_match_cpu(has_glm_turbo_ratio_limits))
+ return false;
+
+ if (turbo_disabled() || core_set_max_freq_ratio())
+ return true;
+
+ return false;
+}
+
+static void init_counter_refs(void *arg)
+{
+ u64 aperf, mperf;
+
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+
+ this_cpu_write(arch_prev_aperf, aperf);
+ this_cpu_write(arch_prev_mperf, mperf);
+}
+
+static void init_freq_invariance(void)
+{
+ bool ret = false;
+
+ if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF))
+ return;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ ret = intel_set_max_freq_ratio();
+
+ if (ret) {
+ on_each_cpu(init_counter_refs, NULL, 1);
+ static_branch_enable(&arch_scale_freq_key);
+ } else {
+ pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
+ }
+}
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
+
+void arch_scale_freq_tick(void)
+{
+ u64 freq_scale;
+ u64 aperf, mperf;
+ u64 acnt, mcnt;
+
+ if (!arch_scale_freq_invariant())
+ return;
+
+ rdmsrl(MSR_IA32_APERF, aperf);
+ rdmsrl(MSR_IA32_MPERF, mperf);
+
+ acnt = aperf - this_cpu_read(arch_prev_aperf);
+ mcnt = mperf - this_cpu_read(arch_prev_mperf);
+ if (!mcnt)
+ return;
+
+ this_cpu_write(arch_prev_aperf, aperf);
+ this_cpu_write(arch_prev_mperf, mperf);
+
+ acnt <<= 2*SCHED_CAPACITY_SHIFT;
+ mcnt *= arch_max_freq_ratio;
+
+ freq_scale = div64_u64(acnt, mcnt);
+
+ if (freq_scale > SCHED_CAPACITY_SCALE)
+ freq_scale = SCHED_CAPACITY_SCALE;
+
+ this_cpu_write(arch_freq_scale, freq_scale);
+}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 89e54f3ed571..45f79bcc3146 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3600,6 +3600,7 @@ void scheduler_tick(void)
struct task_struct *curr = rq->curr;
struct rq_flags rf;
+ arch_scale_freq_tick();
sched_clock_tick();
rq_lock(rq, &rf);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1a88dc8ad11b..0844e81964e5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1968,6 +1968,13 @@ static inline int hrtick_enabled(struct rq *rq)
#endif /* CONFIG_SCHED_HRTICK */
+#ifndef arch_scale_freq_tick
+static __always_inline
+void arch_scale_freq_tick(void)
+{
+}
+#endif
+
#ifndef arch_scale_freq_capacity
static __always_inline
unsigned long arch_scale_freq_capacity(int cpu)