diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2009-10-22 18:07:55 +0530 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2009-10-29 09:08:25 +0100 |
commit | d65d153bc23b84e4709a49137a9a03ae139d51a0 (patch) | |
tree | 20ca61665d836089ab7755c5c3d164e0c551c660 | |
parent | 02644174779d41891fb5fd05ce8d94b4f2250a60 (diff) | |
download | lwn-d65d153bc23b84e4709a49137a9a03ae139d51a0.tar.gz lwn-d65d153bc23b84e4709a49137a9a03ae139d51a0.zip |
x86: sched: provide arch implementations using aperf/mperf
APERF/MPERF support for cpu_power.
APERF/MPERF is arch defined to be a relative scale of work capacity
per logical cpu, this is assumed to include SMT and Turbo mode.
APERF/MPERF are specified to both reset to 0 when either counter
wraps, which is highly inconvenient, since that'll give a blip when
that happens. The manual specifies writing 0 to the counters after
each read, but that's 1) too expensive, and 2) destroys the
possibility of sharing these counters with other users, so we live
with the blip - the other existing user does too.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Dinakar Guniguntala <dino@in.ibm.com>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Darren Hart <dvhltc@us.ibm.com>
Cc: John Kacur <jkacur@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-rw-r--r-- | arch/x86/kernel/cpu/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/sched.c | 58 | ||||
-rw-r--r-- | include/linux/sched.h | 4 |
3 files changed, 63 insertions, 1 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index c1f253dac155..8dd30638fe44 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -13,7 +13,7 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o +obj-y += vmware.o hypervisor.o sched.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/sched.c b/arch/x86/kernel/cpu/sched.c new file mode 100644 index 000000000000..2e59c7162222 --- /dev/null +++ b/arch/x86/kernel/cpu/sched.c @@ -0,0 +1,58 @@ +#include <linux/sched.h> +#include <linux/math64.h> +#include <linux/percpu.h> +#include <linux/irqflags.h> + +#include <asm/cpufeature.h> +#include <asm/processor.h> + +static DEFINE_PER_CPU(struct aperfmperf, old_aperfmperf); + +static unsigned long scale_aperfmperf(void) +{ + struct aperfmperf cur, val, *old = &__get_cpu_var(old_aperfmperf); + unsigned long ratio = SCHED_LOAD_SCALE; + unsigned long flags; + + local_irq_save(flags); + get_aperfmperf(&val); + local_irq_restore(flags); + + cur = val; + cur.aperf -= old->aperf; + cur.mperf -= old->mperf; + *old = val; + + cur.mperf >>= SCHED_LOAD_SHIFT; + if (cur.mperf) + ratio = div_u64(cur.aperf, cur.mperf); + + return ratio; +} + +unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + /* + * do aperf/mperf on the cpu level because it includes things + * like turbo mode, which are relevant to full cores. 
+ */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return scale_aperfmperf(); + + /* + * maybe have something cpufreq here + */ + + return default_scale_freq_power(sd, cpu); +} + +unsigned long arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + /* + * aperf/mperf already includes the smt gain + */ + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + return SCHED_LOAD_SCALE; + + return default_scale_smt_power(sd, cpu); +} diff --git a/include/linux/sched.h b/include/linux/sched.h index de9f89e184a4..c231a2467a83 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1047,6 +1047,10 @@ partition_sched_domains(int ndoms_new, struct cpumask *doms_new, } #endif /* !CONFIG_SMP */ + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); + struct io_context; /* See blkdev.h */ |