summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-10-01 22:06:40 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2015-10-01 22:06:40 -0400
commit1bca1000fa71a1092947b4a51928abe80a3316d2 (patch)
treed229750e2baeeba923722697fd0c40d4288442fc
parent3deaa4f531506a12ac4860ccd83cb6cbcb15a7eb (diff)
parenteb6d1c287ae1f7221248d5be26a5b1560073c09e (diff)
downloadlwn-1bca1000fa71a1092947b4a51928abe80a3316d2.tar.gz
lwn-1bca1000fa71a1092947b4a51928abe80a3316d2.zip
Merge tag 'pm+acpi-4.3-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm
Pull power management and ACPI fixes from Rafael Wysocki: "These are fixes mostly, for a few changes made in this cycle (the intel_idle driver, the OPP library, the ACPI EC driver, turbostat) and for some issues that have just been discovered (ACPI PCI IRQ management, PCI power management documentation, turbostat), with a couple of cleanups on top of them. Specifics: - intel_idle driver fixup for the recently added Skylake chips support (Len Brown). - Operating Performance Points (OPP) library fix related to the recently added support for new DT bindings and a fix for a typo in a comment (Viresh Kumar, Stephen Boyd). - ACPI EC driver fix for a recently introduced memory leak in an error code path (Lv Zheng). - ACPI PCI IRQ management fix for the issue where an ISA IRQ is shared with a PCI device which requires it to be configured in a different way and may cause an interrupt storm to happen as a result with an extra ACPI SCI IRQ handling simplification on top of it (Jiang Liu). - Update of the PCI power management documentation that became outdated and started to actively confuse the readers to make it actually reflect the code (Rafael J Wysocki). - turbostat fixes including an IVB Xeon regression fix (related to the --debug command line option), Skylake adjustment for the TSC running at a frequency that doesn't match the base one exactly, and a Knights Landing quirk to account for the fact that it only updates APERF and MPERF every 1024 clock cycles plus bumping up the turbostat version number (Len Brown, Hubert Chrzaniuk)" * tag 'pm+acpi-4.3-rc4' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm: tools/power turbosat: update version number tools/power turbostat: SKL: Adjust for TSC difference from base frequency tools/power turbostat: KNL workaround for %Busy and Avg_MHz tools/power turbostat: IVB Xeon: fix --debug regression ACPI / PCI: Remove duplicated penalty on SCI IRQ ACPI, PCI, irq: Do not share PCI IRQ with ISA IRQ ACPI / EC: Fix a memory leak issue in acpi_ec_query() PM / OPP: Fix typo modifcation -> modification PCI / PM: Update runtime PM documentation for PCI devices PM / OPP: of_property_count_u32_elems() can return errors intel_idle: Skylake Client Support - updated
-rw-r--r--Documentation/power/pci.txt51
-rw-r--r--drivers/acpi/ec.c2
-rw-r--r--drivers/acpi/pci_irq.c1
-rw-r--r--drivers/acpi/pci_link.c16
-rw-r--r--drivers/base/power/opp.c17
-rw-r--r--drivers/idle/intel_idle.c12
-rw-r--r--drivers/pci/pci-driver.c7
-rw-r--r--include/linux/acpi.h1
-rw-r--r--tools/power/x86/turbostat/turbostat.c39
9 files changed, 116 insertions, 30 deletions
diff --git a/Documentation/power/pci.txt b/Documentation/power/pci.txt
index 62328d76b55b..b0e911e0e8f5 100644
--- a/Documentation/power/pci.txt
+++ b/Documentation/power/pci.txt
@@ -979,20 +979,45 @@ every time right after the runtime_resume() callback has returned
(alternatively, the runtime_suspend() callback will have to check if the
device should really be suspended and return -EAGAIN if that is not the case).
-The runtime PM of PCI devices is disabled by default. It is also blocked by
-pci_pm_init() that runs the pm_runtime_forbid() helper function. If a PCI
-driver implements the runtime PM callbacks and intends to use the runtime PM
-framework provided by the PM core and the PCI subsystem, it should enable this
-feature by executing the pm_runtime_enable() helper function. However, the
-driver should not call the pm_runtime_allow() helper function unblocking
-the runtime PM of the device. Instead, it should allow user space or some
-platform-specific code to do that (user space can do it via sysfs), although
-once it has called pm_runtime_enable(), it must be prepared to handle the
+The runtime PM of PCI devices is enabled by default by the PCI core. PCI
+device drivers do not need to enable it and should not attempt to do so.
+However, it is blocked by pci_pm_init() that runs the pm_runtime_forbid()
+helper function. In addition to that, the runtime PM usage counter of
+each PCI device is incremented by local_pci_probe() before executing the
+probe callback provided by the device's driver.
+
+If a PCI driver implements the runtime PM callbacks and intends to use the
+runtime PM framework provided by the PM core and the PCI subsystem, it needs
+to decrement the device's runtime PM usage counter in its probe callback
+function. If it doesn't do that, the counter will always be different from
+zero for the device and it will never be runtime-suspended. The simplest
+way to do that is by calling pm_runtime_put_noidle(), but if the driver
+wants to schedule an autosuspend right away, for example, it may call
+pm_runtime_put_autosuspend() instead for this purpose. Generally, it
+just needs to call a function that decrements the devices usage counter
+from its probe routine to make runtime PM work for the device.
+
+It is important to remember that the driver's runtime_suspend() callback
+may be executed right after the usage counter has been decremented, because
+user space may already have cuased the pm_runtime_allow() helper function
+unblocking the runtime PM of the device to run via sysfs, so the driver must
+be prepared to cope with that.
+
+The driver itself should not call pm_runtime_allow(), though. Instead, it
+should let user space or some platform-specific code do that (user space can
+do it via sysfs as stated above), but it must be prepared to handle the
runtime PM of the device correctly as soon as pm_runtime_allow() is called
-(which may happen at any time). [It also is possible that user space causes
-pm_runtime_allow() to be called via sysfs before the driver is loaded, so in
-fact the driver has to be prepared to handle the runtime PM of the device as
-soon as it calls pm_runtime_enable().]
+(which may happen at any time, even before the driver is loaded).
+
+When the driver's remove callback runs, it has to balance the decrementation
+of the device's runtime PM usage counter at the probe time. For this reason,
+if it has decremented the counter in its probe callback, it must run
+pm_runtime_get_noresume() in its remove callback. [Since the core carries
+out a runtime resume of the device and bumps up the device's usage counter
+before running the driver's remove callback, the runtime PM of the device
+is effectively disabled for the duration of the remove execution and all
+runtime PM helper functions incrementing the device's usage counter are
+then effectively equivalent to pm_runtime_get_noresume().]
The runtime PM framework works by processing requests to suspend or resume
devices, or to check if they are idle (in which cases it is reasonable to
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 2614a839c60d..42c66b64c12c 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -1044,8 +1044,10 @@ static int acpi_ec_query(struct acpi_ec *ec, u8 *data)
goto err_exit;
mutex_lock(&ec->mutex);
+ result = -ENODATA;
list_for_each_entry(handler, &ec->list, node) {
if (value == handler->query_bit) {
+ result = 0;
q->handler = acpi_ec_get_query_handler(handler);
ec_dbg_evt("Query(0x%02x) scheduled",
q->handler->query_bit);
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index 6da0f9beab19..c9336751e5e3 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -372,6 +372,7 @@ static int acpi_isa_register_gsi(struct pci_dev *dev)
/* Interrupt Line values above 0xF are forbidden */
if (dev->irq > 0 && (dev->irq <= 0xF) &&
+ acpi_isa_irq_available(dev->irq) &&
(acpi_isa_irq_to_gsi(dev->irq, &dev_gsi) == 0)) {
dev_warn(&dev->dev, "PCI INT %c: no GSI - using ISA IRQ %d\n",
pin_name(dev->pin), dev->irq);
diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c
index 3b4ea98e3ea0..7c8408b946ca 100644
--- a/drivers/acpi/pci_link.c
+++ b/drivers/acpi/pci_link.c
@@ -498,8 +498,7 @@ int __init acpi_irq_penalty_init(void)
PIRQ_PENALTY_PCI_POSSIBLE;
}
}
- /* Add a penalty for the SCI */
- acpi_irq_penalty[acpi_gbl_FADT.sci_interrupt] += PIRQ_PENALTY_PCI_USING;
+
return 0;
}
@@ -553,6 +552,13 @@ static int acpi_pci_link_allocate(struct acpi_pci_link *link)
irq = link->irq.possible[i];
}
}
+ if (acpi_irq_penalty[irq] >= PIRQ_PENALTY_ISA_ALWAYS) {
+ printk(KERN_ERR PREFIX "No IRQ available for %s [%s]. "
+ "Try pci=noacpi or acpi=off\n",
+ acpi_device_name(link->device),
+ acpi_device_bid(link->device));
+ return -ENODEV;
+ }
/* Attempt to enable the link device at this IRQ. */
if (acpi_pci_link_set(link, irq)) {
@@ -821,6 +827,12 @@ void acpi_penalize_isa_irq(int irq, int active)
}
}
+bool acpi_isa_irq_available(int irq)
+{
+ return irq >= 0 && (irq >= ARRAY_SIZE(acpi_irq_penalty) ||
+ acpi_irq_penalty[irq] < PIRQ_PENALTY_ISA_ALWAYS);
+}
+
/*
* Penalize IRQ used by ACPI SCI. If ACPI SCI pin attributes conflict with
* PCI IRQ attributes, mark ACPI SCI as ISA_ALWAYS so it won't be use for
diff --git a/drivers/base/power/opp.c b/drivers/base/power/opp.c
index 28cd75c535b0..7ae7cd990fbf 100644
--- a/drivers/base/power/opp.c
+++ b/drivers/base/power/opp.c
@@ -892,10 +892,17 @@ static int opp_get_microvolt(struct dev_pm_opp *opp, struct device *dev)
u32 microvolt[3] = {0};
int count, ret;
- count = of_property_count_u32_elems(opp->np, "opp-microvolt");
- if (!count)
+ /* Missing property isn't a problem, but an invalid entry is */
+ if (!of_find_property(opp->np, "opp-microvolt", NULL))
return 0;
+ count = of_property_count_u32_elems(opp->np, "opp-microvolt");
+ if (count < 0) {
+ dev_err(dev, "%s: Invalid opp-microvolt property (%d)\n",
+ __func__, count);
+ return count;
+ }
+
/* There can be one or three elements here */
if (count != 1 && count != 3) {
dev_err(dev, "%s: Invalid number of elements in opp-microvolt property (%d)\n",
@@ -1063,7 +1070,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_add);
* share a common logic which is isolated here.
*
* Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modifcation was done OR modification was
+ * copy operation, returns 0 if no modification was done OR modification was
* successful.
*
* Locking: The internal device_opp and opp structures are RCU protected.
@@ -1151,7 +1158,7 @@ unlock:
* mutex locking or synchronize_rcu() blocking calls cannot be used.
*
* Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modifcation was done OR modification was
+ * copy operation, returns 0 if no modification was done OR modification was
* successful.
*/
int dev_pm_opp_enable(struct device *dev, unsigned long freq)
@@ -1177,7 +1184,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_enable);
* mutex locking or synchronize_rcu() blocking calls cannot be used.
*
* Return: -EINVAL for bad pointers, -ENOMEM if no memory available for the
- * copy operation, returns 0 if no modifcation was done OR modification was
+ * copy operation, returns 0 if no modification was done OR modification was
* successful.
*/
int dev_pm_opp_disable(struct device *dev, unsigned long freq)
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 3a3738fe016b..cd4510a63375 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -620,7 +620,7 @@ static struct cpuidle_state skl_cstates[] = {
.name = "C6-SKL",
.desc = "MWAIT 0x20",
.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
- .exit_latency = 75,
+ .exit_latency = 85,
.target_residency = 200,
.enter = &intel_idle,
.enter_freeze = intel_idle_freeze, },
@@ -636,11 +636,19 @@ static struct cpuidle_state skl_cstates[] = {
.name = "C8-SKL",
.desc = "MWAIT 0x40",
.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
- .exit_latency = 174,
+ .exit_latency = 200,
.target_residency = 800,
.enter = &intel_idle,
.enter_freeze = intel_idle_freeze, },
{
+ .name = "C9-SKL",
+ .desc = "MWAIT 0x50",
+ .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 480,
+ .target_residency = 5000,
+ .enter = &intel_idle,
+ .enter_freeze = intel_idle_freeze, },
+ {
.name = "C10-SKL",
.desc = "MWAIT 0x60",
.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index dd652f2ae03d..108a3118ace7 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -299,9 +299,10 @@ static long local_pci_probe(void *_ddi)
* Unbound PCI devices are always put in D0, regardless of
* runtime PM status. During probe, the device is set to
* active and the usage count is incremented. If the driver
- * supports runtime PM, it should call pm_runtime_put_noidle()
- * in its probe routine and pm_runtime_get_noresume() in its
- * remove routine.
+ * supports runtime PM, it should call pm_runtime_put_noidle(),
+ * or any other runtime PM helper function decrementing the usage
+ * count, in its probe routine and pm_runtime_get_noresume() in
+ * its remove routine.
*/
pm_runtime_get_sync(dev);
pci_dev->driver = pci_drv;
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 7235c4851460..43856d19cf4d 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -217,6 +217,7 @@ struct pci_dev;
int acpi_pci_irq_enable (struct pci_dev *dev);
void acpi_penalize_isa_irq(int irq, int active);
+bool acpi_isa_irq_available(int irq);
void acpi_penalize_sci_irq(int irq, int trigger, int polarity);
void acpi_pci_irq_disable (struct pci_dev *dev);
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 9655cb49c7cb..bde0ef1a63df 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -71,8 +71,11 @@ unsigned int extra_msr_offset32;
unsigned int extra_msr_offset64;
unsigned int extra_delta_offset32;
unsigned int extra_delta_offset64;
+unsigned int aperf_mperf_multiplier = 1;
int do_smi;
double bclk;
+double base_hz;
+double tsc_tweak = 1.0;
unsigned int show_pkg;
unsigned int show_core;
unsigned int show_cpu;
@@ -502,7 +505,7 @@ int format_counters(struct thread_data *t, struct core_data *c,
/* %Busy */
if (has_aperf) {
if (!skip_c0)
- outp += sprintf(outp, "%8.2f", 100.0 * t->mperf/t->tsc);
+ outp += sprintf(outp, "%8.2f", 100.0 * t->mperf/t->tsc/tsc_tweak);
else
outp += sprintf(outp, "********");
}
@@ -510,7 +513,7 @@ int format_counters(struct thread_data *t, struct core_data *c,
/* Bzy_MHz */
if (has_aperf)
outp += sprintf(outp, "%8.0f",
- 1.0 * t->tsc / units * t->aperf / t->mperf / interval_float);
+ 1.0 * t->tsc * tsc_tweak / units * t->aperf / t->mperf / interval_float);
/* TSC_MHz */
outp += sprintf(outp, "%8.0f", 1.0 * t->tsc/units/interval_float);
@@ -984,6 +987,8 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
return -3;
if (get_msr(cpu, MSR_IA32_MPERF, &t->mperf))
return -4;
+ t->aperf = t->aperf * aperf_mperf_multiplier;
+ t->mperf = t->mperf * aperf_mperf_multiplier;
}
if (do_smi) {
@@ -1149,6 +1154,19 @@ int slv_pkg_cstate_limits[16] = {PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV,
int amt_pkg_cstate_limits[16] = {PCL__0, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV};
int phi_pkg_cstate_limits[16] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV, PCLRSV};
+
+static void
+calculate_tsc_tweak()
+{
+ unsigned long long msr;
+ unsigned int base_ratio;
+
+ get_msr(base_cpu, MSR_NHM_PLATFORM_INFO, &msr);
+ base_ratio = (msr >> 8) & 0xFF;
+ base_hz = base_ratio * bclk * 1000000;
+ tsc_tweak = base_hz / tsc_hz;
+}
+
static void
dump_nhm_platform_info(void)
{
@@ -1926,8 +1944,6 @@ int has_config_tdp(unsigned int family, unsigned int model)
switch (model) {
case 0x3A: /* IVB */
- case 0x3E: /* IVB Xeon */
-
case 0x3C: /* HSW */
case 0x3F: /* HSX */
case 0x45: /* HSW */
@@ -2543,6 +2559,13 @@ int is_knl(unsigned int family, unsigned int model)
return 0;
}
+unsigned int get_aperf_mperf_multiplier(unsigned int family, unsigned int model)
+{
+ if (is_knl(family, model))
+ return 1024;
+ return 1;
+}
+
#define SLM_BCLK_FREQS 5
double slm_freq_table[SLM_BCLK_FREQS] = { 83.3, 100.0, 133.3, 116.7, 80.0};
@@ -2744,6 +2767,9 @@ void process_cpuid()
}
}
+ if (has_aperf)
+ aperf_mperf_multiplier = get_aperf_mperf_multiplier(family, model);
+
do_nhm_platform_info = do_nhm_cstates = do_smi = probe_nhm_msrs(family, model);
do_snb_cstates = has_snb_msrs(family, model);
do_pc2 = do_snb_cstates && (pkg_cstate_limit >= PCL__2);
@@ -2762,6 +2788,9 @@ void process_cpuid()
if (debug)
dump_cstate_pstate_config_info();
+ if (has_skl_msrs(family, model))
+ calculate_tsc_tweak();
+
return;
}
@@ -3090,7 +3119,7 @@ int get_and_dump_counters(void)
}
void print_version() {
- fprintf(stderr, "turbostat version 4.7 17-June, 2015"
+ fprintf(stderr, "turbostat version 4.8 26-Sep, 2015"
" - Len Brown <lenb@kernel.org>\n");
}