From e5f1b245870d59be0e6cc3b33edf5406a3b59648 Mon Sep 17 00:00:00 2001
From: Daniel Lezcano <daniel.lezcano@linaro.org>
Date: Wed, 5 Oct 2016 09:33:12 +0200
Subject: cpuidle: governors: Remove remaining old module code

The governor's code use try_module_get() and put_module() to refcount
the governor's module. But the governors are not compiled as module.

The refcount does not prevent to switch the governor or unload
a module as they aren't compiled as modules. The code is pointless,
so remove it.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/governor.c         | 4 ----
 drivers/cpuidle/governors/ladder.c | 2 --
 drivers/cpuidle/governors/menu.c   | 2 --
 include/linux/cpuidle.h            | 2 --
 4 files changed, 10 deletions(-)

diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index fb9f511cca23..4e78263e34a4 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -9,7 +9,6 @@
  */
 
 #include <linux/mutex.h>
-#include <linux/module.h>
 #include <linux/cpuidle.h>
 
 #include "cpuidle.h"
@@ -53,14 +52,11 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov)
 	if (cpuidle_curr_governor) {
 		list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
 			cpuidle_disable_device(dev);
-		module_put(cpuidle_curr_governor->owner);
 	}
 
 	cpuidle_curr_governor = gov;
 
 	if (gov) {
-		if (!try_module_get(cpuidle_curr_governor->owner))
-			return -EINVAL;
 		list_for_each_entry(dev, &cpuidle_detected_devices, device_list)
 			cpuidle_enable_device(dev);
 		cpuidle_install_idle_handler();
diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
index 63bd5a403e22..fe8f08948fcb 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -15,7 +15,6 @@
 #include <linux/kernel.h>
 #include <linux/cpuidle.h>
 #include <linux/pm_qos.h>
-#include <linux/module.h>
 #include <linux/jiffies.h>
 #include <linux/tick.h>
 
@@ -177,7 +176,6 @@ static struct cpuidle_governor ladder_governor = {
 	.enable =	ladder_enable_device,
 	.select =	ladder_select_state,
 	.reflect =	ladder_reflect,
-	.owner =	THIS_MODULE,
 };
 
 /**
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 03d38c291de6..d9b5b9398a0f 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -19,7 +19,6 @@
 #include <linux/tick.h>
 #include <linux/sched.h>
 #include <linux/math64.h>
-#include <linux/module.h>
 
 /*
  * Please note when changing the tuning values:
@@ -484,7 +483,6 @@ static struct cpuidle_governor menu_governor = {
 	.enable =	menu_enable_device,
 	.select =	menu_select,
 	.reflect =	menu_reflect,
-	.owner =	THIS_MODULE,
 };
 
 /**
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bb31373c3478..15deea449edc 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -235,8 +235,6 @@ struct cpuidle_governor {
 	int  (*select)		(struct cpuidle_driver *drv,
 					struct cpuidle_device *dev);
 	void (*reflect)		(struct cpuidle_device *dev, int index);
-
-	struct module 		*owner;
 };
 
 #ifdef CONFIG_CPU_IDLE
-- 
cgit v1.2.3


From f5261402494a4893e7c5c6cd5ab99b8f7589b717 Mon Sep 17 00:00:00 2001
From: Shawn Lin <shawn.lin@rock-chips.com>
Date: Mon, 10 Oct 2016 20:44:22 +0800
Subject: PM / AVS: rockchip-io: make the log more consistent

When testing SD hotplug automatically, I got bunch of
useless log like this:

[  588.357838] mmc0: card 0007 removed
[  589.492664] rockchip-iodomain ff770000.syscon:io-domains: Setting to 3300000 done
[  589.500698] vccio_sd: ramp_delay not set
[  589.504817] rockchip-iodomain ff770000.syscon:io-domains: Setting to 3300000 done
[  589.669705] rockchip-iodomain ff770000.syscon:io-domains: Setting to 3300000 done
[  589.677593] vccio_sd: ramp_delay not set
[  589.681581] rockchip-iodomain ff770000.syscon:io-domains: Setting to 1800000 done
[  590.032820] dwmmc_rockchip ff0c0000.dwmmc: Successfully tuned phase to 140
[  590.039725] mmc0: new ultra high speed SDR50 SDHC card at address 0007
[  590.046641] mmcblk0: mmc0:0007 SD32G 29.3 GiB
[  590.052163]  mmcblk0: p1

Moreover the code is intent to print the 'uV' for debug but
later print it using dev_info. It looks more like to me that
it should be the real intention of the code. Anyway, let's
mark this verbose log as debug message.

Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
Reviewed-by: Heiko Stuebner <heiko@sntech.de>
Acked-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/power/avs/rockchip-io-domain.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/power/avs/rockchip-io-domain.c b/drivers/power/avs/rockchip-io-domain.c
index 01b6d3f9b8fb..56bce1908be2 100644
--- a/drivers/power/avs/rockchip-io-domain.c
+++ b/drivers/power/avs/rockchip-io-domain.c
@@ -143,7 +143,7 @@ static int rockchip_iodomain_notify(struct notifier_block *nb,
 	if (ret && event == REGULATOR_EVENT_PRE_VOLTAGE_CHANGE)
 		return NOTIFY_BAD;
 
-	dev_info(supply->iod->dev, "Setting to %d done\n", uV);
+	dev_dbg(supply->iod->dev, "Setting to %d done\n", uV);
 	return NOTIFY_OK;
 }
 
-- 
cgit v1.2.3


From 974f86498ed28d86575cd2327bd83e53abd872da Mon Sep 17 00:00:00 2001
From: "Prakash, Prashanth" <pprakash@codeaurora.org>
Date: Fri, 14 Oct 2016 12:00:23 -0600
Subject: cpufreq / CPPC: Add MODULE_DEVICE_TABLE for cppc_cpufreq driver

MODULE_DEVICE_TABLE is added so that CPPC cpufreq module can be
automatically loaded when we have a acpi processor device with
"ACPI0007" hid.

Signed-off-by: Prashanth Prakash <pprakash@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cppc_cpufreq.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 4852d9efe74e..e82bb3c30b92 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -247,3 +247,10 @@ MODULE_DESCRIPTION("CPUFreq driver based on the ACPI CPPC v5.0+ spec");
 MODULE_LICENSE("GPL");
 
 late_initcall(cppc_cpufreq_init);
+
+static const struct acpi_device_id cppc_acpi_ids[] = {
+	{ACPI_PROCESSOR_DEVICE_HID, },
+	{}
+};
+
+MODULE_DEVICE_TABLE(acpi, cppc_acpi_ids);
-- 
cgit v1.2.3


From 4bee77dc734ccd4119bb750479809ebc82776bef Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 20 Oct 2016 15:44:53 +0900
Subject: PM / OPP: make _of_get_opp_desc_node() a static function

Since commit f47b72a15a96 ("PM / OPP: Move CONFIG_OF dependent code
in a separate file"), this function is defined and called only in
drivers/base/power/opp/of.c .

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/of.c  | 2 +-
 drivers/base/power/opp/opp.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/base/power/opp/of.c b/drivers/base/power/opp/of.c
index 5552211e6fcd..6480137bd256 100644
--- a/drivers/base/power/opp/of.c
+++ b/drivers/base/power/opp/of.c
@@ -198,7 +198,7 @@ void dev_pm_opp_of_remove_table(struct device *dev)
 EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table);
 
 /* Returns opp descriptor node for a device, caller must do of_node_put() */
-struct device_node *_of_get_opp_desc_node(struct device *dev)
+static struct device_node *_of_get_opp_desc_node(struct device *dev)
 {
 	/*
 	 * TODO: Support for multiple OPP tables.
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index fabd5ca1a083..96cd30ac6c1d 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -190,7 +190,6 @@ struct opp_table {
 /* Routines internal to opp core */
 struct opp_table *_find_opp_table(struct device *dev);
 struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table);
-struct device_node *_of_get_opp_desc_node(struct device *dev);
 void _dev_pm_opp_remove_table(struct device *dev, bool remove_all);
 struct dev_pm_opp *_allocate_opp(struct device *dev, struct opp_table **opp_table);
 int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table);
-- 
cgit v1.2.3


From 349aa92e81bd14552d8d2335aff490f306038603 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 20 Oct 2016 16:12:49 +0900
Subject: PM / OPP: fix debug/error messages in
 dev_pm_opp_of_get_sharing_cpus()

These log messages are wrong because _of_get_opp_desc_node() returns
an operating-points-v2 node.

Commit a6eed752f5fb ("PM / OPP: passing NULL to PTR_ERR()") fixed
static checker warnings, and reworded the messages at the same time
(but the latter was not mentioned in the git-log).

Restore the correct messages.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/of.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/base/power/opp/of.c b/drivers/base/power/opp/of.c
index 6480137bd256..5b3755e49731 100644
--- a/drivers/base/power/opp/of.c
+++ b/drivers/base/power/opp/of.c
@@ -562,7 +562,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev,
 	/* Get OPP descriptor node */
 	np = _of_get_opp_desc_node(cpu_dev);
 	if (!np) {
-		dev_dbg(cpu_dev, "%s: Couldn't find cpu_dev node.\n", __func__);
+		dev_dbg(cpu_dev, "%s: Couldn't find opp node.\n", __func__);
 		return -ENOENT;
 	}
 
@@ -587,7 +587,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev,
 		/* Get OPP descriptor node */
 		tmp_np = _of_get_opp_desc_node(tcpu_dev);
 		if (!tmp_np) {
-			dev_err(tcpu_dev, "%s: Couldn't find tcpu_dev node.\n",
+			dev_err(tcpu_dev, "%s: Couldn't find opp node.\n",
 				__func__);
 			ret = -ENOENT;
 			goto put_cpu_node;
-- 
cgit v1.2.3


From 62006c1702b3b1be0c0726949e0ee0ea2326be9c Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 17 Oct 2016 20:16:58 +0200
Subject: PM / Runtime: Remove the exported function pm_children_suspended()

The exported function pm_children_suspended() has only one caller, which is
the runtime PM internal function, rpm_check_suspend_allowed().

Let's clean-up this code, by removing pm_children_suspended() altogether
and instead do the one-liner check directly in rpm_check_suspend_allowed().

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 3 ++-
 include/linux/pm_runtime.h   | 7 -------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 82a081ea4317..53b427dfc403 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -241,7 +241,8 @@ static int rpm_check_suspend_allowed(struct device *dev)
 		retval = -EACCES;
 	else if (atomic_read(&dev->power.usage_count) > 0)
 		retval = -EAGAIN;
-	else if (!pm_children_suspended(dev))
+	else if (!dev->power.ignore_children &&
+			atomic_read(&dev->power.child_count))
 		retval = -EBUSY;
 
 	/* Pending resume requests take precedence over suspends. */
diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 2e14d2667b6c..61ea5666c94c 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -61,12 +61,6 @@ static inline void pm_suspend_ignore_children(struct device *dev, bool enable)
 	dev->power.ignore_children = enable;
 }
 
-static inline bool pm_children_suspended(struct device *dev)
-{
-	return dev->power.ignore_children
-		|| !atomic_read(&dev->power.child_count);
-}
-
 static inline void pm_runtime_get_noresume(struct device *dev)
 {
 	atomic_inc(&dev->power.usage_count);
@@ -162,7 +156,6 @@ static inline void pm_runtime_allow(struct device *dev) {}
 static inline void pm_runtime_forbid(struct device *dev) {}
 
 static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {}
-static inline bool pm_children_suspended(struct device *dev) { return false; }
 static inline void pm_runtime_get_noresume(struct device *dev) {}
 static inline void pm_runtime_put_noidle(struct device *dev) {}
 static inline bool device_run_wake(struct device *dev) { return false; }
-- 
cgit v1.2.3


From 216ef0b6b8c7f041a618913e94da52c3fdf82a99 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 17 Oct 2016 20:16:59 +0200
Subject: PM / Runtime: Clarify comment in rpm_resume() when resuming the
 parent

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 53b427dfc403..117db71e84cb 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -713,8 +713,8 @@ static int rpm_resume(struct device *dev, int rpmflags)
 
 		spin_lock(&parent->power.lock);
 		/*
-		 * We can resume if the parent's runtime PM is disabled or it
-		 * is set to ignore children.
+		 * Resume the parent if it has runtime PM enabled and not been
+		 * set to ignore its children.
 		 */
 		if (!parent->power.disable_depth
 		    && !parent->power.ignore_children) {
-- 
cgit v1.2.3


From b1a60995a684f2b6052cda640b0704361ab40089 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 17 Oct 2016 20:17:00 +0200
Subject: PM / Runtime: Convert pm_runtime_set_suspended() to return an int

Because pm_runtime_set_suspended() invokes __pm_runtime_set_status(), which
can fail, pm_runtime_set_suspended() can also fail.

Instead of hiding a potential error, let's propagate it by converting
pm_runtime_set_suspended() from a void to return an int. In this way users
are able to check the error code and act accordingly.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_runtime.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h
index 61ea5666c94c..4957fc185ea9 100644
--- a/include/linux/pm_runtime.h
+++ b/include/linux/pm_runtime.h
@@ -258,9 +258,9 @@ static inline int pm_runtime_set_active(struct device *dev)
 	return __pm_runtime_set_status(dev, RPM_ACTIVE);
 }
 
-static inline void pm_runtime_set_suspended(struct device *dev)
+static inline int pm_runtime_set_suspended(struct device *dev)
 {
-	__pm_runtime_set_status(dev, RPM_SUSPENDED);
+	return __pm_runtime_set_status(dev, RPM_SUSPENDED);
 }
 
 static inline void pm_runtime_disable(struct device *dev)
-- 
cgit v1.2.3


From 1d29815ef2d2dd0fb1a62494a9c158c0dc49e2c8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 19 Oct 2016 02:53:26 +0200
Subject: cpufreq: intel_pstate: Drop boost_iowait flag

The "IOwait boosting" mechanism is only used by the
get_target_pstate_use_cpu_load() governor function and the
boost_iowait flag in pid_params is always set when that function
is in use (and it is never set otherwise).  This means that the
boost_iowait flag is in fact redundant and may be dropped.

For this reason, replace the boost_iowait flag check in
intel_pstate_update_util() with an equivalent check against
pstate_funcs.get_target_pstate and drop that flag.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 drivers/cpufreq/intel_pstate.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index f535f8123258..16d071a618b7 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -233,7 +233,6 @@ static struct cpudata **all_cpu_data;
  * @p_gain_pct:		PID proportional gain
  * @i_gain_pct:		PID integral gain
  * @d_gain_pct:		PID derivative gain
- * @boost_iowait:	Whether or not to use iowait boosting.
  *
  * Stores per CPU model static PID configuration data.
  */
@@ -245,7 +244,6 @@ struct pstate_adjust_policy {
 	int p_gain_pct;
 	int d_gain_pct;
 	int i_gain_pct;
-	bool boost_iowait;
 };
 
 /**
@@ -1043,7 +1041,6 @@ static const struct cpu_defaults silvermont_params = {
 		.p_gain_pct = 14,
 		.d_gain_pct = 0,
 		.i_gain_pct = 4,
-		.boost_iowait = true,
 	},
 	.funcs = {
 		.get_max = atom_get_max_pstate,
@@ -1065,7 +1062,6 @@ static const struct cpu_defaults airmont_params = {
 		.p_gain_pct = 14,
 		.d_gain_pct = 0,
 		.i_gain_pct = 4,
-		.boost_iowait = true,
 	},
 	.funcs = {
 		.get_max = atom_get_max_pstate,
@@ -1107,7 +1103,6 @@ static const struct cpu_defaults bxt_params = {
 		.p_gain_pct = 14,
 		.d_gain_pct = 0,
 		.i_gain_pct = 4,
-		.boost_iowait = true,
 	},
 	.funcs = {
 		.get_max = core_get_max_pstate,
@@ -1347,7 +1342,7 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time,
 	struct cpudata *cpu = container_of(data, struct cpudata, update_util);
 	u64 delta_ns;
 
-	if (pid_params.boost_iowait) {
+	if (pstate_funcs.get_target_pstate == get_target_pstate_use_cpu_load) {
 		if (flags & SCHED_CPUFREQ_IOWAIT) {
 			cpu->iowait_boost = int_tofp(1);
 		} else if (cpu->iowait_boost) {
-- 
cgit v1.2.3


From 185d82456edaa5a7a8c8827840b8a675c60579c0 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Fri, 21 Oct 2016 09:38:20 -0700
Subject: cpufreq: intel_pstate: Remove PID debugfs when not used

When target state is calculated using get_target_pstate_use_cpu_load(),
PID controller is not used, hence it has no effect on performance.
So don't present debugfs entries to tune PID controller.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 16d071a618b7..d46e881ff4d7 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -636,8 +636,10 @@ static void __init intel_pstate_debug_expose_params(void)
 	struct dentry *debugfs_parent;
 	int i = 0;
 
-	if (hwp_active)
+	if (hwp_active ||
+	    pstate_funcs.get_target_pstate == get_target_pstate_use_cpu_load)
 		return;
+
 	debugfs_parent = debugfs_create_dir("pstate_snb", NULL);
 	if (IS_ERR_OR_NULL(debugfs_parent))
 		return;
-- 
cgit v1.2.3


From b8b97a4252547159fa4f3f46a8aa862c401cf969 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Thu, 20 Oct 2016 13:37:06 -0700
Subject: Documentation: intel_pstate: PID tuning is not always available

PID tuning is not available when the get_target_pstate_use_cpu_load()
is used to calculate target_state. So update the documentation.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpu-freq/intel-pstate.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/Documentation/cpu-freq/intel-pstate.txt b/Documentation/cpu-freq/intel-pstate.txt
index e6bd1e6512a5..5528d6d59687 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -122,11 +122,12 @@ will run at is selected by the processor itself.
 
 Tuning Intel P-State driver
 
-When HWP mode is not used, debugfs files have also been added to allow the
-tuning of the internal governor algorithm. These files are located at
-/sys/kernel/debug/pstate_snb/. The algorithm uses a PID (Proportional
-Integral Derivative) controller. The PID tunable parameters are:
+When the performance can be tuned using PID (Proportional Integral
+Derivative) controller, debugfs files are provided for adjusting performance.
+They are presented under:
+/sys/kernel/debug/pstate_snb/
 
+The PID tunable parameters are:
       deadband
       d_gain_pct
       i_gain_pct
-- 
cgit v1.2.3


From 59d65b73a23cee48e6f3e44686f199d79b7ee854 Mon Sep 17 00:00:00 2001
From: Lina Iyer <lina.iyer@linaro.org>
Date: Fri, 14 Oct 2016 10:47:49 -0700
Subject: PM / Domains: Make genpd state allocation dynamic

Allow PM Domain states to be defined dynamically by the drivers. This
removes the limitation on the maximum number of states possible for a
domain.

Suggested-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Lina Iyer <lina.iyer@linaro.org>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/arm/mach-imx/gpc.c     | 17 ++++++++++-------
 drivers/base/power/domain.c | 35 +++++++++++++++++++++++------------
 include/linux/pm_domain.h   |  5 ++---
 3 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/arch/arm/mach-imx/gpc.c b/arch/arm/mach-imx/gpc.c
index 0df062d8b2c9..57a410bbb6a2 100644
--- a/arch/arm/mach-imx/gpc.c
+++ b/arch/arm/mach-imx/gpc.c
@@ -380,13 +380,6 @@ static struct pu_domain imx6q_pu_domain = {
 		.name = "PU",
 		.power_off = imx6q_pm_pu_power_off,
 		.power_on = imx6q_pm_pu_power_on,
-		.states = {
-			[0] = {
-				.power_off_latency_ns = 25000,
-				.power_on_latency_ns = 2000000,
-			},
-		},
-		.state_count = 1,
 	},
 };
 
@@ -430,6 +423,16 @@ static int imx_gpc_genpd_init(struct device *dev, struct regulator *pu_reg)
 	if (!IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS))
 		return 0;
 
+	imx6q_pu_domain.base.states = devm_kzalloc(dev,
+					sizeof(*imx6q_pu_domain.base.states),
+					GFP_KERNEL);
+	if (!imx6q_pu_domain.base.states)
+		return -ENOMEM;
+
+	imx6q_pu_domain.base.states[0].power_off_latency_ns = 25000;
+	imx6q_pu_domain.base.states[0].power_on_latency_ns = 2000000;
+	imx6q_pu_domain.base.state_count = 1;
+
 	pm_genpd_init(&imx6q_pu_domain.base, NULL, false);
 	return of_genpd_add_provider_onecell(dev->of_node,
 					     &imx_gpc_onecell_data);
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index e023066e4215..37ab7f1ef178 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -1282,6 +1282,21 @@ out:
 }
 EXPORT_SYMBOL_GPL(pm_genpd_remove_subdomain);
 
+static int genpd_set_default_power_state(struct generic_pm_domain *genpd)
+{
+	struct genpd_power_state *state;
+
+	state = kzalloc(sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+
+	genpd->states = state;
+	genpd->state_count = 1;
+	genpd->free = state;
+
+	return 0;
+}
+
 /**
  * pm_genpd_init - Initialize a generic I/O PM domain object.
  * @genpd: PM domain object to initialize.
@@ -1293,6 +1308,8 @@ EXPORT_SYMBOL_GPL(pm_genpd_remove_subdomain);
 int pm_genpd_init(struct generic_pm_domain *genpd,
 		  struct dev_power_governor *gov, bool is_off)
 {
+	int ret;
+
 	if (IS_ERR_OR_NULL(genpd))
 		return -EINVAL;
 
@@ -1325,19 +1342,12 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
 		genpd->dev_ops.start = pm_clk_resume;
 	}
 
-	if (genpd->state_idx >= GENPD_MAX_NUM_STATES) {
-		pr_warn("Initial state index out of bounds.\n");
-		genpd->state_idx = GENPD_MAX_NUM_STATES - 1;
-	}
-
-	if (genpd->state_count > GENPD_MAX_NUM_STATES) {
-		pr_warn("Limiting states to  %d\n", GENPD_MAX_NUM_STATES);
-		genpd->state_count = GENPD_MAX_NUM_STATES;
-	}
-
 	/* Use only one "off" state if there were no states declared */
-	if (genpd->state_count == 0)
-		genpd->state_count = 1;
+	if (genpd->state_count == 0) {
+		ret = genpd_set_default_power_state(genpd);
+		if (ret)
+			return ret;
+	}
 
 	mutex_lock(&gpd_list_lock);
 	list_add(&genpd->gpd_list_node, &gpd_list);
@@ -1377,6 +1387,7 @@ static int genpd_remove(struct generic_pm_domain *genpd)
 	list_del(&genpd->gpd_list_node);
 	mutex_unlock(&genpd->lock);
 	cancel_work_sync(&genpd->power_off_work);
+	kfree(genpd->free);
 	pr_debug("%s: removed %s\n", __func__, genpd->name);
 
 	return 0;
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index a09fe5c009c8..de1d8f331b03 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -19,8 +19,6 @@
 /* Defines used for the flags field in the struct generic_pm_domain */
 #define GENPD_FLAG_PM_CLK	(1U << 0) /* PM domain uses PM clk */
 
-#define GENPD_MAX_NUM_STATES	8 /* Number of possible low power states */
-
 enum gpd_status {
 	GPD_STATE_ACTIVE = 0,	/* PM domain is active */
 	GPD_STATE_POWER_OFF,	/* PM domain is off */
@@ -70,9 +68,10 @@ struct generic_pm_domain {
 	void (*detach_dev)(struct generic_pm_domain *domain,
 			   struct device *dev);
 	unsigned int flags;		/* Bit field of configs for genpd */
-	struct genpd_power_state states[GENPD_MAX_NUM_STATES];
+	struct genpd_power_state *states;
 	unsigned int state_count; /* number of states */
 	unsigned int state_idx; /* state that genpd will go to when off */
+	void *free; /* Free the state that was allocated for default */
 
 };
 
-- 
cgit v1.2.3


From 405f7226014093a2809f27ba32a8230e770ac876 Mon Sep 17 00:00:00 2001
From: Lina Iyer <lina.iyer@linaro.org>
Date: Fri, 14 Oct 2016 10:47:50 -0700
Subject: PM / Domains: Add residency property to genpd states

Residency of a domain's idle state indicates that the minimum idle time
for the domain's idle state to be beneficial for power. Add the
parameter to the state node. Future patches, will use the residency
value in the genpd governor to determine if it is worth while to enter
an idle state.

Signed-off-by: Lina Iyer <lina.iyer@linaro.org>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_domain.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index de1d8f331b03..f4492eb71701 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -38,6 +38,7 @@ struct gpd_dev_ops {
 struct genpd_power_state {
 	s64 power_off_latency_ns;
 	s64 power_on_latency_ns;
+	s64 residency_ns;
 };
 
 struct generic_pm_domain {
-- 
cgit v1.2.3


From 30f604283e05d34cb10108c7ba017e5f4fc9d62c Mon Sep 17 00:00:00 2001
From: Lina Iyer <lina.iyer@linaro.org>
Date: Fri, 14 Oct 2016 10:47:51 -0700
Subject: PM / Domains: Allow domain power states to be read from DT

This patch allows domains to define idle states in the DT. SoC's can
define domain idle states in DT using the "domain-idle-states" property
of the domain provider. Add API to read the idle states from DT that can
be set in the genpd object.

This patch is based on the original patch by Marc Titinger.

Signed-off-by: Marc Titinger <mtitinger+renesas@baylibre.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Lina Iyer <lina.iyer@linaro.org>
Reviewed-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 94 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/pm_domain.h   |  8 ++++
 2 files changed, 102 insertions(+)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 37ab7f1ef178..9af75ba0472a 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -1916,6 +1916,100 @@ out:
 	return ret ? -EPROBE_DEFER : 0;
 }
 EXPORT_SYMBOL_GPL(genpd_dev_pm_attach);
+
+static const struct of_device_id idle_state_match[] = {
+	{ .compatible = "arm,idle-state", },
+	{ }
+};
+
+static int genpd_parse_state(struct genpd_power_state *genpd_state,
+				    struct device_node *state_node)
+{
+	int err;
+	u32 residency;
+	u32 entry_latency, exit_latency;
+	const struct of_device_id *match_id;
+
+	match_id = of_match_node(idle_state_match, state_node);
+	if (!match_id)
+		return -EINVAL;
+
+	err = of_property_read_u32(state_node, "entry-latency-us",
+						&entry_latency);
+	if (err) {
+		pr_debug(" * %s missing entry-latency-us property\n",
+						state_node->full_name);
+		return -EINVAL;
+	}
+
+	err = of_property_read_u32(state_node, "exit-latency-us",
+						&exit_latency);
+	if (err) {
+		pr_debug(" * %s missing exit-latency-us property\n",
+						state_node->full_name);
+		return -EINVAL;
+	}
+
+	err = of_property_read_u32(state_node, "min-residency-us", &residency);
+	if (!err)
+		genpd_state->residency_ns = 1000 * residency;
+
+	genpd_state->power_on_latency_ns = 1000 * exit_latency;
+	genpd_state->power_off_latency_ns = 1000 * entry_latency;
+
+	return 0;
+}
+
+/**
+ * of_genpd_parse_idle_states: Return array of idle states for the genpd.
+ *
+ * @dn: The genpd device node
+ * @states: The pointer to which the state array will be saved.
+ * @n: The count of elements in the array returned from this function.
+ *
+ * Returns the device states parsed from the OF node. The memory for the states
+ * is allocated by this function and is the responsibility of the caller to
+ * free the memory after use.
+ */
+int of_genpd_parse_idle_states(struct device_node *dn,
+			struct genpd_power_state **states, int *n)
+{
+	struct genpd_power_state *st;
+	struct device_node *np;
+	int i = 0;
+	int err, ret;
+	int count;
+	struct of_phandle_iterator it;
+
+	count = of_count_phandle_with_args(dn, "domain-idle-states", NULL);
+	if (!count)
+		return -EINVAL;
+
+	st = kcalloc(count, sizeof(*st), GFP_KERNEL);
+	if (!st)
+		return -ENOMEM;
+
+	/* Loop over the phandles until all the requested entry is found */
+	of_for_each_phandle(&it, err, dn, "domain-idle-states", NULL, 0) {
+		np = it.node;
+		ret = genpd_parse_state(&st[i++], np);
+		if (ret) {
+			pr_err
+			("Parsing idle state node %s failed with err %d\n",
+							np->full_name, ret);
+			of_node_put(np);
+			kfree(st);
+			return ret;
+		}
+	}
+
+	*n = count;
+	*states = st;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(of_genpd_parse_idle_states);
+
 #endif /* CONFIG_PM_GENERIC_DOMAINS_OF */
 
 
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index f4492eb71701..b4894969fbec 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -205,6 +205,8 @@ extern int of_genpd_add_device(struct of_phandle_args *args,
 extern int of_genpd_add_subdomain(struct of_phandle_args *parent,
 				  struct of_phandle_args *new_subdomain);
 extern struct generic_pm_domain *of_genpd_remove_last(struct device_node *np);
+extern int of_genpd_parse_idle_states(struct device_node *dn,
+			struct genpd_power_state **states, int *n);
 
 int genpd_dev_pm_attach(struct device *dev);
 #else /* !CONFIG_PM_GENERIC_DOMAINS_OF */
@@ -234,6 +236,12 @@ static inline int of_genpd_add_subdomain(struct of_phandle_args *parent,
 	return -ENODEV;
 }
 
+static inline int of_genpd_parse_idle_states(struct device_node *dn,
+			struct genpd_power_state **states, int *n)
+{
+	return -ENODEV;
+}
+
 static inline int genpd_dev_pm_attach(struct device *dev)
 {
 	return -ENODEV;
-- 
cgit v1.2.3


From 0c9b694a8a7d4853318c4f2ce315afa2bd3664b6 Mon Sep 17 00:00:00 2001
From: Lina Iyer <lina.iyer@linaro.org>
Date: Fri, 14 Oct 2016 10:47:52 -0700
Subject: PM / Domains: Save the fwnode in genpd_power_state

Save the fwnode for the genpd state in the state node. PM Domain clients
may use the fwnode to read in the platform specific domain state
properties and associate them with the state.

Signed-off-by: Lina Iyer <lina.iyer@linaro.org>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 1 +
 include/linux/pm_domain.h   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 9af75ba0472a..1a6073aaca0e 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -1956,6 +1956,7 @@ static int genpd_parse_state(struct genpd_power_state *genpd_state,
 
 	genpd_state->power_on_latency_ns = 1000 * exit_latency;
 	genpd_state->power_off_latency_ns = 1000 * entry_latency;
+	genpd_state->fwnode = &state_node->fwnode;
 
 	return 0;
 }
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index b4894969fbec..6a8988166899 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -39,6 +39,7 @@ struct genpd_power_state {
 	s64 power_off_latency_ns;
 	s64 power_on_latency_ns;
 	s64 residency_ns;
+	struct fwnode_handle *fwnode;
 };
 
 struct generic_pm_domain {
-- 
cgit v1.2.3


From 70bb510e42797792d9fc4842f01fa2fb685717b6 Mon Sep 17 00:00:00 2001
From: Lina Iyer <lina.iyer@linaro.org>
Date: Fri, 14 Oct 2016 10:47:53 -0700
Subject: dt/bindings / PM/Domains: Update binding for PM domain idle states

Update DT bindings to describe idle states of PM domains.

This patch is based on the original patch by Marc Titinger.

Signed-off-by: Marc Titinger <mtitinger+renesas@baylibre.com>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Lina Iyer <lina.iyer@linaro.org>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../devicetree/bindings/power/power_domain.txt     | 43 ++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt
index 025b5e7df61c..e1650364b296 100644
--- a/Documentation/devicetree/bindings/power/power_domain.txt
+++ b/Documentation/devicetree/bindings/power/power_domain.txt
@@ -29,6 +29,15 @@ Optional properties:
    specified by this binding. More details about power domain specifier are
    available in the next section.
 
+- domain-idle-states : A phandle of an idle-state that shall be soaked into a
+                generic domain power state. The idle state definitions are
+                compatible with arm,idle-state specified in [1].
+  The domain-idle-state property reflects the idle state of this PM domain and
+  not the idle states of the devices or sub-domains in the PM domain. Devices
+  and sub-domains have their own idle-states independent of the parent
+  domain's idle states. In the absence of this property, the domain would be
+  considered as capable of being powered-on or powered-off.
+
 Example:
 
 	power: power-controller@12340000 {
@@ -59,6 +68,38 @@ The nodes above define two power controllers: 'parent' and 'child'.
 Domains created by the 'child' power controller are subdomains of '0' power
 domain provided by the 'parent' power controller.
 
+Example 3:
+	parent: power-controller@12340000 {
+		compatible = "foo,power-controller";
+		reg = <0x12340000 0x1000>;
+		#power-domain-cells = <0>;
+		domain-idle-states = <&DOMAIN_RET>, <&DOMAIN_PWR_DN>;
+	};
+
+	child: power-controller@12341000 {
+		compatible = "foo,power-controller";
+		reg = <0x12341000 0x1000>;
+		power-domains = <&parent 0>;
+		#power-domain-cells = <0>;
+		domain-idle-states = <&DOMAIN_PWR_DN>;
+	};
+
+	DOMAIN_RET: state@0 {
+		compatible = "arm,idle-state";
+		reg = <0x0>;
+		entry-latency-us = <1000>;
+		exit-latency-us = <2000>;
+		min-residency-us = <10000>;
+	};
+
+	DOMAIN_PWR_DN: state@1 {
+		compatible = "arm,idle-state";
+		reg = <0x1>;
+		entry-latency-us = <5000>;
+		exit-latency-us = <8000>;
+		min-residency-us = <7000>;
+	};
+
 ==PM domain consumers==
 
 Required properties:
@@ -76,3 +117,5 @@ Example:
 The node above defines a typical PM domain consumer device, which is located
 inside a PM domain with index 0 of a power controller represented by a node
 with the label "power".
+
+[1]. Documentation/devicetree/bindings/arm/idle-states.txt
-- 
cgit v1.2.3


From 35241d12f750d2f1556a9c85f175ce7044716881 Mon Sep 17 00:00:00 2001
From: Lina Iyer <lina.iyer@linaro.org>
Date: Fri, 14 Oct 2016 10:47:54 -0700
Subject: PM / Domains: Abstract genpd locking

Abstract genpd lock/unlock calls, in preparation for domain specific
locks added in the following patches.

Signed-off-by: Lina Iyer <lina.iyer@linaro.org>
Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 121 +++++++++++++++++++++++++++++---------------
 include/linux/pm_domain.h   |   5 +-
 2 files changed, 85 insertions(+), 41 deletions(-)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 1a6073aaca0e..4194012cdf86 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -39,6 +39,46 @@
 static LIST_HEAD(gpd_list);
 static DEFINE_MUTEX(gpd_list_lock);
 
+struct genpd_lock_ops {
+	void (*lock)(struct generic_pm_domain *genpd);
+	void (*lock_nested)(struct generic_pm_domain *genpd, int depth);
+	int (*lock_interruptible)(struct generic_pm_domain *genpd);
+	void (*unlock)(struct generic_pm_domain *genpd);
+};
+
+static void genpd_lock_mtx(struct generic_pm_domain *genpd)
+{
+	mutex_lock(&genpd->mlock);
+}
+
+static void genpd_lock_nested_mtx(struct generic_pm_domain *genpd,
+					int depth)
+{
+	mutex_lock_nested(&genpd->mlock, depth);
+}
+
+static int genpd_lock_interruptible_mtx(struct generic_pm_domain *genpd)
+{
+	return mutex_lock_interruptible(&genpd->mlock);
+}
+
+static void genpd_unlock_mtx(struct generic_pm_domain *genpd)
+{
+	return mutex_unlock(&genpd->mlock);
+}
+
+static const struct genpd_lock_ops genpd_mtx_ops = {
+	.lock = genpd_lock_mtx,
+	.lock_nested = genpd_lock_nested_mtx,
+	.lock_interruptible = genpd_lock_interruptible_mtx,
+	.unlock = genpd_unlock_mtx,
+};
+
+#define genpd_lock(p)			p->lock_ops->lock(p)
+#define genpd_lock_nested(p, d)		p->lock_ops->lock_nested(p, d)
+#define genpd_lock_interruptible(p)	p->lock_ops->lock_interruptible(p)
+#define genpd_unlock(p)			p->lock_ops->unlock(p)
+
 /*
  * Get the generic PM domain for a particular struct device.
  * This validates the struct device pointer, the PM domain pointer,
@@ -200,9 +240,9 @@ static int genpd_poweron(struct generic_pm_domain *genpd, unsigned int depth)
 
 		genpd_sd_counter_inc(master);
 
-		mutex_lock_nested(&master->lock, depth + 1);
+		genpd_lock_nested(master, depth + 1);
 		ret = genpd_poweron(master, depth + 1);
-		mutex_unlock(&master->lock);
+		genpd_unlock(master);
 
 		if (ret) {
 			genpd_sd_counter_dec(master);
@@ -255,9 +295,9 @@ static int genpd_dev_pm_qos_notifier(struct notifier_block *nb,
 		spin_unlock_irq(&dev->power.lock);
 
 		if (!IS_ERR(genpd)) {
-			mutex_lock(&genpd->lock);
+			genpd_lock(genpd);
 			genpd->max_off_time_changed = true;
-			mutex_unlock(&genpd->lock);
+			genpd_unlock(genpd);
 		}
 
 		dev = dev->parent;
@@ -354,9 +394,9 @@ static void genpd_power_off_work_fn(struct work_struct *work)
 
 	genpd = container_of(work, struct generic_pm_domain, power_off_work);
 
-	mutex_lock(&genpd->lock);
+	genpd_lock(genpd);
 	genpd_poweroff(genpd, true);
-	mutex_unlock(&genpd->lock);
+	genpd_unlock(genpd);
 }
 
 /**
@@ -472,9 +512,9 @@ static int genpd_runtime_suspend(struct device *dev)
 	if (dev->power.irq_safe)
 		return 0;
 
-	mutex_lock(&genpd->lock);
+	genpd_lock(genpd);
 	genpd_poweroff(genpd, false);
-	mutex_unlock(&genpd->lock);
+	genpd_unlock(genpd);
 
 	return 0;
 }
@@ -509,9 +549,9 @@ static int genpd_runtime_resume(struct device *dev)
 		goto out;
 	}
 
-	mutex_lock(&genpd->lock);
+	genpd_lock(genpd);
 	ret = genpd_poweron(genpd, 0);
-	mutex_unlock(&genpd->lock);
+	genpd_unlock(genpd);
 
 	if (ret)
 		return ret;
@@ -547,9 +587,9 @@ err_stop:
 	genpd_stop_dev(genpd, dev);
 err_poweroff:
 	if (!dev->power.irq_safe) {
-		mutex_lock(&genpd->lock);
+		genpd_lock(genpd);
 		genpd_poweroff(genpd, 0);
-		mutex_unlock(&genpd->lock);
+		genpd_unlock(genpd);
 	}
 
 	return ret;
@@ -732,20 +772,20 @@ static int pm_genpd_prepare(struct device *dev)
 	if (resume_needed(dev, genpd))
 		pm_runtime_resume(dev);
 
-	mutex_lock(&genpd->lock);
+	genpd_lock(genpd);
 
 	if (genpd->prepared_count++ == 0)
 		genpd->suspended_count = 0;
 
-	mutex_unlock(&genpd->lock);
+	genpd_unlock(genpd);
 
 	ret = pm_generic_prepare(dev);
 	if (ret) {
-		mutex_lock(&genpd->lock);
+		genpd_lock(genpd);
 
 		genpd->prepared_count--;
 
-		mutex_unlock(&genpd->lock);
+		genpd_unlock(genpd);
 	}
 
 	return ret;
@@ -936,13 +976,13 @@ static void pm_genpd_complete(struct device *dev)
 
 	pm_generic_complete(dev);
 
-	mutex_lock(&genpd->lock);
+	genpd_lock(genpd);
 
 	genpd->prepared_count--;
 	if (!genpd->prepared_count)
 		genpd_queue_power_off_work(genpd);
 
-	mutex_unlock(&genpd->lock);
+	genpd_unlock(genpd);
 }
 
 /**
@@ -1071,7 +1111,7 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
 	if (IS_ERR(gpd_data))
 		return PTR_ERR(gpd_data);
 
-	mutex_lock(&genpd->lock);
+	genpd_lock(genpd);
 
 	if (genpd->prepared_count > 0) {
 		ret = -EAGAIN;
@@ -1088,7 +1128,7 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev,
 	list_add_tail(&gpd_data->base.list_node, &genpd->dev_list);
 
  out:
-	mutex_unlock(&genpd->lock);
+	genpd_unlock(genpd);
 
 	if (ret)
 		genpd_free_dev_data(dev, gpd_data);
@@ -1130,7 +1170,7 @@ static int genpd_remove_device(struct generic_pm_domain *genpd,
 	gpd_data = to_gpd_data(pdd);
 	dev_pm_qos_remove_notifier(dev, &gpd_data->nb);
 
-	mutex_lock(&genpd->lock);
+	genpd_lock(genpd);
 
 	if (genpd->prepared_count > 0) {
 		ret = -EAGAIN;
@@ -1145,14 +1185,14 @@ static int genpd_remove_device(struct generic_pm_domain *genpd,
 
 	list_del_init(&pdd->list_node);
 
-	mutex_unlock(&genpd->lock);
+	genpd_unlock(genpd);
 
 	genpd_free_dev_data(dev, gpd_data);
 
 	return 0;
 
  out:
-	mutex_unlock(&genpd->lock);
+	genpd_unlock(genpd);
 	dev_pm_qos_add_notifier(dev, &gpd_data->nb);
 
 	return ret;
@@ -1187,8 +1227,8 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd,
 	if (!link)
 		return -ENOMEM;
 
-	mutex_lock(&subdomain->lock);
-	mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
+	genpd_lock(subdomain);
+	genpd_lock_nested(genpd, SINGLE_DEPTH_NESTING);
 
 	if (genpd->status == GPD_STATE_POWER_OFF
 	    &&  subdomain->status != GPD_STATE_POWER_OFF) {
@@ -1211,8 +1251,8 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd,
 		genpd_sd_counter_inc(genpd);
 
  out:
-	mutex_unlock(&genpd->lock);
-	mutex_unlock(&subdomain->lock);
+	genpd_unlock(genpd);
+	genpd_unlock(subdomain);
 	if (ret)
 		kfree(link);
 	return ret;
@@ -1250,8 +1290,8 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 	if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(subdomain))
 		return -EINVAL;
 
-	mutex_lock(&subdomain->lock);
-	mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING);
+	genpd_lock(subdomain);
+	genpd_lock_nested(genpd, SINGLE_DEPTH_NESTING);
 
 	if (!list_empty(&subdomain->master_links) || subdomain->device_count) {
 		pr_warn("%s: unable to remove subdomain %s\n", genpd->name,
@@ -1275,8 +1315,8 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 	}
 
 out:
-	mutex_unlock(&genpd->lock);
-	mutex_unlock(&subdomain->lock);
+	genpd_unlock(genpd);
+	genpd_unlock(subdomain);
 
 	return ret;
 }
@@ -1316,7 +1356,8 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
 	INIT_LIST_HEAD(&genpd->master_links);
 	INIT_LIST_HEAD(&genpd->slave_links);
 	INIT_LIST_HEAD(&genpd->dev_list);
-	mutex_init(&genpd->lock);
+	mutex_init(&genpd->mlock);
+	genpd->lock_ops = &genpd_mtx_ops;
 	genpd->gov = gov;
 	INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn);
 	atomic_set(&genpd->sd_count, 0);
@@ -1364,16 +1405,16 @@ static int genpd_remove(struct generic_pm_domain *genpd)
 	if (IS_ERR_OR_NULL(genpd))
 		return -EINVAL;
 
-	mutex_lock(&genpd->lock);
+	genpd_lock(genpd);
 
 	if (genpd->has_provider) {
-		mutex_unlock(&genpd->lock);
+		genpd_unlock(genpd);
 		pr_err("Provider present, unable to remove %s\n", genpd->name);
 		return -EBUSY;
 	}
 
 	if (!list_empty(&genpd->master_links) || genpd->device_count) {
-		mutex_unlock(&genpd->lock);
+		genpd_unlock(genpd);
 		pr_err("%s: unable to remove %s\n", __func__, genpd->name);
 		return -EBUSY;
 	}
@@ -1385,7 +1426,7 @@ static int genpd_remove(struct generic_pm_domain *genpd)
 	}
 
 	list_del(&genpd->gpd_list_node);
-	mutex_unlock(&genpd->lock);
+	genpd_unlock(genpd);
 	cancel_work_sync(&genpd->power_off_work);
 	kfree(genpd->free);
 	pr_debug("%s: removed %s\n", __func__, genpd->name);
@@ -1909,9 +1950,9 @@ int genpd_dev_pm_attach(struct device *dev)
 	dev->pm_domain->detach = genpd_dev_pm_detach;
 	dev->pm_domain->sync = genpd_dev_pm_sync;
 
-	mutex_lock(&pd->lock);
+	genpd_lock(pd);
 	ret = genpd_poweron(pd, 0);
-	mutex_unlock(&pd->lock);
+	genpd_unlock(pd);
 out:
 	return ret ? -EPROBE_DEFER : 0;
 }
@@ -2064,7 +2105,7 @@ static int pm_genpd_summary_one(struct seq_file *s,
 	char state[16];
 	int ret;
 
-	ret = mutex_lock_interruptible(&genpd->lock);
+	ret = genpd_lock_interruptible(genpd);
 	if (ret)
 		return -ERESTARTSYS;
 
@@ -2101,7 +2142,7 @@ static int pm_genpd_summary_one(struct seq_file *s,
 
 	seq_puts(s, "\n");
 exit:
-	mutex_unlock(&genpd->lock);
+	genpd_unlock(genpd);
 
 	return 0;
 }
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 6a8988166899..811b968eb740 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -42,13 +42,14 @@ struct genpd_power_state {
 	struct fwnode_handle *fwnode;
 };
 
+struct genpd_lock_ops;
+
 struct generic_pm_domain {
 	struct dev_pm_domain domain;	/* PM domain operations */
 	struct list_head gpd_list_node;	/* Node in the global PM domains list */
 	struct list_head master_links;	/* Links with PM domain as a master */
 	struct list_head slave_links;	/* Links with PM domain as a slave */
 	struct list_head dev_list;	/* List of devices */
-	struct mutex lock;
 	struct dev_power_governor *gov;
 	struct work_struct power_off_work;
 	struct fwnode_handle *provider;	/* Identity of the domain provider */
@@ -74,6 +75,8 @@ struct generic_pm_domain {
 	unsigned int state_count; /* number of states */
 	unsigned int state_idx; /* state that genpd will go to when off */
 	void *free; /* Free the state that was allocated for default */
+	const struct genpd_lock_ops *lock_ops;
+	struct mutex mlock;
 
 };
 
-- 
cgit v1.2.3


From d716f4798ff8c65ace4a6ab291f9a4ff265df4ba Mon Sep 17 00:00:00 2001
From: Lina Iyer <lina.iyer@linaro.org>
Date: Fri, 14 Oct 2016 10:47:55 -0700
Subject: PM / Domains: Support IRQ safe PM domains

Generic Power Domains currently support turning on/off only in process
context. This prevents the usage of PM domains for domains that could be
powered on/off in a context where IRQs are disabled. Many such domains
exist today and do not get powered off, when the IRQ safe devices in
that domain are powered off, because of this limitation.

However, not all domains can operate in IRQ safe contexts. Genpd
therefore, has to support both cases where the domain may or may not
operate in IRQ safe contexts. Configuring genpd to use an appropriate
lock for that domain, would allow domains that have IRQ safe devices to
runtime suspend and resume, in atomic context.

To achieve domain specific locking, set the domain's ->flag to
GENPD_FLAG_IRQ_SAFE while defining the domain. This indicates that genpd
should use a spinlock instead of a mutex for locking the domain. Locking
is abstracted through genpd_lock() and genpd_unlock() functions that use
the flag to determine the appropriate lock to be used for that domain.

Domains that have lower latency to suspend and resume and can operate
with IRQs disabled may now be able to save power, when the component
devices and sub-domains are idle at runtime.

The restriction this imposes on the domain hierarchy is that non-IRQ
safe domains may not have IRQ-safe subdomains, but IRQ safe domains may
have IRQ safe and non-IRQ safe subdomains and devices.

Signed-off-by: Lina Iyer <lina.iyer@linaro.org>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 111 ++++++++++++++++++++++++++++++++++++++++----
 include/linux/pm_domain.h   |  10 +++-
 2 files changed, 110 insertions(+), 11 deletions(-)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 4194012cdf86..aac656a889dc 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -74,11 +74,70 @@ static const struct genpd_lock_ops genpd_mtx_ops = {
 	.unlock = genpd_unlock_mtx,
 };
 
+static void genpd_lock_spin(struct generic_pm_domain *genpd)
+	__acquires(&genpd->slock)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&genpd->slock, flags);
+	genpd->lock_flags = flags;
+}
+
+static void genpd_lock_nested_spin(struct generic_pm_domain *genpd,
+					int depth)
+	__acquires(&genpd->slock)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave_nested(&genpd->slock, flags, depth);
+	genpd->lock_flags = flags;
+}
+
+static int genpd_lock_interruptible_spin(struct generic_pm_domain *genpd)
+	__acquires(&genpd->slock)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&genpd->slock, flags);
+	genpd->lock_flags = flags;
+	return 0;
+}
+
+static void genpd_unlock_spin(struct generic_pm_domain *genpd)
+	__releases(&genpd->slock)
+{
+	spin_unlock_irqrestore(&genpd->slock, genpd->lock_flags);
+}
+
+static const struct genpd_lock_ops genpd_spin_ops = {
+	.lock = genpd_lock_spin,
+	.lock_nested = genpd_lock_nested_spin,
+	.lock_interruptible = genpd_lock_interruptible_spin,
+	.unlock = genpd_unlock_spin,
+};
+
 #define genpd_lock(p)			p->lock_ops->lock(p)
 #define genpd_lock_nested(p, d)		p->lock_ops->lock_nested(p, d)
 #define genpd_lock_interruptible(p)	p->lock_ops->lock_interruptible(p)
 #define genpd_unlock(p)			p->lock_ops->unlock(p)
 
+#define genpd_is_irq_safe(genpd)	(genpd->flags & GENPD_FLAG_IRQ_SAFE)
+
+static inline bool irq_safe_dev_in_no_sleep_domain(struct device *dev,
+		struct generic_pm_domain *genpd)
+{
+	bool ret;
+
+	ret = pm_runtime_is_irq_safe(dev) && !genpd_is_irq_safe(genpd);
+
+	/* Warn once for each IRQ safe dev in no sleep domain */
+	if (ret)
+		dev_warn_once(dev, "PM domain %s will not be powered off\n",
+				genpd->name);
+
+	return ret;
+}
+
 /*
  * Get the generic PM domain for a particular struct device.
  * This validates the struct device pointer, the PM domain pointer,
@@ -343,7 +402,12 @@ static int genpd_poweroff(struct generic_pm_domain *genpd, bool is_async)
 		if (stat > PM_QOS_FLAGS_NONE)
 			return -EBUSY;
 
-		if (!pm_runtime_suspended(pdd->dev) || pdd->dev->power.irq_safe)
+		/*
+		 * Do not allow PM domain to be powered off, when an IRQ safe
+		 * device is part of a non-IRQ safe domain.
+		 */
+		if (!pm_runtime_suspended(pdd->dev) ||
+			irq_safe_dev_in_no_sleep_domain(pdd->dev, genpd))
 			not_suspended++;
 	}
 
@@ -506,10 +570,10 @@ static int genpd_runtime_suspend(struct device *dev)
 	}
 
 	/*
-	 * If power.irq_safe is set, this routine will be run with interrupts
-	 * off, so it can't use mutexes.
+	 * If power.irq_safe is set, this routine may be run with
+	 * IRQs disabled, so suspend only if the PM domain also is irq_safe.
 	 */
-	if (dev->power.irq_safe)
+	if (irq_safe_dev_in_no_sleep_domain(dev, genpd))
 		return 0;
 
 	genpd_lock(genpd);
@@ -543,8 +607,11 @@ static int genpd_runtime_resume(struct device *dev)
 	if (IS_ERR(genpd))
 		return -EINVAL;
 
-	/* If power.irq_safe, the PM domain is never powered off. */
-	if (dev->power.irq_safe) {
+	/*
+	 * As we don't power off a non IRQ safe domain, which holds
+	 * an IRQ safe device, we don't need to restore power to it.
+	 */
+	if (irq_safe_dev_in_no_sleep_domain(dev, genpd)) {
 		timed = false;
 		goto out;
 	}
@@ -586,7 +653,8 @@ static int genpd_runtime_resume(struct device *dev)
 err_stop:
 	genpd_stop_dev(genpd, dev);
 err_poweroff:
-	if (!dev->power.irq_safe) {
+	if (!pm_runtime_is_irq_safe(dev) ||
+		(pm_runtime_is_irq_safe(dev) && genpd_is_irq_safe(genpd))) {
 		genpd_lock(genpd);
 		genpd_poweroff(genpd, 0);
 		genpd_unlock(genpd);
@@ -1223,6 +1291,17 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd,
 	    || genpd == subdomain)
 		return -EINVAL;
 
+	/*
+	 * If the domain can be powered on/off in an IRQ safe
+	 * context, ensure that the subdomain can also be
+	 * powered on/off in that context.
+	 */
+	if (!genpd_is_irq_safe(genpd) && genpd_is_irq_safe(subdomain)) {
+		WARN("Parent %s of subdomain %s must be IRQ safe\n",
+				genpd->name, subdomain->name);
+		return -EINVAL;
+	}
+
 	link = kzalloc(sizeof(*link), GFP_KERNEL);
 	if (!link)
 		return -ENOMEM;
@@ -1337,6 +1416,17 @@ static int genpd_set_default_power_state(struct generic_pm_domain *genpd)
 	return 0;
 }
 
+static void genpd_lock_init(struct generic_pm_domain *genpd)
+{
+	if (genpd->flags & GENPD_FLAG_IRQ_SAFE) {
+		spin_lock_init(&genpd->slock);
+		genpd->lock_ops = &genpd_spin_ops;
+	} else {
+		mutex_init(&genpd->mlock);
+		genpd->lock_ops = &genpd_mtx_ops;
+	}
+}
+
 /**
  * pm_genpd_init - Initialize a generic I/O PM domain object.
  * @genpd: PM domain object to initialize.
@@ -1356,8 +1446,7 @@ int pm_genpd_init(struct generic_pm_domain *genpd,
 	INIT_LIST_HEAD(&genpd->master_links);
 	INIT_LIST_HEAD(&genpd->slave_links);
 	INIT_LIST_HEAD(&genpd->dev_list);
-	mutex_init(&genpd->mlock);
-	genpd->lock_ops = &genpd_mtx_ops;
+	genpd_lock_init(genpd);
 	genpd->gov = gov;
 	INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn);
 	atomic_set(&genpd->sd_count, 0);
@@ -2131,7 +2220,9 @@ static int pm_genpd_summary_one(struct seq_file *s,
 	}
 
 	list_for_each_entry(pm_data, &genpd->dev_list, list_node) {
-		kobj_path = kobject_get_path(&pm_data->dev->kobj, GFP_KERNEL);
+		kobj_path = kobject_get_path(&pm_data->dev->kobj,
+				genpd_is_irq_safe(genpd) ?
+				GFP_ATOMIC : GFP_KERNEL);
 		if (kobj_path == NULL)
 			continue;
 
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index 811b968eb740..81ece61075df 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -15,9 +15,11 @@
 #include <linux/err.h>
 #include <linux/of.h>
 #include <linux/notifier.h>
+#include <linux/spinlock.h>
 
 /* Defines used for the flags field in the struct generic_pm_domain */
 #define GENPD_FLAG_PM_CLK	(1U << 0) /* PM domain uses PM clk */
+#define GENPD_FLAG_IRQ_SAFE	(1U << 1) /* PM domain operates in atomic */
 
 enum gpd_status {
 	GPD_STATE_ACTIVE = 0,	/* PM domain is active */
@@ -76,7 +78,13 @@ struct generic_pm_domain {
 	unsigned int state_idx; /* state that genpd will go to when off */
 	void *free; /* Free the state that was allocated for default */
 	const struct genpd_lock_ops *lock_ops;
-	struct mutex mlock;
+	union {
+		struct mutex mlock;
+		struct {
+			spinlock_t slock;
+			unsigned long lock_flags;
+		};
+	};
 
 };
 
-- 
cgit v1.2.3


From 411bf2a8db76d7ed6f710df77121e5113ee8fe64 Mon Sep 17 00:00:00 2001
From: Lina Iyer <lina.iyer@linaro.org>
Date: Mon, 24 Oct 2016 14:38:47 -0700
Subject: PM / doc: Update device documentation for devices in IRQ-safe PM
 domains

Update documentation to reflect the changes made to support IRQ safe PM
domains.

Suggested-by: Rafael J. Wysocki <rjw@rjwysocki.net>
Signed-off-by: Lina Iyer <lina.iyer@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/power/devices.txt | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt
index 8ba6625fdd63..73ddea39a9ce 100644
--- a/Documentation/power/devices.txt
+++ b/Documentation/power/devices.txt
@@ -607,7 +607,9 @@ individually.  Instead, a set of devices sharing a power resource can be put
 into a low-power state together at the same time by turning off the shared
 power resource.  Of course, they also need to be put into the full-power state
 together, by turning the shared power resource on.  A set of devices with this
-property is often referred to as a power domain.
+property is often referred to as a power domain. A power domain may also be
+nested inside another power domain. The nested domain is referred to as the
+sub-domain of the parent domain.
 
 Support for power domains is provided through the pm_domain field of struct
 device.  This field is a pointer to an object of type struct dev_pm_domain,
@@ -629,6 +631,16 @@ support for power domains into subsystem-level callbacks, for example by
 modifying the platform bus type.  Other platforms need not implement it or take
 it into account in any way.
 
+Devices may be defined as IRQ-safe which indicates to the PM core that their
+runtime PM callbacks may be invoked with disabled interrupts (see
+Documentation/power/runtime_pm.txt for more information).  If an IRQ-safe
+device belongs to a PM domain, the runtime PM of the domain will be
+disallowed, unless the domain itself is defined as IRQ-safe. However, it
+makes sense to define a PM domain as IRQ-safe only if all the devices in it
+are IRQ-safe. Moreover, if an IRQ-safe domain has a parent domain, the runtime
+PM of the parent is only allowed if the parent itself is IRQ-safe too with the
+additional restriction that all child domains of an IRQ-safe parent must also
+be IRQ-safe.
 
 Device Low Power (suspend) States
 ---------------------------------
-- 
cgit v1.2.3


From a1fee00dc95644e0590b4f1a3755c9f6b1243b3a Mon Sep 17 00:00:00 2001
From: Colin Ian King <colin.king@canonical.com>
Date: Tue, 25 Oct 2016 17:33:27 +0100
Subject: PM / Domains: check for negative return from
 of_count_phandle_with_args()

The return from of_count_phandle_with_args can be negative, so we
should avoid kcalloc of a negative count of genpd_power_stat structs
by sanity checking if count is zero or less.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
Acked-by: Kevin Hilman <khilman@baylibre.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index aac656a889dc..661737c2bae0 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -2113,7 +2113,7 @@ int of_genpd_parse_idle_states(struct device_node *dn,
 	struct of_phandle_iterator it;
 
 	count = of_count_phandle_with_args(dn, "domain-idle-states", NULL);
-	if (!count)
+	if (count <= 0)
 		return -EINVAL;
 
 	st = kcalloc(count, sizeof(*st), GFP_KERNEL);
-- 
cgit v1.2.3


From 650ec6cfe323311165f0c7b6882a63dfd7ae8b63 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 25 Oct 2016 09:21:24 +0200
Subject: cpufreq: enable the DT cpufreq driver on the Integrators

This enables the generic DT and OPP-based cpufreq driver on the
ARM Integrator/AP and Integrator/CP.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index 71267626456b..142a6f615e52 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -26,6 +26,9 @@ static const struct of_device_id machines[] __initconst = {
 	{ .compatible = "allwinner,sun8i-a83t", },
 	{ .compatible = "allwinner,sun8i-h3", },
 
+	{ .compatible = "arm,integrator-ap", },
+	{ .compatible = "arm,integrator-cp", },
+
 	{ .compatible = "hisilicon,hi6220", },
 
 	{ .compatible = "fsl,imx27", },
-- 
cgit v1.2.3


From ae8b8d8f86a03c19c5ecfd848609b2e9438f1cf2 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Tue, 25 Oct 2016 09:21:25 +0200
Subject: cpufreq: retire the Integrator cpufreq driver

After switching the core module clocks controlling the Integrator
clock frequencies to the common clock framework, defining the
operating points in the device tree, and activating the generic
DT-based CPUfreq driver, we can retire the old Integrator
cpufreq driver.

Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Kconfig.arm          |   8 --
 drivers/cpufreq/Makefile             |   1 -
 drivers/cpufreq/integrator-cpufreq.c | 239 -----------------------------------
 3 files changed, 248 deletions(-)
 delete mode 100644 drivers/cpufreq/integrator-cpufreq.c

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index d89b8afe23b6..fdbc630272b3 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -60,14 +60,6 @@ config ARM_IMX6Q_CPUFREQ
 
 	  If in doubt, say N.
 
-config ARM_INTEGRATOR
-	tristate "CPUfreq driver for ARM Integrator CPUs"
-	depends on ARCH_INTEGRATOR
-	default y
-	help
-	  This enables the CPUfreq driver for ARM Integrator CPUs.
-	  If in doubt, say Y.
-
 config ARM_KIRKWOOD_CPUFREQ
 	def_bool MACH_KIRKWOOD
 	help
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 0a9b6a093646..7dde82179d62 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -56,7 +56,6 @@ obj-$(CONFIG_UX500_SOC_DB8500)		+= dbx500-cpufreq.o
 obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ)	+= exynos5440-cpufreq.o
 obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ)	+= highbank-cpufreq.o
 obj-$(CONFIG_ARM_IMX6Q_CPUFREQ)		+= imx6q-cpufreq.o
-obj-$(CONFIG_ARM_INTEGRATOR)		+= integrator-cpufreq.o
 obj-$(CONFIG_ARM_KIRKWOOD_CPUFREQ)	+= kirkwood-cpufreq.o
 obj-$(CONFIG_ARM_MT8173_CPUFREQ)	+= mt8173-cpufreq.o
 obj-$(CONFIG_ARM_OMAP2PLUS_CPUFREQ)	+= omap-cpufreq.o
diff --git a/drivers/cpufreq/integrator-cpufreq.c b/drivers/cpufreq/integrator-cpufreq.c
deleted file mode 100644
index 79e3ff2771a6..000000000000
--- a/drivers/cpufreq/integrator-cpufreq.c
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- *  Copyright (C) 2001-2002 Deep Blue Solutions Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * CPU support functions
- */
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/cpufreq.h>
-#include <linux/sched.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/io.h>
-#include <linux/platform_device.h>
-#include <linux/of.h>
-#include <linux/of_address.h>
-
-#include <asm/mach-types.h>
-#include <asm/hardware/icst.h>
-
-static void __iomem *cm_base;
-/* The cpufreq driver only use the OSC register */
-#define INTEGRATOR_HDR_OSC_OFFSET       0x08
-#define INTEGRATOR_HDR_LOCK_OFFSET      0x14
-
-static struct cpufreq_driver integrator_driver;
-
-static const struct icst_params lclk_params = {
-	.ref		= 24000000,
-	.vco_max	= ICST525_VCO_MAX_5V,
-	.vco_min	= ICST525_VCO_MIN,
-	.vd_min		= 8,
-	.vd_max		= 132,
-	.rd_min		= 24,
-	.rd_max		= 24,
-	.s2div		= icst525_s2div,
-	.idx2s		= icst525_idx2s,
-};
-
-static const struct icst_params cclk_params = {
-	.ref		= 24000000,
-	.vco_max	= ICST525_VCO_MAX_5V,
-	.vco_min	= ICST525_VCO_MIN,
-	.vd_min		= 12,
-	.vd_max		= 160,
-	.rd_min		= 24,
-	.rd_max		= 24,
-	.s2div		= icst525_s2div,
-	.idx2s		= icst525_idx2s,
-};
-
-/*
- * Validate the speed policy.
- */
-static int integrator_verify_policy(struct cpufreq_policy *policy)
-{
-	struct icst_vco vco;
-
-	cpufreq_verify_within_cpu_limits(policy);
-
-	vco = icst_hz_to_vco(&cclk_params, policy->max * 1000);
-	policy->max = icst_hz(&cclk_params, vco) / 1000;
-
-	vco = icst_hz_to_vco(&cclk_params, policy->min * 1000);
-	policy->min = icst_hz(&cclk_params, vco) / 1000;
-
-	cpufreq_verify_within_cpu_limits(policy);
-	return 0;
-}
-
-
-static int integrator_set_target(struct cpufreq_policy *policy,
-				 unsigned int target_freq,
-				 unsigned int relation)
-{
-	cpumask_t cpus_allowed;
-	int cpu = policy->cpu;
-	struct icst_vco vco;
-	struct cpufreq_freqs freqs;
-	u_int cm_osc;
-
-	/*
-	 * Save this threads cpus_allowed mask.
-	 */
-	cpus_allowed = current->cpus_allowed;
-
-	/*
-	 * Bind to the specified CPU.  When this call returns,
-	 * we should be running on the right CPU.
-	 */
-	set_cpus_allowed_ptr(current, cpumask_of(cpu));
-	BUG_ON(cpu != smp_processor_id());
-
-	/* get current setting */
-	cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET);
-
-	if (machine_is_integrator())
-		vco.s = (cm_osc >> 8) & 7;
-	else if (machine_is_cintegrator())
-		vco.s = 1;
-	vco.v = cm_osc & 255;
-	vco.r = 22;
-	freqs.old = icst_hz(&cclk_params, vco) / 1000;
-
-	/* icst_hz_to_vco rounds down -- so we need the next
-	 * larger freq in case of CPUFREQ_RELATION_L.
-	 */
-	if (relation == CPUFREQ_RELATION_L)
-		target_freq += 999;
-	if (target_freq > policy->max)
-		target_freq = policy->max;
-	vco = icst_hz_to_vco(&cclk_params, target_freq * 1000);
-	freqs.new = icst_hz(&cclk_params, vco) / 1000;
-
-	if (freqs.old == freqs.new) {
-		set_cpus_allowed_ptr(current, &cpus_allowed);
-		return 0;
-	}
-
-	cpufreq_freq_transition_begin(policy, &freqs);
-
-	cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET);
-
-	if (machine_is_integrator()) {
-		cm_osc &= 0xfffff800;
-		cm_osc |= vco.s << 8;
-	} else if (machine_is_cintegrator()) {
-		cm_osc &= 0xffffff00;
-	}
-	cm_osc |= vco.v;
-
-	__raw_writel(0xa05f, cm_base + INTEGRATOR_HDR_LOCK_OFFSET);
-	__raw_writel(cm_osc, cm_base + INTEGRATOR_HDR_OSC_OFFSET);
-	__raw_writel(0, cm_base + INTEGRATOR_HDR_LOCK_OFFSET);
-
-	/*
-	 * Restore the CPUs allowed mask.
-	 */
-	set_cpus_allowed_ptr(current, &cpus_allowed);
-
-	cpufreq_freq_transition_end(policy, &freqs, 0);
-
-	return 0;
-}
-
-static unsigned int integrator_get(unsigned int cpu)
-{
-	cpumask_t cpus_allowed;
-	unsigned int current_freq;
-	u_int cm_osc;
-	struct icst_vco vco;
-
-	cpus_allowed = current->cpus_allowed;
-
-	set_cpus_allowed_ptr(current, cpumask_of(cpu));
-	BUG_ON(cpu != smp_processor_id());
-
-	/* detect memory etc. */
-	cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET);
-
-	if (machine_is_integrator())
-		vco.s = (cm_osc >> 8) & 7;
-	else
-		vco.s = 1;
-	vco.v = cm_osc & 255;
-	vco.r = 22;
-
-	current_freq = icst_hz(&cclk_params, vco) / 1000; /* current freq */
-
-	set_cpus_allowed_ptr(current, &cpus_allowed);
-
-	return current_freq;
-}
-
-static int integrator_cpufreq_init(struct cpufreq_policy *policy)
-{
-
-	/* set default policy and cpuinfo */
-	policy->max = policy->cpuinfo.max_freq = 160000;
-	policy->min = policy->cpuinfo.min_freq = 12000;
-	policy->cpuinfo.transition_latency = 1000000; /* 1 ms, assumed */
-
-	return 0;
-}
-
-static struct cpufreq_driver integrator_driver = {
-	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK,
-	.verify		= integrator_verify_policy,
-	.target		= integrator_set_target,
-	.get		= integrator_get,
-	.init		= integrator_cpufreq_init,
-	.name		= "integrator",
-};
-
-static int __init integrator_cpufreq_probe(struct platform_device *pdev)
-{
-	struct resource *res;
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!res)
-		return -ENODEV;
-
-	cm_base = devm_ioremap(&pdev->dev, res->start, resource_size(res));
-	if (!cm_base)
-		return -ENODEV;
-
-	return cpufreq_register_driver(&integrator_driver);
-}
-
-static int __exit integrator_cpufreq_remove(struct platform_device *pdev)
-{
-	return cpufreq_unregister_driver(&integrator_driver);
-}
-
-static const struct of_device_id integrator_cpufreq_match[] = {
-	{ .compatible = "arm,core-module-integrator"},
-	{ },
-};
-
-MODULE_DEVICE_TABLE(of, integrator_cpufreq_match);
-
-static struct platform_driver integrator_cpufreq_driver = {
-	.driver = {
-		.name = "integrator-cpufreq",
-		.of_match_table = integrator_cpufreq_match,
-	},
-	.remove = __exit_p(integrator_cpufreq_remove),
-};
-
-module_platform_driver_probe(integrator_cpufreq_driver,
-			     integrator_cpufreq_probe);
-
-MODULE_AUTHOR("Russell M. King");
-MODULE_DESCRIPTION("cpufreq driver for ARM Integrator CPUs");
-MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From eae48f046ffa117afb782cd9b3ae5469df0042e2 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 25 Oct 2016 13:20:40 -0700
Subject: cpufreq: intel_pstate: Per CPU P-State limits

Intel P-State offers two interface to set performance limits:
- Intel P-State sysfs
	/sys/devices/system/cpu/intel_pstate/max_perf_pct
	/sys/devices/system/cpu/intel_pstate/min_perf_pct
- cpufreq
	/sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq
	/sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq

In the current implementation both of the above methods, change limits
to every CPU in the system. Moreover the limits placed using cpufreq
policy interface also presented in the Intel P-State sysfs via modified
max_perf_pct and min_per_pct during sysfs reads. This allows to check
percent of reduced/increased performance, irrespective of method used to
limit.

There are some new generations of processors, where it is possible to
have limits placed on individual CPU cores. Using cpufreq interface it
is possible to set limits on each CPU. But the current processing will
use last limits placed on all CPUs. So the per core limit feature of
CPUs can't be used.

This change brings in capability to set P-States limits for each CPU,
with some limitations. In this case what should be the read of
max_perf_pct and min_perf_pct? It can be most restrictive limits placed
on any CPU or max possible performance on any given CPU on which no
limits are placed. In either case someone will have issue.

So the consensus is, we can't have both sysfs controls present when user
wants to use limit per core limits.
- By default per-core-control feature is not enabled. So no one will
notice any difference.
- The way to enable is by kernel command line
intel_pstate=per_cpu_perf_limits
- When the per-core-controls are enabled there is no display of for both
read and write on
	/sys/devices/system/cpu/intel_pstate/max_perf_pct
	/sys/devices/system/cpu/intel_pstate/min_perf_pct
- User can change limits using
	/sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq
	/sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq
	/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
- User can still observe turbo percent and number of P-States from
	/sys/devices/system/cpu/intel_pstate/turbo_pct
	/sys/devices/system/cpu/intel_pstate/num_pstates
- User can read write system wide turbo status
	/sys/devices/system/cpu/no_turbo

While changing this BUG_ON is changed to WARN_ON, as they are not fatal
errors for the system.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 236 +++++++++++++++++++++++++++--------------
 1 file changed, 156 insertions(+), 80 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index d7a9195a8351..b6e9b49bf151 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -176,6 +176,48 @@ struct _pid {
 	int32_t last_err;
 };
 
+/**
+ * struct perf_limits - Store user and policy limits
+ * @no_turbo:		User requested turbo state from intel_pstate sysfs
+ * @turbo_disabled:	Platform turbo status either from msr
+ *			MSR_IA32_MISC_ENABLE or when maximum available pstate
+ *			matches the maximum turbo pstate
+ * @max_perf_pct:	Effective maximum performance limit in percentage, this
+ *			is minimum of either limits enforced by cpufreq policy
+ *			or limits from user set limits via intel_pstate sysfs
+ * @min_perf_pct:	Effective minimum performance limit in percentage, this
+ *			is maximum of either limits enforced by cpufreq policy
+ *			or limits from user set limits via intel_pstate sysfs
+ * @max_perf:		This is a scaled value between 0 to 255 for max_perf_pct
+ *			This value is used to limit max pstate
+ * @min_perf:		This is a scaled value between 0 to 255 for min_perf_pct
+ *			This value is used to limit min pstate
+ * @max_policy_pct:	The maximum performance in percentage enforced by
+ *			cpufreq setpolicy interface
+ * @max_sysfs_pct:	The maximum performance in percentage enforced by
+ *			intel pstate sysfs interface, unused when per cpu
+ *			controls are enforced
+ * @min_policy_pct:	The minimum performance in percentage enforced by
+ *			cpufreq setpolicy interface
+ * @min_sysfs_pct:	The minimum performance in percentage enforced by
+ *			intel pstate sysfs interface, unused when per cpu
+ *			controls are enforced
+ *
+ * Storage for user and policy defined limits.
+ */
+struct perf_limits {
+	int no_turbo;
+	int turbo_disabled;
+	int max_perf_pct;
+	int min_perf_pct;
+	int32_t max_perf;
+	int32_t min_perf;
+	int max_policy_pct;
+	int max_sysfs_pct;
+	int min_policy_pct;
+	int min_sysfs_pct;
+};
+
 /**
  * struct cpudata -	Per CPU instance data storage
  * @cpu:		CPU number for this instance data
@@ -194,6 +236,9 @@ struct _pid {
  * @prev_cummulative_iowait: IO Wait time difference from last and
  *			current sample
  * @sample:		Storage for storing last Sample data
+ * @perf_limits:	Pointer to perf_limit unique to this CPU
+ *			Not all field in the structure are applicable
+ *			when per cpu controls are enforced
  * @acpi_perf_data:	Stores ACPI perf information read from _PSS
  * @valid_pss_table:	Set to true for valid ACPI _PSS entries found
  *
@@ -217,6 +262,7 @@ struct cpudata {
 	u64	prev_tsc;
 	u64	prev_cummulative_iowait;
 	struct sample sample;
+	struct perf_limits *perf_limits;
 #ifdef CONFIG_ACPI
 	struct acpi_processor_performance acpi_perf_data;
 	bool valid_pss_table;
@@ -289,51 +335,12 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu);
 static struct pstate_adjust_policy pid_params __read_mostly;
 static struct pstate_funcs pstate_funcs __read_mostly;
 static int hwp_active __read_mostly;
+static bool per_cpu_limits __read_mostly;
 
 #ifdef CONFIG_ACPI
 static bool acpi_ppc;
 #endif
 
-/**
- * struct perf_limits - Store user and policy limits
- * @no_turbo:		User requested turbo state from intel_pstate sysfs
- * @turbo_disabled:	Platform turbo status either from msr
- *			MSR_IA32_MISC_ENABLE or when maximum available pstate
- *			matches the maximum turbo pstate
- * @max_perf_pct:	Effective maximum performance limit in percentage, this
- *			is minimum of either limits enforced by cpufreq policy
- *			or limits from user set limits via intel_pstate sysfs
- * @min_perf_pct:	Effective minimum performance limit in percentage, this
- *			is maximum of either limits enforced by cpufreq policy
- *			or limits from user set limits via intel_pstate sysfs
- * @max_perf:		This is a scaled value between 0 to 255 for max_perf_pct
- *			This value is used to limit max pstate
- * @min_perf:		This is a scaled value between 0 to 255 for min_perf_pct
- *			This value is used to limit min pstate
- * @max_policy_pct:	The maximum performance in percentage enforced by
- *			cpufreq setpolicy interface
- * @max_sysfs_pct:	The maximum performance in percentage enforced by
- *			intel pstate sysfs interface
- * @min_policy_pct:	The minimum performance in percentage enforced by
- *			cpufreq setpolicy interface
- * @min_sysfs_pct:	The minimum performance in percentage enforced by
- *			intel pstate sysfs interface
- *
- * Storage for user and policy defined limits.
- */
-struct perf_limits {
-	int no_turbo;
-	int turbo_disabled;
-	int max_perf_pct;
-	int min_perf_pct;
-	int32_t max_perf;
-	int32_t min_perf;
-	int max_policy_pct;
-	int max_sysfs_pct;
-	int min_policy_pct;
-	int min_sysfs_pct;
-};
-
 static struct perf_limits performance_limits = {
 	.no_turbo = 0,
 	.turbo_disabled = 0,
@@ -560,21 +567,30 @@ static inline void update_turbo_state(void)
 static void intel_pstate_hwp_set(const struct cpumask *cpumask)
 {
 	int min, hw_min, max, hw_max, cpu, range, adj_range;
+	struct perf_limits *perf_limits = limits;
 	u64 value, cap;
 
 	for_each_cpu(cpu, cpumask) {
+		int max_perf_pct, min_perf_pct;
+
+		if (per_cpu_limits)
+			perf_limits = all_cpu_data[cpu]->perf_limits;
+
 		rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap);
 		hw_min = HWP_LOWEST_PERF(cap);
 		hw_max = HWP_HIGHEST_PERF(cap);
 		range = hw_max - hw_min;
 
+		max_perf_pct = perf_limits->max_perf_pct;
+		min_perf_pct = perf_limits->min_perf_pct;
+
 		rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value);
-		adj_range = limits->min_perf_pct * range / 100;
+		adj_range = min_perf_pct * range / 100;
 		min = hw_min + adj_range;
 		value &= ~HWP_MIN_PERF(~0L);
 		value |= HWP_MIN_PERF(min);
 
-		adj_range = limits->max_perf_pct * range / 100;
+		adj_range = max_perf_pct * range / 100;
 		max = hw_min + adj_range;
 		if (limits->no_turbo) {
 			hw_max = HWP_GUARANTEED_PERF(cap);
@@ -787,8 +803,6 @@ define_one_global_ro(num_pstates);
 
 static struct attribute *intel_pstate_attributes[] = {
 	&no_turbo.attr,
-	&max_perf_pct.attr,
-	&min_perf_pct.attr,
 	&turbo_pct.attr,
 	&num_pstates.attr,
 	NULL
@@ -805,9 +819,26 @@ static void __init intel_pstate_sysfs_expose_params(void)
 
 	intel_pstate_kobject = kobject_create_and_add("intel_pstate",
 						&cpu_subsys.dev_root->kobj);
-	BUG_ON(!intel_pstate_kobject);
+	if (WARN_ON(!intel_pstate_kobject))
+		return;
+
 	rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group);
-	BUG_ON(rc);
+	if (WARN_ON(rc))
+		return;
+
+	/*
+	 * If per cpu limits are enforced there are no global limits, so
+	 * return without creating max/min_perf_pct attributes
+	 */
+	if (per_cpu_limits)
+		return;
+
+	rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr);
+	WARN_ON(rc);
+
+	rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr);
+	WARN_ON(rc);
+
 }
 /************************** sysfs end ************************/
 
@@ -1124,20 +1155,24 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
 	int max_perf = cpu->pstate.turbo_pstate;
 	int max_perf_adj;
 	int min_perf;
+	struct perf_limits *perf_limits = limits;
 
 	if (limits->no_turbo || limits->turbo_disabled)
 		max_perf = cpu->pstate.max_pstate;
 
+	if (per_cpu_limits)
+		perf_limits = cpu->perf_limits;
+
 	/*
 	 * performance can be limited by user through sysfs, by cpufreq
 	 * policy, or by cpu specific default values determined through
 	 * experimentation.
 	 */
-	max_perf_adj = fp_toint(max_perf * limits->max_perf);
+	max_perf_adj = fp_toint(max_perf * perf_limits->max_perf);
 	*max = clamp_t(int, max_perf_adj,
 			cpu->pstate.min_pstate, cpu->pstate.turbo_pstate);
 
-	min_perf = fp_toint(max_perf * limits->min_perf);
+	min_perf = fp_toint(max_perf * perf_limits->min_perf);
 	*min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
 }
 
@@ -1421,11 +1456,23 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
 {
 	struct cpudata *cpu;
 
-	if (!all_cpu_data[cpunum])
-		all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata),
-					       GFP_KERNEL);
-	if (!all_cpu_data[cpunum])
-		return -ENOMEM;
+	cpu = all_cpu_data[cpunum];
+
+	if (!cpu) {
+		unsigned int size = sizeof(struct cpudata);
+
+		if (per_cpu_limits)
+			size += sizeof(struct perf_limits);
+
+		cpu = kzalloc(size, GFP_KERNEL);
+		if (!cpu)
+			return -ENOMEM;
+
+		all_cpu_data[cpunum] = cpu;
+		if (per_cpu_limits)
+			cpu->perf_limits = (struct perf_limits *)(cpu + 1);
+
+	}
 
 	cpu = all_cpu_data[cpunum];
 
@@ -1493,9 +1540,40 @@ static void intel_pstate_set_performance_limits(struct perf_limits *limits)
 	limits->min_sysfs_pct = 0;
 }
 
+static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
+					    struct perf_limits *limits)
+{
+	limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq;
+	limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0, 100);
+	limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100,
+					      policy->cpuinfo.max_freq);
+	limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0, 100);
+
+	/* Normalize user input to [min_policy_pct, max_policy_pct] */
+	limits->min_perf_pct = max(limits->min_policy_pct,
+				   limits->min_sysfs_pct);
+	limits->min_perf_pct = min(limits->max_policy_pct,
+				   limits->min_perf_pct);
+	limits->max_perf_pct = min(limits->max_policy_pct,
+				   limits->max_sysfs_pct);
+	limits->max_perf_pct = max(limits->min_policy_pct,
+				   limits->max_perf_pct);
+
+	/* Make sure min_perf_pct <= max_perf_pct */
+	limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
+
+	limits->min_perf = div_fp(limits->min_perf_pct, 100);
+	limits->max_perf = div_fp(limits->max_perf_pct, 100);
+	limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
+
+	pr_debug("cpu:%d max_perf_pct:%d min_perf_pct:%d\n", policy->cpu,
+		 limits->max_perf_pct, limits->min_perf_pct);
+}
+
 static int intel_pstate_set_policy(struct cpufreq_policy *policy)
 {
 	struct cpudata *cpu;
+	struct perf_limits *perf_limits = NULL;
 
 	if (!policy->cpuinfo.max_freq)
 		return -ENODEV;
@@ -1513,41 +1591,29 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
 		policy->max = policy->cpuinfo.max_freq;
 	}
 
-	if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) {
-		limits = &performance_limits;
+	if (per_cpu_limits)
+		perf_limits = cpu->perf_limits;
+
+	if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
+		if (!perf_limits) {
+			limits = &performance_limits;
+			perf_limits = limits;
+		}
 		if (policy->max >= policy->cpuinfo.max_freq) {
 			pr_debug("set performance\n");
-			intel_pstate_set_performance_limits(limits);
+			intel_pstate_set_performance_limits(perf_limits);
 			goto out;
 		}
 	} else {
 		pr_debug("set powersave\n");
-		limits = &powersave_limits;
-	}
-
-	limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq;
-	limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0 , 100);
-	limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100,
-					      policy->cpuinfo.max_freq);
-	limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0 , 100);
-
-	/* Normalize user input to [min_policy_pct, max_policy_pct] */
-	limits->min_perf_pct = max(limits->min_policy_pct,
-				   limits->min_sysfs_pct);
-	limits->min_perf_pct = min(limits->max_policy_pct,
-				   limits->min_perf_pct);
-	limits->max_perf_pct = min(limits->max_policy_pct,
-				   limits->max_sysfs_pct);
-	limits->max_perf_pct = max(limits->min_policy_pct,
-				   limits->max_perf_pct);
-
-	/* Make sure min_perf_pct <= max_perf_pct */
-	limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
+		if (!perf_limits) {
+			limits = &powersave_limits;
+			perf_limits = limits;
+		}
 
-	limits->min_perf = div_fp(limits->min_perf_pct, 100);
-	limits->max_perf = div_fp(limits->max_perf_pct, 100);
-	limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
+	}
 
+	intel_pstate_update_perf_limits(policy, perf_limits);
  out:
 	if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) {
 		/*
@@ -1607,6 +1673,14 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
 	else
 		policy->policy = CPUFREQ_POLICY_POWERSAVE;
 
+	/*
+	 * We need sane value in the cpu->perf_limits, so inherit from global
+	 * perf_limits limits, which are seeded with values based on the
+	 * CONFIG_CPU_FREQ_DEFAULT_GOV_*, during boot up.
+	 */
+	if (per_cpu_limits)
+		memcpy(cpu->perf_limits, limits, sizeof(struct perf_limits));
+
 	policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling;
 	policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
 
@@ -1888,6 +1962,8 @@ static int __init intel_pstate_setup(char *str)
 		force_load = 1;
 	if (!strcmp(str, "hwp_only"))
 		hwp_only = 1;
+	if (!strcmp(str, "per_cpu_perf_limits"))
+		per_cpu_limits = true;
 
 #ifdef CONFIG_ACPI
 	if (!strcmp(str, "support_acpi_ppc"))
-- 
cgit v1.2.3


From 5879f877398a2a5e5006c6e16a4288e9d4c308a1 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 25 Oct 2016 13:20:41 -0700
Subject: cpufreq: intel_pstate: Reduce impact due to rounding error

When policy->max and policy->min are same, in some cases they don't
result in the same frequency cap. The max_policy_pct is rounded up but
not min_perf_pct. So even when they are same, results in different
percentage or maximum and minimum.
Since minimum is a conservative value for power, a lower value without
rounding is better in most of the cases, unless user wants
policy->max = policy->min.
This change uses use the same policy percentage when policy->max and
policy->min are same.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index b6e9b49bf151..8e7390bb2175 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1543,11 +1543,17 @@ static void intel_pstate_set_performance_limits(struct perf_limits *limits)
 static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
 					    struct perf_limits *limits)
 {
-	limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq;
-	limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0, 100);
 	limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100,
 					      policy->cpuinfo.max_freq);
 	limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0, 100);
+	if (policy->max == policy->min) {
+		limits->min_policy_pct = limits->max_policy_pct;
+	} else {
+		limits->min_policy_pct = (policy->min * 100) /
+						policy->cpuinfo.max_freq;
+		limits->min_policy_pct = clamp_t(int, limits->min_policy_pct,
+						 0, 100);
+	}
 
 	/* Normalize user input to [min_policy_pct, max_policy_pct] */
 	limits->min_perf_pct = max(limits->min_policy_pct,
-- 
cgit v1.2.3


From 9e472f95e480d6746ed8646634e6b11994207015 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 25 Oct 2016 13:20:42 -0700
Subject: Documentation: intel_pstate: Update per core limits

Document restriction on per core P-State control.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpu-freq/intel-pstate.txt | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/Documentation/cpu-freq/intel-pstate.txt b/Documentation/cpu-freq/intel-pstate.txt
index 5528d6d59687..198b2ef88aec 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -48,7 +48,7 @@ In addition to the frequency-controlling interfaces provided by the cpufreq
 core, the driver provides its own sysfs files to control the P-State selection.
 These files have been added to /sys/devices/system/cpu/intel_pstate/.
 Any changes made to these files are applicable to all CPUs (even in a
-multi-package system).
+multi-package system, Refer to later section on placing "Per-CPU limits").
 
       max_perf_pct: Limits the maximum P-State that will be requested by
       the driver. It states it as a percentage of the available performance. The
@@ -120,6 +120,25 @@ frequency is fictional for Intel Core processors. Even if the scaling
 driver selects a single P-State, the actual frequency the processor
 will run at is selected by the processor itself.
 
+Per-CPU limits
+
+The kernel command line option "intel_pstate=per_cpu_perf_limits" forces
+the intel_pstate driver to use per-CPU performance limits.  When it is set,
+the sysfs control interface described above is subject to limitations.
+- The following controls are not available for both read and write
+	/sys/devices/system/cpu/intel_pstate/max_perf_pct
+	/sys/devices/system/cpu/intel_pstate/min_perf_pct
+- The following controls can be used to set performance limits, as far as the
+architecture of the processor permits:
+	/sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq
+	/sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq
+	/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
+- User can still observe turbo percent and number of P-States from
+	/sys/devices/system/cpu/intel_pstate/turbo_pct
+	/sys/devices/system/cpu/intel_pstate/num_pstates
+- User can read write system wide turbo status
+	/sys/devices/system/cpu/no_turbo
+
 Tuning Intel P-State driver
 
 When the performance can be tuned using PID (Proportional Integral
-- 
cgit v1.2.3


From 22449c0cba0e68489e5b7b033215c8dc6effde35 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 25 Oct 2016 13:20:43 -0700
Subject: Documentation: kernel parameters: per core P-State control

Additional command line control to enable per core performance
control.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/kernel-parameters.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 37babf91f2cb..fa2336ec1f56 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1780,6 +1780,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Description Table, specifies preferred power management
 			profile as "Enterprise Server" or "Performance Server",
 			then this feature is turned on by default.
+		per_cpu_perf_limits
+			Allow per-logical-CPU P-State performance control limits using
+			cpufreq sysfs interface
 
 	intremap=	[X86-64, Intel-IOMMU]
 			on	enable Interrupt Remapping (default)
-- 
cgit v1.2.3


From 1758b3374bc5b56f60fc07ccc37863a64fc67ecc Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <yamada.masahiro@socionext.com>
Date: Thu, 27 Oct 2016 01:41:33 +0900
Subject: cpufreq: dt: add Socionext UniPhier SoCs support

Add compatible strings for Pro5, PXs2, LD6b, LD11, LD20 SoCs to use
the generic cpufreq driver.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index 142a6f615e52..fba3f61f320c 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -75,6 +75,12 @@ static const struct of_device_id machines[] __initconst = {
 
 	{ .compatible = "sigma,tango4" },
 
+	{ .compatible = "socionext,uniphier-pro5", },
+	{ .compatible = "socionext,uniphier-pxs2", },
+	{ .compatible = "socionext,uniphier-ld6b", },
+	{ .compatible = "socionext,uniphier-ld11", },
+	{ .compatible = "socionext,uniphier-ld20", },
+
 	{ .compatible = "ti,am33xx", },
 	{ .compatible = "ti,dra7", },
 	{ .compatible = "ti,omap2", },
-- 
cgit v1.2.3


From bb446b574e1a0c7fcaeb4ff59a3693e3194f57fe Mon Sep 17 00:00:00 2001
From: Markus Mayer <mmayer@broadcom.com>
Date: Thu, 27 Oct 2016 14:05:34 -0700
Subject: dt: cpufreq: brcm: New binding document for brcmstb-avs-cpufreq

Add the binding document for the new brcmstb-avs-cpufreq driver.

Signed-off-by: Markus Mayer <mmayer@broadcom.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../bindings/cpufreq/brcm,stb-avs-cpu-freq.txt     | 78 ++++++++++++++++++++++
 MAINTAINERS                                        |  7 ++
 2 files changed, 85 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt

diff --git a/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt b/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
new file mode 100644
index 000000000000..af2385795d78
--- /dev/null
+++ b/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
@@ -0,0 +1,78 @@
+Broadcom AVS mail box and interrupt register bindings
+=====================================================
+
+A total of three DT nodes are required. One node (brcm,avs-cpu-data-mem)
+references the mailbox register used to communicate with the AVS CPU[1]. The
+second node (brcm,avs-cpu-l2-intr) is required to trigger an interrupt on
+the AVS CPU. The interrupt tells the AVS CPU that it needs to process a
+command sent to it by a driver. Interrupting the AVS CPU is mandatory for
+commands to be processed.
+
+The interface also requires a reference to the AVS host interrupt controller,
+so a driver can react to interrupts generated by the AVS CPU whenever a command
+has been processed. See [2] for more information on the brcm,l2-intc node.
+
+[1] The AVS CPU is an independent co-processor that runs proprietary
+firmware. On some SoCs, this firmware supports DFS and DVFS in addition to
+Adaptive Voltage Scaling.
+
+[2] Documentation/devicetree/bindings/interrupt-controller/brcm,l2-intc.txt
+
+
+Node brcm,avs-cpu-data-mem
+--------------------------
+
+Required properties:
+- compatible: must include: brcm,avs-cpu-data-mem and
+              should include: one of brcm,bcm7271-avs-cpu-data-mem or
+                              brcm,bcm7268-avs-cpu-data-mem
+- reg: Specifies base physical address and size of the registers.
+- interrupts: The interrupt that the AVS CPU will use to interrupt the host
+              when a command completed.
+- interrupt-parent: The interrupt controller the above interrupt is routed
+                    through.
+- interrupt-names: The name of the interrupt used to interrupt the host.
+
+Optional properties:
+- None
+
+Node brcm,avs-cpu-l2-intr
+-------------------------
+
+Required properties:
+- compatible: must include: brcm,avs-cpu-l2-intr and
+              should include: one of brcm,bcm7271-avs-cpu-l2-intr or
+                              brcm,bcm7268-avs-cpu-l2-intr
+- reg: Specifies base physical address and size of the registers.
+
+Optional properties:
+- None
+
+
+Example
+=======
+
+	avs_host_l2_intc: interrupt-controller@f04d1200 {
+		#interrupt-cells = <1>;
+		compatible = "brcm,l2-intc";
+		interrupt-parent = <&intc>;
+		reg = <0xf04d1200 0x48>;
+		interrupt-controller;
+		interrupts = <0x0 0x19 0x0>;
+		interrupt-names = "avs";
+	};
+
+	avs-cpu-data-mem@f04c4000 {
+		compatible = "brcm,bcm7271-avs-cpu-data-mem",
+				"brcm,avs-cpu-data-mem";
+		reg = <0xf04c4000 0x60>;
+		interrupts = <0x1a>;
+		interrupt-parent = <&avs_host_l2_intc>;
+		interrupt-names = "sw_intr";
+	};
+
+	avs-cpu-l2-intr@f04d1100 {
+		compatible = "brcm,bcm7271-avs-cpu-l2-intr",
+				"brcm,avs-cpu-l2-intr";
+		reg = <0xf04d1100 0x10>;
+	};
diff --git a/MAINTAINERS b/MAINTAINERS
index 1cd38a7e0064..9379aca4bec4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2738,6 +2738,13 @@ L:	bcm-kernel-feedback-list@broadcom.com
 S:	Maintained
 F:	drivers/mtd/nand/brcmnand/
 
+BROADCOM STB AVS CPUFREQ DRIVER
+M:	Markus Mayer <mmayer@broadcom.com>
+M:	bcm-kernel-feedback-list@broadcom.com
+L:	linux-pm@vger.kernel.org
+S:	Maintained
+F:	Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
+
 BROADCOM SPECIFIC AMBA DRIVER (BCMA)
 M:	Rafał Miłecki <zajec5@gmail.com>
 L:	linux-wireless@vger.kernel.org
-- 
cgit v1.2.3


From de322e085995b9417582d6f72229dadb5c09d163 Mon Sep 17 00:00:00 2001
From: Markus Mayer <mmayer@broadcom.com>
Date: Thu, 27 Oct 2016 14:05:35 -0700
Subject: cpufreq: brcmstb-avs-cpufreq: AVS CPUfreq driver for Broadcom STB
 SoCs

This driver supports voltage and frequency scaling on Broadcom STB SoCs
using AVS firmware with DFS and DVFS support.

Actual frequency or voltage scaling is done exclusively by the AVS
firmware. The driver merely provides a standard CPUfreq interface to
other kernel components and userland, and instructs the AVS firmware to
perform frequency or voltage changes on its behalf.

Signed-off-by: Markus Mayer <mmayer@broadcom.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 MAINTAINERS                           |   1 +
 drivers/cpufreq/Kconfig.arm           |  11 +
 drivers/cpufreq/Makefile              |   1 +
 drivers/cpufreq/brcmstb-avs-cpufreq.c | 736 ++++++++++++++++++++++++++++++++++
 4 files changed, 749 insertions(+)
 create mode 100644 drivers/cpufreq/brcmstb-avs-cpufreq.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 9379aca4bec4..d0ca1198d8ab 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2744,6 +2744,7 @@ M:	bcm-kernel-feedback-list@broadcom.com
 L:	linux-pm@vger.kernel.org
 S:	Maintained
 F:	Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt
+F:	drivers/cpufreq/brcmstb*
 
 BROADCOM SPECIFIC AMBA DRIVER (BCMA)
 M:	Rafał Miłecki <zajec5@gmail.com>
diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index fdbc630272b3..04f7862eacd3 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -12,6 +12,17 @@ config ARM_BIG_LITTLE_CPUFREQ
 	help
 	  This enables the Generic CPUfreq driver for ARM big.LITTLE platforms.
 
+config ARM_BRCMSTB_AVS_CPUFREQ
+	tristate "Broadcom STB AVS CPUfreq driver"
+	depends on ARCH_BRCMSTB || COMPILE_TEST
+	default y
+	help
+	  Some Broadcom STB SoCs use a co-processor running proprietary firmware
+	  ("AVS") to handle voltage and frequency scaling. This driver provides
+	  a standard CPUfreq interface to to the firmware.
+
+	  Say Y, if you have a Broadcom SoC with AVS support for DFS or DVFS.
+
 config ARM_DT_BL_CPUFREQ
 	tristate "Generic probing via DT for ARM big LITTLE CPUfreq driver"
 	depends on ARM_BIG_LITTLE_CPUFREQ && OF
diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index 7dde82179d62..1e46c3918e7a 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_ARM_BIG_LITTLE_CPUFREQ)	+= arm_big_little.o
 # LITTLE drivers, so that it is probed last.
 obj-$(CONFIG_ARM_DT_BL_CPUFREQ)		+= arm_big_little_dt.o
 
+obj-$(CONFIG_ARM_BRCMSTB_AVS_CPUFREQ)	+= brcmstb-avs-cpufreq.o
 obj-$(CONFIG_ARCH_DAVINCI)		+= davinci-cpufreq.o
 obj-$(CONFIG_UX500_SOC_DB8500)		+= dbx500-cpufreq.o
 obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ)	+= exynos5440-cpufreq.o
diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c
new file mode 100644
index 000000000000..4415fa051283
--- /dev/null
+++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c
@@ -0,0 +1,736 @@
+/*
+ * CPU frequency scaling for Broadcom SoCs with AVS firmware that
+ * supports DVS or DVFS
+ *
+ * Copyright (c) 2016 Broadcom
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation version 2.
+ *
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+/*
+ * "AVS" is the name of a firmware developed at Broadcom. It derives
+ * its name from the technique called "Adaptive Voltage Scaling".
+ * Adaptive voltage scaling was the original purpose of this firmware.
+ * The AVS firmware still supports "AVS mode", where all it does is
+ * adaptive voltage scaling. However, on some newer Broadcom SoCs, the
+ * AVS Firmware, despite its unchanged name, also supports DFS mode and
+ * DVFS mode.
+ *
+ * In the context of this document and the related driver, "AVS" by
+ * itself always means the Broadcom firmware and never refers to the
+ * technique called "Adaptive Voltage Scaling".
+ *
+ * The Broadcom STB AVS CPUfreq driver provides voltage and frequency
+ * scaling on Broadcom SoCs using AVS firmware with support for DFS and
+ * DVFS. The AVS firmware is running on its own co-processor. The
+ * driver supports both uniprocessor (UP) and symmetric multiprocessor
+ * (SMP) systems which share clock and voltage across all CPUs.
+ *
+ * Actual voltage and frequency scaling is done solely by the AVS
+ * firmware. This driver does not change frequency or voltage itself.
+ * It provides a standard CPUfreq interface to the rest of the kernel
+ * and to userland. It interfaces with the AVS firmware to effect the
+ * requested changes and to report back the current system status in a
+ * way that is expected by existing tools.
+ */
+
+#include <linux/cpufreq.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+#include <linux/semaphore.h>
+
+/* Max number of arguments AVS calls take */
+#define AVS_MAX_CMD_ARGS	4
+/*
+ * This macro is used to generate AVS parameter register offsets. For
+ * x >= AVS_MAX_CMD_ARGS, it returns 0 to protect against accidental memory
+ * access outside of the parameter range. (Offset 0 is the first parameter.)
+ */
+#define AVS_PARAM_MULT(x)	((x) < AVS_MAX_CMD_ARGS ? (x) : 0)
+
+/* AVS Mailbox Register offsets */
+#define AVS_MBOX_COMMAND	0x00
+#define AVS_MBOX_STATUS		0x04
+#define AVS_MBOX_VOLTAGE0	0x08
+#define AVS_MBOX_TEMP0		0x0c
+#define AVS_MBOX_PV0		0x10
+#define AVS_MBOX_MV0		0x14
+#define AVS_MBOX_PARAM(x)	(0x18 + AVS_PARAM_MULT(x) * sizeof(u32))
+#define AVS_MBOX_REVISION	0x28
+#define AVS_MBOX_PSTATE		0x2c
+#define AVS_MBOX_HEARTBEAT	0x30
+#define AVS_MBOX_MAGIC		0x34
+#define AVS_MBOX_SIGMA_HVT	0x38
+#define AVS_MBOX_SIGMA_SVT	0x3c
+#define AVS_MBOX_VOLTAGE1	0x40
+#define AVS_MBOX_TEMP1		0x44
+#define AVS_MBOX_PV1		0x48
+#define AVS_MBOX_MV1		0x4c
+#define AVS_MBOX_FREQUENCY	0x50
+
+/* AVS Commands */
+#define AVS_CMD_AVAILABLE	0x00
+#define AVS_CMD_DISABLE		0x10
+#define AVS_CMD_ENABLE		0x11
+#define AVS_CMD_S2_ENTER	0x12
+#define AVS_CMD_S2_EXIT		0x13
+#define AVS_CMD_BBM_ENTER	0x14
+#define AVS_CMD_BBM_EXIT	0x15
+#define AVS_CMD_S3_ENTER	0x16
+#define AVS_CMD_S3_EXIT		0x17
+#define AVS_CMD_BALANCE		0x18
+/* PMAP and P-STATE commands */
+#define AVS_CMD_GET_PMAP	0x30
+#define AVS_CMD_SET_PMAP	0x31
+#define AVS_CMD_GET_PSTATE	0x40
+#define AVS_CMD_SET_PSTATE	0x41
+
+/* Different modes AVS supports (for GET_PMAP/SET_PMAP) */
+#define AVS_MODE_AVS		0x0
+#define AVS_MODE_DFS		0x1
+#define AVS_MODE_DVS		0x2
+#define AVS_MODE_DVFS		0x3
+
+/*
+ * PMAP parameter p1
+ * unused:31-24, mdiv_p0:23-16, unused:15-14, pdiv:13-10 , ndiv_int:9-0
+ */
+#define NDIV_INT_SHIFT		0
+#define NDIV_INT_MASK		0x3ff
+#define PDIV_SHIFT		10
+#define PDIV_MASK		0xf
+#define MDIV_P0_SHIFT		16
+#define MDIV_P0_MASK		0xff
+/*
+ * PMAP parameter p2
+ * mdiv_p4:31-24, mdiv_p3:23-16, mdiv_p2:15:8, mdiv_p1:7:0
+ */
+#define MDIV_P1_SHIFT		0
+#define MDIV_P1_MASK		0xff
+#define MDIV_P2_SHIFT		8
+#define MDIV_P2_MASK		0xff
+#define MDIV_P3_SHIFT		16
+#define MDIV_P3_MASK		0xff
+#define MDIV_P4_SHIFT		24
+#define MDIV_P4_MASK		0xff
+
+/* Different P-STATES AVS supports (for GET_PSTATE/SET_PSTATE) */
+#define AVS_PSTATE_P0		0x0
+#define AVS_PSTATE_P1		0x1
+#define AVS_PSTATE_P2		0x2
+#define AVS_PSTATE_P3		0x3
+#define AVS_PSTATE_P4		0x4
+#define AVS_PSTATE_MAX		AVS_PSTATE_P4
+
+/* CPU L2 Interrupt Controller Registers */
+#define AVS_CPU_L2_SET0		0x04
+#define AVS_CPU_L2_INT_MASK	BIT(31)
+
+/* AVS Command Status Values */
+#define AVS_STATUS_CLEAR	0x00
+/* Command/notification accepted */
+#define AVS_STATUS_SUCCESS	0xf0
+/* Command/notification rejected */
+#define AVS_STATUS_FAILURE	0xff
+/* Invalid command/notification (unknown) */
+#define AVS_STATUS_INVALID	0xf1
+/* Non-AVS modes are not supported */
+#define AVS_STATUS_NO_SUPP	0xf2
+/* Cannot set P-State until P-Map supplied */
+#define AVS_STATUS_NO_MAP	0xf3
+/* Cannot change P-Map after initial P-Map set */
+#define AVS_STATUS_MAP_SET	0xf4
+/* Max AVS status; higher numbers are used for debugging */
+#define AVS_STATUS_MAX		0xff
+
+/* Other AVS related constants */
+#define AVS_LOOP_LIMIT		10000
+#define AVS_TIMEOUT		300 /* in ms; expected completion is < 10ms */
+#define AVS_FIRMWARE_MAGIC	0xa11600d1
+
+#define BRCM_AVS_CPUFREQ_PREFIX	"brcmstb-avs"
+#define BRCM_AVS_CPUFREQ_NAME	BRCM_AVS_CPUFREQ_PREFIX "-cpufreq"
+#define BRCM_AVS_CPU_DATA	"brcm,avs-cpu-data-mem"
+#define BRCM_AVS_CPU_INTR	"brcm,avs-cpu-l2-intr"
+#define BRCM_AVS_HOST_INTR	"sw_intr"
+
+struct pmap {
+	unsigned int mode;
+	unsigned int p1;
+	unsigned int p2;
+	unsigned int state;
+};
+
+struct private_data {
+	void __iomem *base;
+	void __iomem *avs_intr_base;
+	struct device *dev;
+	struct completion done;
+	struct semaphore sem;
+	struct pmap pmap;
+};
+
+static void __iomem *__map_region(const char *name)
+{
+	struct device_node *np;
+	void __iomem *ptr;
+
+	np = of_find_compatible_node(NULL, NULL, name);
+	if (!np)
+		return NULL;
+
+	ptr = of_iomap(np, 0);
+	of_node_put(np);
+
+	return ptr;
+}
+
+static int __issue_avs_command(struct private_data *priv, int cmd, bool is_send,
+			       u32 args[])
+{
+	unsigned long time_left = msecs_to_jiffies(AVS_TIMEOUT);
+	void __iomem *base = priv->base;
+	unsigned int i;
+	int ret;
+	u32 val;
+
+	ret = down_interruptible(&priv->sem);
+	if (ret)
+		return ret;
+
+	/*
+	 * Make sure no other command is currently running: cmd is 0 if AVS
+	 * co-processor is idle. Due to the guard above, we should almost never
+	 * have to wait here.
+	 */
+	for (i = 0, val = 1; val != 0 && i < AVS_LOOP_LIMIT; i++)
+		val = readl(base + AVS_MBOX_COMMAND);
+
+	/* Give the caller a chance to retry if AVS is busy. */
+	if (i == AVS_LOOP_LIMIT) {
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	/* Clear status before we begin. */
+	writel(AVS_STATUS_CLEAR, base + AVS_MBOX_STATUS);
+
+	/* We need to send arguments for this command. */
+	if (args && is_send) {
+		for (i = 0; i < AVS_MAX_CMD_ARGS; i++)
+			writel(args[i], base + AVS_MBOX_PARAM(i));
+	}
+
+	/* Protect from spurious interrupts. */
+	reinit_completion(&priv->done);
+
+	/* Now issue the command & tell firmware to wake up to process it. */
+	writel(cmd, base + AVS_MBOX_COMMAND);
+	writel(AVS_CPU_L2_INT_MASK, priv->avs_intr_base + AVS_CPU_L2_SET0);
+
+	/* Wait for AVS co-processor to finish processing the command. */
+	time_left = wait_for_completion_timeout(&priv->done, time_left);
+
+	/*
+	 * If the AVS status is not in the expected range, it means AVS didn't
+	 * complete our command in time, and we return an error. Also, if there
+	 * is no "time left", we timed out waiting for the interrupt.
+	 */
+	val = readl(base + AVS_MBOX_STATUS);
+	if (time_left == 0 || val == 0 || val > AVS_STATUS_MAX) {
+		dev_err(priv->dev, "AVS command %#x didn't complete in time\n",
+			cmd);
+		dev_err(priv->dev, "    Time left: %u ms, AVS status: %#x\n",
+			jiffies_to_msecs(time_left), val);
+		ret = -ETIMEDOUT;
+		goto out;
+	}
+
+	/* This command returned arguments, so we read them back. */
+	if (args && !is_send) {
+		for (i = 0; i < AVS_MAX_CMD_ARGS; i++)
+			args[i] = readl(base + AVS_MBOX_PARAM(i));
+	}
+
+	/* Clear status to tell AVS co-processor we are done. */
+	writel(AVS_STATUS_CLEAR, base + AVS_MBOX_STATUS);
+
+	/* Convert firmware errors to errno's as much as possible. */
+	switch (val) {
+	case AVS_STATUS_INVALID:
+		ret = -EINVAL;
+		break;
+	case AVS_STATUS_NO_SUPP:
+		ret = -ENOTSUPP;
+		break;
+	case AVS_STATUS_NO_MAP:
+		ret = -ENOENT;
+		break;
+	case AVS_STATUS_MAP_SET:
+		ret = -EEXIST;
+		break;
+	case AVS_STATUS_FAILURE:
+		ret = -EIO;
+		break;
+	}
+
+out:
+	up(&priv->sem);
+
+	return ret;
+}
+
+static irqreturn_t irq_handler(int irq, void *data)
+{
+	struct private_data *priv = data;
+
+	/* AVS command completed execution. Wake up __issue_avs_command(). */
+	complete(&priv->done);
+
+	return IRQ_HANDLED;
+}
+
+static char *brcm_avs_mode_to_string(unsigned int mode)
+{
+	switch (mode) {
+	case AVS_MODE_AVS:
+		return "AVS";
+	case AVS_MODE_DFS:
+		return "DFS";
+	case AVS_MODE_DVS:
+		return "DVS";
+	case AVS_MODE_DVFS:
+		return "DVFS";
+	}
+	return NULL;
+}
+
+static void brcm_avs_parse_p1(u32 p1, unsigned int *mdiv_p0, unsigned int *pdiv,
+			      unsigned int *ndiv)
+{
+	*mdiv_p0 = (p1 >> MDIV_P0_SHIFT) & MDIV_P0_MASK;
+	*pdiv = (p1 >> PDIV_SHIFT) & PDIV_MASK;
+	*ndiv = (p1 >> NDIV_INT_SHIFT) & NDIV_INT_MASK;
+}
+
+static void brcm_avs_parse_p2(u32 p2, unsigned int *mdiv_p1,
+			      unsigned int *mdiv_p2, unsigned int *mdiv_p3,
+			      unsigned int *mdiv_p4)
+{
+	*mdiv_p4 = (p2 >> MDIV_P4_SHIFT) & MDIV_P4_MASK;
+	*mdiv_p3 = (p2 >> MDIV_P3_SHIFT) & MDIV_P3_MASK;
+	*mdiv_p2 = (p2 >> MDIV_P2_SHIFT) & MDIV_P2_MASK;
+	*mdiv_p1 = (p2 >> MDIV_P1_SHIFT) & MDIV_P1_MASK;
+}
+
+static int brcm_avs_get_pmap(struct private_data *priv, struct pmap *pmap)
+{
+	u32 args[AVS_MAX_CMD_ARGS];
+	int ret;
+
+	ret = __issue_avs_command(priv, AVS_CMD_GET_PMAP, false, args);
+	if (ret || !pmap)
+		return ret;
+
+	pmap->mode = args[0];
+	pmap->p1 = args[1];
+	pmap->p2 = args[2];
+	pmap->state = args[3];
+
+	return 0;
+}
+
+static int brcm_avs_set_pmap(struct private_data *priv, struct pmap *pmap)
+{
+	u32 args[AVS_MAX_CMD_ARGS];
+
+	args[0] = pmap->mode;
+	args[1] = pmap->p1;
+	args[2] = pmap->p2;
+	args[3] = pmap->state;
+
+	return __issue_avs_command(priv, AVS_CMD_SET_PMAP, true, args);
+}
+
+static int brcm_avs_get_pstate(struct private_data *priv, unsigned int *pstate)
+{
+	u32 args[AVS_MAX_CMD_ARGS];
+	int ret;
+
+	ret = __issue_avs_command(priv, AVS_CMD_GET_PSTATE, false, args);
+	if (ret)
+		return ret;
+	*pstate = args[0];
+
+	return 0;
+}
+
+static int brcm_avs_set_pstate(struct private_data *priv, unsigned int pstate)
+{
+	u32 args[AVS_MAX_CMD_ARGS];
+
+	args[0] = pstate;
+
+	return __issue_avs_command(priv, AVS_CMD_SET_PSTATE, true, args);
+}
+
+static unsigned long brcm_avs_get_voltage(void __iomem *base)
+{
+	return readl(base + AVS_MBOX_VOLTAGE1);
+}
+
+static unsigned long brcm_avs_get_frequency(void __iomem *base)
+{
+	return readl(base + AVS_MBOX_FREQUENCY) * 1000;	/* in kHz */
+}
+
+/*
+ * We determine which frequencies are supported by cycling through all P-states
+ * and reading back what frequency we are running at for each P-state.
+ */
+static struct cpufreq_frequency_table *
+brcm_avs_get_freq_table(struct device *dev, struct private_data *priv)
+{
+	struct cpufreq_frequency_table *table;
+	unsigned int pstate;
+	int i, ret;
+
+	/* Remember P-state for later */
+	ret = brcm_avs_get_pstate(priv, &pstate);
+	if (ret)
+		return ERR_PTR(ret);
+
+	table = devm_kzalloc(dev, (AVS_PSTATE_MAX + 1) * sizeof(*table),
+			     GFP_KERNEL);
+	if (!table)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = AVS_PSTATE_P0; i <= AVS_PSTATE_MAX; i++) {
+		ret = brcm_avs_set_pstate(priv, i);
+		if (ret)
+			return ERR_PTR(ret);
+		table[i].frequency = brcm_avs_get_frequency(priv->base);
+		table[i].driver_data = i;
+	}
+	table[i].frequency = CPUFREQ_TABLE_END;
+
+	/* Restore P-state */
+	ret = brcm_avs_set_pstate(priv, pstate);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return table;
+}
+
+/*
+ * To ensure the right firmware is running we need to
+ *    - check the MAGIC matches what we expect
+ *    - brcm_avs_get_pmap() doesn't return -ENOTSUPP or -EINVAL
+ * We need to set up our interrupt handling before calling brcm_avs_get_pmap()!
+ */
+static bool brcm_avs_is_firmware_loaded(struct private_data *priv)
+{
+	u32 magic;
+	int rc;
+
+	rc = brcm_avs_get_pmap(priv, NULL);
+	magic = readl(priv->base + AVS_MBOX_MAGIC);
+
+	return (magic == AVS_FIRMWARE_MAGIC) && (rc != -ENOTSUPP) &&
+		(rc != -EINVAL);
+}
+
+static unsigned int brcm_avs_cpufreq_get(unsigned int cpu)
+{
+	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
+	struct private_data *priv = policy->driver_data;
+
+	return brcm_avs_get_frequency(priv->base);
+}
+
+static int brcm_avs_target_index(struct cpufreq_policy *policy,
+				 unsigned int index)
+{
+	return brcm_avs_set_pstate(policy->driver_data,
+				  policy->freq_table[index].driver_data);
+}
+
+static int brcm_avs_suspend(struct cpufreq_policy *policy)
+{
+	struct private_data *priv = policy->driver_data;
+
+	return brcm_avs_get_pmap(priv, &priv->pmap);
+}
+
+static int brcm_avs_resume(struct cpufreq_policy *policy)
+{
+	struct private_data *priv = policy->driver_data;
+	int ret;
+
+	ret = brcm_avs_set_pmap(priv, &priv->pmap);
+	if (ret == -EEXIST) {
+		struct platform_device *pdev  = cpufreq_get_driver_data();
+		struct device *dev = &pdev->dev;
+
+		dev_warn(dev, "PMAP was already set\n");
+		ret = 0;
+	}
+
+	return ret;
+}
+
+/*
+ * All initialization code that we only want to execute once goes here. Setup
+ * code that can be re-tried on every core (if it failed before) can go into
+ * brcm_avs_cpufreq_init().
+ */
+static int brcm_avs_prepare_init(struct platform_device *pdev)
+{
+	struct private_data *priv;
+	struct device *dev;
+	int host_irq, ret;
+
+	dev = &pdev->dev;
+	priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->dev = dev;
+	sema_init(&priv->sem, 1);
+	init_completion(&priv->done);
+	platform_set_drvdata(pdev, priv);
+
+	priv->base = __map_region(BRCM_AVS_CPU_DATA);
+	if (!priv->base) {
+		dev_err(dev, "Couldn't find property %s in device tree.\n",
+			BRCM_AVS_CPU_DATA);
+		return -ENOENT;
+	}
+
+	priv->avs_intr_base = __map_region(BRCM_AVS_CPU_INTR);
+	if (!priv->avs_intr_base) {
+		dev_err(dev, "Couldn't find property %s in device tree.\n",
+			BRCM_AVS_CPU_INTR);
+		ret = -ENOENT;
+		goto unmap_base;
+	}
+
+	host_irq = platform_get_irq_byname(pdev, BRCM_AVS_HOST_INTR);
+	if (host_irq < 0) {
+		dev_err(dev, "Couldn't find interrupt %s -- %d\n",
+			BRCM_AVS_HOST_INTR, host_irq);
+		ret = host_irq;
+		goto unmap_intr_base;
+	}
+
+	ret = devm_request_irq(dev, host_irq, irq_handler, IRQF_TRIGGER_RISING,
+			       BRCM_AVS_HOST_INTR, priv);
+	if (ret) {
+		dev_err(dev, "IRQ request failed: %s (%d) -- %d\n",
+			BRCM_AVS_HOST_INTR, host_irq, ret);
+		goto unmap_intr_base;
+	}
+
+	if (brcm_avs_is_firmware_loaded(priv))
+		return 0;
+
+	dev_err(dev, "AVS firmware is not loaded or doesn't support DVFS\n");
+	ret = -ENODEV;
+
+unmap_intr_base:
+	iounmap(priv->avs_intr_base);
+unmap_base:
+	iounmap(priv->base);
+	platform_set_drvdata(pdev, NULL);
+
+	return ret;
+}
+
+static int brcm_avs_cpufreq_init(struct cpufreq_policy *policy)
+{
+	struct cpufreq_frequency_table *freq_table;
+	struct platform_device *pdev;
+	struct private_data *priv;
+	struct device *dev;
+	int ret;
+
+	pdev = cpufreq_get_driver_data();
+	priv = platform_get_drvdata(pdev);
+	policy->driver_data = priv;
+	dev = &pdev->dev;
+
+	freq_table = brcm_avs_get_freq_table(dev, priv);
+	if (IS_ERR(freq_table)) {
+		ret = PTR_ERR(freq_table);
+		dev_err(dev, "Couldn't determine frequency table (%d).\n", ret);
+		return ret;
+	}
+
+	ret = cpufreq_table_validate_and_show(policy, freq_table);
+	if (ret) {
+		dev_err(dev, "invalid frequency table: %d\n", ret);
+		return ret;
+	}
+
+	/* All cores share the same clock and thus the same policy. */
+	cpumask_setall(policy->cpus);
+
+	ret = __issue_avs_command(priv, AVS_CMD_ENABLE, false, NULL);
+	if (!ret) {
+		unsigned int pstate;
+
+		ret = brcm_avs_get_pstate(priv, &pstate);
+		if (!ret) {
+			policy->cur = freq_table[pstate].frequency;
+			dev_info(dev, "registered\n");
+			return 0;
+		}
+	}
+
+	dev_err(dev, "couldn't initialize driver (%d)\n", ret);
+
+	return ret;
+}
+
+static ssize_t show_brcm_avs_pstate(struct cpufreq_policy *policy, char *buf)
+{
+	struct private_data *priv = policy->driver_data;
+	unsigned int pstate;
+
+	if (brcm_avs_get_pstate(priv, &pstate))
+		return sprintf(buf, "<unknown>\n");
+
+	return sprintf(buf, "%u\n", pstate);
+}
+
+static ssize_t show_brcm_avs_mode(struct cpufreq_policy *policy, char *buf)
+{
+	struct private_data *priv = policy->driver_data;
+	struct pmap pmap;
+
+	if (brcm_avs_get_pmap(priv, &pmap))
+		return sprintf(buf, "<unknown>\n");
+
+	return sprintf(buf, "%s %u\n", brcm_avs_mode_to_string(pmap.mode),
+		pmap.mode);
+}
+
+static ssize_t show_brcm_avs_pmap(struct cpufreq_policy *policy, char *buf)
+{
+	unsigned int mdiv_p0, mdiv_p1, mdiv_p2, mdiv_p3, mdiv_p4;
+	struct private_data *priv = policy->driver_data;
+	unsigned int ndiv, pdiv;
+	struct pmap pmap;
+
+	if (brcm_avs_get_pmap(priv, &pmap))
+		return sprintf(buf, "<unknown>\n");
+
+	brcm_avs_parse_p1(pmap.p1, &mdiv_p0, &pdiv, &ndiv);
+	brcm_avs_parse_p2(pmap.p2, &mdiv_p1, &mdiv_p2, &mdiv_p3, &mdiv_p4);
+
+	return sprintf(buf, "0x%08x 0x%08x %u %u %u %u %u %u %u\n",
+		pmap.p1, pmap.p2, ndiv, pdiv, mdiv_p0, mdiv_p1, mdiv_p2,
+		mdiv_p3, mdiv_p4);
+}
+
+static ssize_t show_brcm_avs_voltage(struct cpufreq_policy *policy, char *buf)
+{
+	struct private_data *priv = policy->driver_data;
+
+	return sprintf(buf, "0x%08lx\n", brcm_avs_get_voltage(priv->base));
+}
+
+static ssize_t show_brcm_avs_frequency(struct cpufreq_policy *policy, char *buf)
+{
+	struct private_data *priv = policy->driver_data;
+
+	return sprintf(buf, "0x%08lx\n", brcm_avs_get_frequency(priv->base));
+}
+
+cpufreq_freq_attr_ro(brcm_avs_pstate);
+cpufreq_freq_attr_ro(brcm_avs_mode);
+cpufreq_freq_attr_ro(brcm_avs_pmap);
+cpufreq_freq_attr_ro(brcm_avs_voltage);
+cpufreq_freq_attr_ro(brcm_avs_frequency);
+
+struct freq_attr *brcm_avs_cpufreq_attr[] = {
+	&cpufreq_freq_attr_scaling_available_freqs,
+	&brcm_avs_pstate,
+	&brcm_avs_mode,
+	&brcm_avs_pmap,
+	&brcm_avs_voltage,
+	&brcm_avs_frequency,
+	NULL
+};
+
+static struct cpufreq_driver brcm_avs_driver = {
+	.flags		= CPUFREQ_NEED_INITIAL_FREQ_CHECK,
+	.verify		= cpufreq_generic_frequency_table_verify,
+	.target_index	= brcm_avs_target_index,
+	.get		= brcm_avs_cpufreq_get,
+	.suspend	= brcm_avs_suspend,
+	.resume		= brcm_avs_resume,
+	.init		= brcm_avs_cpufreq_init,
+	.attr		= brcm_avs_cpufreq_attr,
+	.name		= BRCM_AVS_CPUFREQ_PREFIX,
+};
+
+static int brcm_avs_cpufreq_probe(struct platform_device *pdev)
+{
+	int ret;
+
+	ret = brcm_avs_prepare_init(pdev);
+	if (ret)
+		return ret;
+
+	brcm_avs_driver.driver_data = pdev;
+
+	return cpufreq_register_driver(&brcm_avs_driver);
+}
+
+static int brcm_avs_cpufreq_remove(struct platform_device *pdev)
+{
+	struct private_data *priv;
+	int ret;
+
+	ret = cpufreq_unregister_driver(&brcm_avs_driver);
+	if (ret)
+		return ret;
+
+	priv = platform_get_drvdata(pdev);
+	iounmap(priv->base);
+	iounmap(priv->avs_intr_base);
+	platform_set_drvdata(pdev, NULL);
+
+	return 0;
+}
+
+static const struct of_device_id brcm_avs_cpufreq_match[] = {
+	{ .compatible = BRCM_AVS_CPU_DATA },
+	{ }
+};
+MODULE_DEVICE_TABLE(of, brcm_avs_cpufreq_match);
+
+static struct platform_driver brcm_avs_cpufreq_platdrv = {
+	.driver = {
+		.name	= BRCM_AVS_CPUFREQ_NAME,
+		.of_match_table = brcm_avs_cpufreq_match,
+	},
+	.probe		= brcm_avs_cpufreq_probe,
+	.remove		= brcm_avs_cpufreq_remove,
+};
+module_platform_driver(brcm_avs_cpufreq_platdrv);
+
+MODULE_AUTHOR("Markus Mayer <mmayer@broadcom.com>");
+MODULE_DESCRIPTION("CPUfreq driver for Broadcom STB AVS");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 33de45c133b40a129a880c175af4c2bb39dafc88 Mon Sep 17 00:00:00 2001
From: Markus Mayer <mmayer@broadcom.com>
Date: Thu, 27 Oct 2016 14:05:36 -0700
Subject: cpufreq: brcmstb-avs-cpufreq: add debugfs support

In order to aid debugging, we add a debugfs interface to the driver
that allows direct interaction with the AVS co-processor.

The debugfs interface provides a means for reading all and writing some
of the mailbox registers directly from the shell prompt and enables a
user to execute the communications protocol between ARM CPU and AVS CPU
step-by-step.

This interface should be used for debugging purposes only.

Signed-off-by: Markus Mayer <mmayer@broadcom.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/Kconfig.arm           |  10 ++
 drivers/cpufreq/brcmstb-avs-cpufreq.c | 323 +++++++++++++++++++++++++++++++++-
 2 files changed, 332 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
index 04f7862eacd3..920c469f3953 100644
--- a/drivers/cpufreq/Kconfig.arm
+++ b/drivers/cpufreq/Kconfig.arm
@@ -23,6 +23,16 @@ config ARM_BRCMSTB_AVS_CPUFREQ
 
 	  Say Y, if you have a Broadcom SoC with AVS support for DFS or DVFS.
 
+config ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
+	bool "Broadcom STB AVS CPUfreq driver sysfs debug capability"
+	depends on ARM_BRCMSTB_AVS_CPUFREQ
+	help
+	  Enabling this option turns on debug support via sysfs under
+	  /sys/kernel/debug/brcmstb-avs-cpufreq. It is possible to read all and
+	  write some AVS mailbox registers through sysfs entries.
+
+	  If in doubt, say N.
+
 config ARM_DT_BL_CPUFREQ
 	tristate "Generic probing via DT for ARM big LITTLE CPUfreq driver"
 	depends on ARM_BIG_LITTLE_CPUFREQ && OF
diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c
index 4415fa051283..b761d54bb08d 100644
--- a/drivers/cpufreq/brcmstb-avs-cpufreq.c
+++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c
@@ -49,6 +49,13 @@
 #include <linux/platform_device.h>
 #include <linux/semaphore.h>
 
+#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#endif
+
 /* Max number of arguments AVS calls take */
 #define AVS_MAX_CMD_ARGS	4
 /*
@@ -175,11 +182,88 @@ struct private_data {
 	void __iomem *base;
 	void __iomem *avs_intr_base;
 	struct device *dev;
+#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
+	struct dentry *debugfs;
+#endif
 	struct completion done;
 	struct semaphore sem;
 	struct pmap pmap;
 };
 
+#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
+
+enum debugfs_format {
+	DEBUGFS_NORMAL,
+	DEBUGFS_FLOAT,
+	DEBUGFS_REV,
+};
+
+struct debugfs_data {
+	struct debugfs_entry *entry;
+	struct private_data *priv;
+};
+
+struct debugfs_entry {
+	char *name;
+	u32 offset;
+	fmode_t mode;
+	enum debugfs_format format;
+};
+
+#define DEBUGFS_ENTRY(name, mode, format)	{ \
+	#name, AVS_MBOX_##name, mode, format \
+}
+
+/*
+ * These are used for debugfs only. Otherwise we use AVS_MBOX_PARAM() directly.
+ */
+#define AVS_MBOX_PARAM1		AVS_MBOX_PARAM(0)
+#define AVS_MBOX_PARAM2		AVS_MBOX_PARAM(1)
+#define AVS_MBOX_PARAM3		AVS_MBOX_PARAM(2)
+#define AVS_MBOX_PARAM4		AVS_MBOX_PARAM(3)
+
+/*
+ * This table stores the name, access permissions and offset for each hardware
+ * register and is used to generate debugfs entries.
+ */
+static struct debugfs_entry debugfs_entries[] = {
+	DEBUGFS_ENTRY(COMMAND, S_IWUSR, DEBUGFS_NORMAL),
+	DEBUGFS_ENTRY(STATUS, S_IWUSR, DEBUGFS_NORMAL),
+	DEBUGFS_ENTRY(VOLTAGE0, 0, DEBUGFS_FLOAT),
+	DEBUGFS_ENTRY(TEMP0, 0, DEBUGFS_FLOAT),
+	DEBUGFS_ENTRY(PV0, 0, DEBUGFS_FLOAT),
+	DEBUGFS_ENTRY(MV0, 0, DEBUGFS_FLOAT),
+	DEBUGFS_ENTRY(PARAM1, S_IWUSR, DEBUGFS_NORMAL),
+	DEBUGFS_ENTRY(PARAM2, S_IWUSR, DEBUGFS_NORMAL),
+	DEBUGFS_ENTRY(PARAM3, S_IWUSR, DEBUGFS_NORMAL),
+	DEBUGFS_ENTRY(PARAM4, S_IWUSR, DEBUGFS_NORMAL),
+	DEBUGFS_ENTRY(REVISION, 0, DEBUGFS_REV),
+	DEBUGFS_ENTRY(PSTATE, 0, DEBUGFS_NORMAL),
+	DEBUGFS_ENTRY(HEARTBEAT, 0, DEBUGFS_NORMAL),
+	DEBUGFS_ENTRY(MAGIC, S_IWUSR, DEBUGFS_NORMAL),
+	DEBUGFS_ENTRY(SIGMA_HVT, 0, DEBUGFS_NORMAL),
+	DEBUGFS_ENTRY(SIGMA_SVT, 0, DEBUGFS_NORMAL),
+	DEBUGFS_ENTRY(VOLTAGE1, 0, DEBUGFS_FLOAT),
+	DEBUGFS_ENTRY(TEMP1, 0, DEBUGFS_FLOAT),
+	DEBUGFS_ENTRY(PV1, 0, DEBUGFS_FLOAT),
+	DEBUGFS_ENTRY(MV1, 0, DEBUGFS_FLOAT),
+	DEBUGFS_ENTRY(FREQUENCY, 0, DEBUGFS_NORMAL),
+};
+
+static int brcm_avs_target_index(struct cpufreq_policy *, unsigned int);
+
+static char *__strtolower(char *s)
+{
+	char *p;
+
+	for (p = s; *p; p++)
+		*p = tolower(*p);
+
+	return s;
+}
+
+#endif /* CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG */
+
 static void __iomem *__map_region(const char *name)
 {
 	struct device_node *np;
@@ -432,6 +516,238 @@ brcm_avs_get_freq_table(struct device *dev, struct private_data *priv)
 	return table;
 }
 
+#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG
+
+#define MANT(x)	(unsigned int)(abs((x)) / 1000)
+#define FRAC(x)	(unsigned int)(abs((x)) - abs((x)) / 1000 * 1000)
+
+static int brcm_avs_debug_show(struct seq_file *s, void *data)
+{
+	struct debugfs_data *dbgfs = s->private;
+	void __iomem *base;
+	u32 val, offset;
+
+	if (!dbgfs) {
+		seq_puts(s, "No device pointer\n");
+		return 0;
+	}
+
+	base = dbgfs->priv->base;
+	offset = dbgfs->entry->offset;
+	val = readl(base + offset);
+	switch (dbgfs->entry->format) {
+	case DEBUGFS_NORMAL:
+		seq_printf(s, "%u\n", val);
+		break;
+	case DEBUGFS_FLOAT:
+		seq_printf(s, "%d.%03d\n", MANT(val), FRAC(val));
+		break;
+	case DEBUGFS_REV:
+		seq_printf(s, "%c.%c.%c.%c\n", (val >> 24 & 0xff),
+			   (val >> 16 & 0xff), (val >> 8 & 0xff),
+			   val & 0xff);
+		break;
+	}
+	seq_printf(s, "0x%08x\n", val);
+
+	return 0;
+}
+
+#undef MANT
+#undef FRAC
+
+static ssize_t brcm_avs_seq_write(struct file *file, const char __user *buf,
+				  size_t size, loff_t *ppos)
+{
+	struct seq_file *s = file->private_data;
+	struct debugfs_data *dbgfs = s->private;
+	struct private_data *priv = dbgfs->priv;
+	void __iomem *base, *avs_intr_base;
+	bool use_issue_command = false;
+	unsigned long val, offset;
+	char str[128];
+	int ret;
+	char *str_ptr = str;
+
+	if (size >= sizeof(str))
+		return -E2BIG;
+
+	memset(str, 0, sizeof(str));
+	ret = copy_from_user(str, buf, size);
+	if (ret)
+		return ret;
+
+	base = priv->base;
+	avs_intr_base = priv->avs_intr_base;
+	offset = dbgfs->entry->offset;
+	/*
+	 * Special case writing to "command" entry only: if the string starts
+	 * with a 'c', we use the driver's __issue_avs_command() function.
+	 * Otherwise, we perform a raw write. This should allow testing of raw
+	 * access as well as using the higher level function. (Raw access
+	 * doesn't clear the firmware return status after issuing the command.)
+	 */
+	if (str_ptr[0] == 'c' && offset == AVS_MBOX_COMMAND) {
+		use_issue_command = true;
+		str_ptr++;
+	}
+	if (kstrtoul(str_ptr, 0, &val) != 0)
+		return -EINVAL;
+
+	/*
+	 * Setting the P-state is a special case. We need to update the CPU
+	 * frequency we report.
+	 */
+	if (val == AVS_CMD_SET_PSTATE) {
+		struct cpufreq_policy *policy;
+		unsigned int pstate;
+
+		policy = cpufreq_cpu_get(smp_processor_id());
+		/* Read back the P-state we are about to set */
+		pstate = readl(base + AVS_MBOX_PARAM(0));
+		if (use_issue_command) {
+			ret = brcm_avs_target_index(policy, pstate);
+			return ret ? ret : size;
+		}
+		policy->cur = policy->freq_table[pstate].frequency;
+	}
+
+	if (use_issue_command) {
+		ret = __issue_avs_command(priv, val, false, NULL);
+	} else {
+		/* Locking here is not perfect, but is only for debug. */
+		ret = down_interruptible(&priv->sem);
+		if (ret)
+			return ret;
+
+		writel(val, base + offset);
+		/* We have to wake up the firmware to process a command. */
+		if (offset == AVS_MBOX_COMMAND)
+			writel(AVS_CPU_L2_INT_MASK,
+			       avs_intr_base + AVS_CPU_L2_SET0);
+		up(&priv->sem);
+	}
+
+	return ret ? ret : size;
+}
+
+static struct debugfs_entry *__find_debugfs_entry(const char *name)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(debugfs_entries); i++)
+		if (strcasecmp(debugfs_entries[i].name, name) == 0)
+			return &debugfs_entries[i];
+
+	return NULL;
+}
+
+static int brcm_avs_debug_open(struct inode *inode, struct file *file)
+{
+	struct debugfs_data *data;
+	fmode_t fmode;
+	int ret;
+
+	/*
+	 * seq_open(), which is called by single_open(), clears "write" access.
+	 * We need write access to some files, so we preserve our access mode
+	 * and restore it.
+	 */
+	fmode = file->f_mode;
+	/*
+	 * Check access permissions even for root. We don't want to be writing
+	 * to read-only registers. Access for regular users has already been
+	 * checked by the VFS layer.
+	 */
+	if ((fmode & FMODE_WRITER) && !(inode->i_mode & S_IWUSR))
+		return -EACCES;
+
+	data = kmalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	/*
+	 * We use the same file system operations for all our debug files. To
+	 * produce specific output, we look up the file name upon opening a
+	 * debugfs entry and map it to a memory offset. This offset is then used
+	 * in the generic "show" function to read a specific register.
+	 */
+	data->entry = __find_debugfs_entry(file->f_path.dentry->d_iname);
+	data->priv = inode->i_private;
+
+	ret = single_open(file, brcm_avs_debug_show, data);
+	if (ret)
+		kfree(data);
+	file->f_mode = fmode;
+
+	return ret;
+}
+
+static int brcm_avs_debug_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq_priv = file->private_data;
+	struct debugfs_data *data = seq_priv->private;
+
+	kfree(data);
+	return single_release(inode, file);
+}
+
+static const struct file_operations brcm_avs_debug_ops = {
+	.open		= brcm_avs_debug_open,
+	.read		= seq_read,
+	.write		= brcm_avs_seq_write,
+	.llseek		= seq_lseek,
+	.release	= brcm_avs_debug_release,
+};
+
+static void brcm_avs_cpufreq_debug_init(struct platform_device *pdev)
+{
+	struct private_data *priv = platform_get_drvdata(pdev);
+	struct dentry *dir;
+	int i;
+
+	if (!priv)
+		return;
+
+	dir = debugfs_create_dir(BRCM_AVS_CPUFREQ_NAME, NULL);
+	if (IS_ERR_OR_NULL(dir))
+		return;
+	priv->debugfs = dir;
+
+	for (i = 0; i < ARRAY_SIZE(debugfs_entries); i++) {
+		/*
+		 * The DEBUGFS_ENTRY macro generates uppercase strings. We
+		 * convert them to lowercase before creating the debugfs
+		 * entries.
+		 */
+		char *entry = __strtolower(debugfs_entries[i].name);
+		fmode_t mode = debugfs_entries[i].mode;
+
+		if (!debugfs_create_file(entry, S_IFREG | S_IRUGO | mode,
+					 dir, priv, &brcm_avs_debug_ops)) {
+			priv->debugfs = NULL;
+			debugfs_remove_recursive(dir);
+			break;
+		}
+	}
+}
+
+static void brcm_avs_cpufreq_debug_exit(struct platform_device *pdev)
+{
+	struct private_data *priv = platform_get_drvdata(pdev);
+
+	if (priv && priv->debugfs) {
+		debugfs_remove_recursive(priv->debugfs);
+		priv->debugfs = NULL;
+	}
+}
+
+#else
+
+static void brcm_avs_cpufreq_debug_init(struct platform_device *pdev) {}
+static void brcm_avs_cpufreq_debug_exit(struct platform_device *pdev) {}
+
+#endif /* CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG */
+
 /*
  * To ensure the right firmware is running we need to
  *    - check the MAGIC matches what we expect
@@ -694,8 +1010,11 @@ static int brcm_avs_cpufreq_probe(struct platform_device *pdev)
 		return ret;
 
 	brcm_avs_driver.driver_data = pdev;
+	ret = cpufreq_register_driver(&brcm_avs_driver);
+	if (!ret)
+		brcm_avs_cpufreq_debug_init(pdev);
 
-	return cpufreq_register_driver(&brcm_avs_driver);
+	return ret;
 }
 
 static int brcm_avs_cpufreq_remove(struct platform_device *pdev)
@@ -707,6 +1026,8 @@ static int brcm_avs_cpufreq_remove(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
+	brcm_avs_cpufreq_debug_exit(pdev);
+
 	priv = platform_get_drvdata(pdev);
 	iounmap(priv->base);
 	iounmap(priv->avs_intr_base);
-- 
cgit v1.2.3


From a410c03d668122923ab9b17c0c5728b520edddb4 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Fri, 28 Oct 2016 10:44:52 -0700
Subject: cpufreq: intel_pstate: protect limits variable

The limits variable gets modified from intel_pstate sysfs and also gets
modified from cpufreq sysfs. So protect with a mutex to keep data
integrity, when they are getting modified from multiple threads.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 8e7390bb2175..0837175395b9 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -373,6 +373,8 @@ static struct perf_limits *limits = &performance_limits;
 static struct perf_limits *limits = &powersave_limits;
 #endif
 
+static DEFINE_MUTEX(intel_pstate_limits_lock);
+
 #ifdef CONFIG_ACPI
 
 static bool intel_pstate_get_ppc_enable_status(void)
@@ -730,14 +732,19 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
 	if (ret != 1)
 		return -EINVAL;
 
+	mutex_lock(&intel_pstate_limits_lock);
+
 	update_turbo_state();
 	if (limits->turbo_disabled) {
 		pr_warn("Turbo disabled by BIOS or unavailable on processor\n");
+		mutex_unlock(&intel_pstate_limits_lock);
 		return -EPERM;
 	}
 
 	limits->no_turbo = clamp_t(int, input, 0, 1);
 
+	mutex_unlock(&intel_pstate_limits_lock);
+
 	if (hwp_active)
 		intel_pstate_hwp_set_online_cpus();
 
@@ -754,6 +761,8 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
 	if (ret != 1)
 		return -EINVAL;
 
+	mutex_lock(&intel_pstate_limits_lock);
+
 	limits->max_sysfs_pct = clamp_t(int, input, 0 , 100);
 	limits->max_perf_pct = min(limits->max_policy_pct,
 				   limits->max_sysfs_pct);
@@ -763,6 +772,8 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
 				   limits->max_perf_pct);
 	limits->max_perf = div_fp(limits->max_perf_pct, 100);
 
+	mutex_unlock(&intel_pstate_limits_lock);
+
 	if (hwp_active)
 		intel_pstate_hwp_set_online_cpus();
 	return count;
@@ -778,6 +789,8 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
 	if (ret != 1)
 		return -EINVAL;
 
+	mutex_lock(&intel_pstate_limits_lock);
+
 	limits->min_sysfs_pct = clamp_t(int, input, 0 , 100);
 	limits->min_perf_pct = max(limits->min_policy_pct,
 				   limits->min_sysfs_pct);
@@ -787,6 +800,8 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
 				   limits->min_perf_pct);
 	limits->min_perf = div_fp(limits->min_perf_pct, 100);
 
+	mutex_unlock(&intel_pstate_limits_lock);
+
 	if (hwp_active)
 		intel_pstate_hwp_set_online_cpus();
 	return count;
@@ -1528,6 +1543,7 @@ static void intel_pstate_clear_update_util_hook(unsigned int cpu)
 
 static void intel_pstate_set_performance_limits(struct perf_limits *limits)
 {
+	mutex_lock(&intel_pstate_limits_lock);
 	limits->no_turbo = 0;
 	limits->turbo_disabled = 0;
 	limits->max_perf_pct = 100;
@@ -1538,11 +1554,15 @@ static void intel_pstate_set_performance_limits(struct perf_limits *limits)
 	limits->max_sysfs_pct = 100;
 	limits->min_policy_pct = 0;
 	limits->min_sysfs_pct = 0;
+	mutex_unlock(&intel_pstate_limits_lock);
 }
 
 static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
 					    struct perf_limits *limits)
 {
+
+	mutex_lock(&intel_pstate_limits_lock);
+
 	limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100,
 					      policy->cpuinfo.max_freq);
 	limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0, 100);
@@ -1572,6 +1592,8 @@ static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
 	limits->max_perf = div_fp(limits->max_perf_pct, 100);
 	limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
 
+	mutex_unlock(&intel_pstate_limits_lock);
+
 	pr_debug("cpu:%d max_perf_pct:%d min_perf_pct:%d\n", policy->cpu,
 		 limits->max_perf_pct, limits->min_perf_pct);
 }
-- 
cgit v1.2.3


From 8812872960824681147fad051e6e1406fdfa07f9 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Thu, 27 Oct 2016 13:23:54 +0200
Subject: net: smsc911x: Synchronize the runtime PM status during system
 suspend

The smsc911c driver puts its device into low power state when entering
system suspend. Although it doesn't update the device's runtime PM status
to RPM_SUSPENDED, which causes problems for a parent device.

In particular, when the runtime PM status of the parent is requested to be
updated to RPM_SUSPENDED, the runtime PM core prevent this, because it's
forbidden to runtime suspend a device, which has an active child.

Fix this by updating the runtime PM status of the smsc911x device to
RPM_SUSPENDED during system suspend. In system resume, let's reverse that
action by runtime resuming the device and thus also the parent.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/net/ethernet/smsc/smsc911x.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c
index e9b8579e6241..65fca9ca26ba 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -2584,6 +2584,9 @@ static int smsc911x_suspend(struct device *dev)
 		PMT_CTRL_PM_MODE_D1_ | PMT_CTRL_WOL_EN_ |
 		PMT_CTRL_ED_EN_ | PMT_CTRL_PME_EN_);
 
+	pm_runtime_disable(dev);
+	pm_runtime_set_suspended(dev);
+
 	return 0;
 }
 
@@ -2593,6 +2596,9 @@ static int smsc911x_resume(struct device *dev)
 	struct smsc911x_data *pdata = netdev_priv(ndev);
 	unsigned int to = 100;
 
+	pm_runtime_enable(dev);
+	pm_runtime_resume(dev);
+
 	/* Note 3.11 from the datasheet:
 	 * 	"When the LAN9220 is in a power saving state, a write of any
 	 * 	 data to the BYTE_TEST register will wake-up the device."
-- 
cgit v1.2.3


From a8636c89648ab12e59d8f3aa667ec76fc96fd643 Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Mon, 17 Oct 2016 20:17:01 +0200
Subject: PM / Runtime: Don't allow to suspend a device with an active child

When resuming a device in __pm_runtime_set_status(), the prerequisite is
that its parent must already be active, else an error code is returned and
the device's status remains suspended.

When suspending a device there is no similar constraints being validated.
Let's change this to make the behaviour consistent, by not allowing to
suspend a device with an active child, unless it has been explicitly set to
ignore its children.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 117db71e84cb..60ebb04d8140 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1028,7 +1028,17 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status)
 		goto out_set;
 
 	if (status == RPM_SUSPENDED) {
-		/* It always is possible to set the status to 'suspended'. */
+		/*
+		 * It is invalid to suspend a device with an active child,
+		 * unless it has been set to ignore its children.
+		 */
+		if (!dev->power.ignore_children &&
+			atomic_read(&dev->power.child_count)) {
+			dev_err(dev, "runtime PM trying to suspend device but active child\n");
+			error = -EBUSY;
+			goto out;
+		}
+
 		if (parent) {
 			atomic_add_unless(&parent->power.child_count, -1, 0);
 			notify_parent = !parent->power.ignore_children;
-- 
cgit v1.2.3


From e7d040b8a2ea36a40f0d4176fe558d4e813fb715 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Thu, 10 Nov 2016 15:19:05 +0000
Subject: cpufreq: brcmstb-avs-cpufreq: make symbol brcm_avs_cpufreq_attr
 static

Fixes the following sparse warning:

drivers/cpufreq/brcmstb-avs-cpufreq.c:982:18: warning:
 symbol 'brcm_avs_cpufreq_attr' was not declared. Should it be static?

Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Acked-by: Markus Mayer <mmayer@broadcom.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/brcmstb-avs-cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c
index b761d54bb08d..4fda623e55bb 100644
--- a/drivers/cpufreq/brcmstb-avs-cpufreq.c
+++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c
@@ -979,7 +979,7 @@ cpufreq_freq_attr_ro(brcm_avs_pmap);
 cpufreq_freq_attr_ro(brcm_avs_voltage);
 cpufreq_freq_attr_ro(brcm_avs_frequency);
 
-struct freq_attr *brcm_avs_cpufreq_attr[] = {
+static struct freq_attr *brcm_avs_cpufreq_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	&brcm_avs_pstate,
 	&brcm_avs_mode,
-- 
cgit v1.2.3


From 44cae7d5a1b4a4e4db24eae478a9cfba0ea2b9d2 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 10 Nov 2016 15:52:15 +0300
Subject: PM / Domains: Fix a warning message

The first argument of WARN() is the condition, followed by the message.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Acked-by: Ulf Hansson <ulf.hansson@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 661737c2bae0..7b4d41ff9506 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -1297,7 +1297,7 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd,
 	 * powered on/off in that context.
 	 */
 	if (!genpd_is_irq_safe(genpd) && genpd_is_irq_safe(subdomain)) {
-		WARN("Parent %s of subdomain %s must be IRQ safe\n",
+		WARN(1, "Parent %s of subdomain %s must be IRQ safe\n",
 				genpd->name, subdomain->name);
 		return -EINVAL;
 	}
-- 
cgit v1.2.3


From 60c9efb8f783194200bdb510df8d9d4fb398016e Mon Sep 17 00:00:00 2001
From: Akshay Adiga <akshay.adiga@linux.vnet.ibm.com>
Date: Tue, 8 Nov 2016 19:03:27 +0530
Subject: cpufreq: powernv: Adding fast_switch for schedutil

Adding fast_switch which does light weight operation to set the desired
pstate. Both global and local pstates are set to the same desired pstate.

Signed-off-by: Akshay Adiga <akshay.adiga@linux.vnet.ibm.com>
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index d3ffde806629..4a4380d728ad 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -752,9 +752,12 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	spin_lock_init(&gpstates->gpstate_lock);
 	ret = cpufreq_table_validate_and_show(policy, powernv_freqs);
 
-	if (ret < 0)
+	if (ret < 0) {
 		kfree(policy->driver_data);
+		return ret;
+	}
 
+	policy->fast_switch_possible = true;
 	return ret;
 }
 
@@ -897,6 +900,20 @@ static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
 	del_timer_sync(&gpstates->timer);
 }
 
+static unsigned int powernv_fast_switch(struct cpufreq_policy *policy,
+					unsigned int target_freq)
+{
+	int index;
+	struct powernv_smp_call_data freq_data;
+
+	index = cpufreq_table_find_index_dl(policy, target_freq);
+	freq_data.pstate_id = powernv_freqs[index].driver_data;
+	freq_data.gpstate_id = powernv_freqs[index].driver_data;
+	set_pstate(&freq_data);
+
+	return powernv_freqs[index].frequency;
+}
+
 static struct cpufreq_driver powernv_cpufreq_driver = {
 	.name		= "powernv-cpufreq",
 	.flags		= CPUFREQ_CONST_LOOPS,
@@ -904,6 +921,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = {
 	.exit		= powernv_cpufreq_cpu_exit,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= powernv_cpufreq_target_index,
+	.fast_switch	= powernv_fast_switch,
 	.get		= powernv_cpufreq_get,
 	.stop_cpu	= powernv_cpufreq_stop_cpu,
 	.attr		= powernv_cpu_freq_attr,
-- 
cgit v1.2.3


From 20b15b7663549b34abcb4a18f16fedb239406c41 Mon Sep 17 00:00:00 2001
From: Akshay Adiga <akshay.adiga@linux.vnet.ibm.com>
Date: Tue, 8 Nov 2016 19:03:28 +0530
Subject: cpufreq: powernv: Use PMCR to verify global and local pstate

As fast_switch() may get called with interrupt disable mode, we cannot
hold a mutex to update the global_pstate_info. So currently, fast_switch()
does not update the global_pstate_info and it will end up with stale data
whenever pstate is updated through fast_switch().

As the gpstate_timer can fire after fast_switch() has updated the pstates,
the timer handler cannot rely on the cached values of local and global
pstate and needs to read it from the PMCR.

Only gpstate_timer_handler() is affected by the stale cached pstate data
beacause either fast_switch() or target_index() routines will be called
for a given govenor, but gpstate_timer can fire after the governor has
changed to schedutil.

Signed-off-by: Akshay Adiga <akshay.adiga@linux.vnet.ibm.com>
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index 4a4380d728ad..c82304b7a4c0 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -42,6 +42,10 @@
 #define PMSR_PSAFE_ENABLE	(1UL << 30)
 #define PMSR_SPR_EM_DISABLE	(1UL << 31)
 #define PMSR_MAX(x)		((x >> 32) & 0xFF)
+#define LPSTATE_SHIFT		48
+#define GPSTATE_SHIFT		56
+#define GET_LPSTATE(x)		(((x) >> LPSTATE_SHIFT) & 0xFF)
+#define GET_GPSTATE(x)		(((x) >> GPSTATE_SHIFT) & 0xFF)
 
 #define MAX_RAMP_DOWN_TIME				5120
 /*
@@ -592,7 +596,8 @@ void gpstate_timer_handler(unsigned long data)
 {
 	struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
 	struct global_pstate_info *gpstates = policy->driver_data;
-	int gpstate_idx;
+	int gpstate_idx, lpstate_idx;
+	unsigned long val;
 	unsigned int time_diff = jiffies_to_msecs(jiffies)
 					- gpstates->last_sampled_time;
 	struct powernv_smp_call_data freq_data;
@@ -600,21 +605,36 @@ void gpstate_timer_handler(unsigned long data)
 	if (!spin_trylock(&gpstates->gpstate_lock))
 		return;
 
+	/*
+	 * If PMCR was last updated was using fast_swtich then
+	 * We may have wrong in gpstate->last_lpstate_idx
+	 * value. Hence, read from PMCR to get correct data.
+	 */
+	val = get_pmspr(SPRN_PMCR);
+	freq_data.gpstate_id = (s8)GET_GPSTATE(val);
+	freq_data.pstate_id = (s8)GET_LPSTATE(val);
+	if (freq_data.gpstate_id  == freq_data.pstate_id) {
+		reset_gpstates(policy);
+		spin_unlock(&gpstates->gpstate_lock);
+		return;
+	}
+
 	gpstates->last_sampled_time += time_diff;
 	gpstates->elapsed_time += time_diff;
-	freq_data.pstate_id = idx_to_pstate(gpstates->last_lpstate_idx);
 
-	if ((gpstates->last_gpstate_idx == gpstates->last_lpstate_idx) ||
-	    (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) {
+	if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
 		gpstate_idx = pstate_to_idx(freq_data.pstate_id);
 		reset_gpstates(policy);
 		gpstates->highest_lpstate_idx = gpstate_idx;
 	} else {
+		lpstate_idx = pstate_to_idx(freq_data.pstate_id);
 		gpstate_idx = calc_global_pstate(gpstates->elapsed_time,
 						 gpstates->highest_lpstate_idx,
-						 gpstates->last_lpstate_idx);
+						 lpstate_idx);
 	}
-
+	freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
+	gpstates->last_gpstate_idx = gpstate_idx;
+	gpstates->last_lpstate_idx = lpstate_idx;
 	/*
 	 * If local pstate is equal to global pstate, rampdown is over
 	 * So timer is not required to be queued.
@@ -622,10 +642,6 @@ void gpstate_timer_handler(unsigned long data)
 	if (gpstate_idx != gpstates->last_lpstate_idx)
 		queue_gpstate_timer(gpstates);
 
-	freq_data.gpstate_id = idx_to_pstate(gpstate_idx);
-	gpstates->last_gpstate_idx = pstate_to_idx(freq_data.gpstate_id);
-	gpstates->last_lpstate_idx = pstate_to_idx(freq_data.pstate_id);
-
 	spin_unlock(&gpstates->gpstate_lock);
 
 	/* Timer may get migrated to a different cpu on cpu hot unplug */
-- 
cgit v1.2.3


From 26f0dbc9ab158afe86bac5ece2fcaf873d6bd8ad Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 8 Nov 2016 11:06:33 +0530
Subject: cpufreq: governor: Don't use 'timer' keyword

The earlier implementation of governors used background timers and so
functions, mutex, etc had 'timer' keyword in their names.

But that's not true anymore. Replace 'timer' with 'update', as those
functions, variables are based around updates to frequency.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_conservative.c |  4 ++--
 drivers/cpufreq/cpufreq_governor.c     | 18 +++++++++---------
 drivers/cpufreq/cpufreq_governor.h     |  4 ++--
 drivers/cpufreq/cpufreq_ondemand.c     | 15 +++++++--------
 4 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 13475890d792..fa5ece3915a1 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -58,7 +58,7 @@ static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners,
  * Any frequency increase takes it to the maximum frequency. Frequency reduction
  * happens at minimum steps of 5% (default) of maximum frequency
  */
-static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
+static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
 {
 	struct policy_dbs_info *policy_dbs = policy->governor_data;
 	struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs);
@@ -305,7 +305,7 @@ static void cs_start(struct cpufreq_policy *policy)
 static struct dbs_governor cs_governor = {
 	.gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("conservative"),
 	.kobj_type = { .default_attrs = cs_attributes },
-	.gov_dbs_timer = cs_dbs_timer,
+	.gov_dbs_update = cs_dbs_update,
 	.alloc = cs_alloc,
 	.free = cs_free,
 	.init = cs_init,
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 642dd0f183a8..372947416b75 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -61,7 +61,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
 	 * entries can't be freed concurrently.
 	 */
 	list_for_each_entry(policy_dbs, &attr_set->policy_list, list) {
-		mutex_lock(&policy_dbs->timer_mutex);
+		mutex_lock(&policy_dbs->update_mutex);
 		/*
 		 * On 32-bit architectures this may race with the
 		 * sample_delay_ns read in dbs_update_util_handler(), but that
@@ -76,7 +76,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
 		 * taken, so it shouldn't be significant.
 		 */
 		gov_update_sample_delay(policy_dbs, 0);
-		mutex_unlock(&policy_dbs->timer_mutex);
+		mutex_unlock(&policy_dbs->update_mutex);
 	}
 
 	return count;
@@ -236,9 +236,9 @@ static void dbs_work_handler(struct work_struct *work)
 	 * Make sure cpufreq_governor_limits() isn't evaluating load or the
 	 * ondemand governor isn't updating the sampling rate in parallel.
 	 */
-	mutex_lock(&policy_dbs->timer_mutex);
-	gov_update_sample_delay(policy_dbs, gov->gov_dbs_timer(policy));
-	mutex_unlock(&policy_dbs->timer_mutex);
+	mutex_lock(&policy_dbs->update_mutex);
+	gov_update_sample_delay(policy_dbs, gov->gov_dbs_update(policy));
+	mutex_unlock(&policy_dbs->update_mutex);
 
 	/* Allow the utilization update handler to queue up more work. */
 	atomic_set(&policy_dbs->work_count, 0);
@@ -348,7 +348,7 @@ static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *poli
 		return NULL;
 
 	policy_dbs->policy = policy;
-	mutex_init(&policy_dbs->timer_mutex);
+	mutex_init(&policy_dbs->update_mutex);
 	atomic_set(&policy_dbs->work_count, 0);
 	init_irq_work(&policy_dbs->irq_work, dbs_irq_work);
 	INIT_WORK(&policy_dbs->work, dbs_work_handler);
@@ -367,7 +367,7 @@ static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs,
 {
 	int j;
 
-	mutex_destroy(&policy_dbs->timer_mutex);
+	mutex_destroy(&policy_dbs->update_mutex);
 
 	for_each_cpu(j, policy_dbs->policy->related_cpus) {
 		struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j);
@@ -547,10 +547,10 @@ void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy)
 {
 	struct policy_dbs_info *policy_dbs = policy->governor_data;
 
-	mutex_lock(&policy_dbs->timer_mutex);
+	mutex_lock(&policy_dbs->update_mutex);
 	cpufreq_policy_apply_limits(policy);
 	gov_update_sample_delay(policy_dbs, 0);
 
-	mutex_unlock(&policy_dbs->timer_mutex);
+	mutex_unlock(&policy_dbs->update_mutex);
 }
 EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_limits);
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index ef1037e9c92b..9660cc6b4b39 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -85,7 +85,7 @@ struct policy_dbs_info {
 	 * Per policy mutex that serializes load evaluation from limit-change
 	 * and work-handler.
 	 */
-	struct mutex timer_mutex;
+	struct mutex update_mutex;
 
 	u64 last_sample_time;
 	s64 sample_delay_ns;
@@ -135,7 +135,7 @@ struct dbs_governor {
 	 */
 	struct dbs_data *gdbs_data;
 
-	unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy);
+	unsigned int (*gov_dbs_update)(struct cpufreq_policy *policy);
 	struct policy_dbs_info *(*alloc)(void);
 	void (*free)(struct policy_dbs_info *policy_dbs);
 	int (*init)(struct dbs_data *dbs_data);
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 3a1f49f5f4c6..1e2bd9851fba 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -169,7 +169,7 @@ static void od_update(struct cpufreq_policy *policy)
 	}
 }
 
-static unsigned int od_dbs_timer(struct cpufreq_policy *policy)
+static unsigned int od_dbs_update(struct cpufreq_policy *policy)
 {
 	struct policy_dbs_info *policy_dbs = policy->governor_data;
 	struct dbs_data *dbs_data = policy_dbs->dbs_data;
@@ -191,7 +191,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy)
 	od_update(policy);
 
 	if (dbs_info->freq_lo) {
-		/* Setup timer for SUB_SAMPLE */
+		/* Setup SUB_SAMPLE */
 		dbs_info->sample_type = OD_SUB_SAMPLE;
 		return dbs_info->freq_hi_delay_us;
 	}
@@ -255,11 +255,11 @@ static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set,
 	list_for_each_entry(policy_dbs, &attr_set->policy_list, list) {
 		/*
 		 * Doing this without locking might lead to using different
-		 * rate_mult values in od_update() and od_dbs_timer().
+		 * rate_mult values in od_update() and od_dbs_update().
 		 */
-		mutex_lock(&policy_dbs->timer_mutex);
+		mutex_lock(&policy_dbs->update_mutex);
 		policy_dbs->rate_mult = 1;
-		mutex_unlock(&policy_dbs->timer_mutex);
+		mutex_unlock(&policy_dbs->update_mutex);
 	}
 
 	return count;
@@ -374,8 +374,7 @@ static int od_init(struct dbs_data *dbs_data)
 		dbs_data->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD;
 		/*
 		 * In nohz/micro accounting case we set the minimum frequency
-		 * not depending on HZ, but fixed (very low). The deferred
-		 * timer might skip some samples if idle/sleeping as needed.
+		 * not depending on HZ, but fixed (very low).
 		*/
 		dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE;
 	} else {
@@ -415,7 +414,7 @@ static struct od_ops od_ops = {
 static struct dbs_governor od_dbs_gov = {
 	.gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("ondemand"),
 	.kobj_type = { .default_attrs = od_attributes },
-	.gov_dbs_timer = od_dbs_timer,
+	.gov_dbs_update = od_dbs_update,
 	.alloc = od_alloc,
 	.free = od_free,
 	.init = od_init,
-- 
cgit v1.2.3


From ee7930ee27fe5240398cc302fa8eb4454725f188 Mon Sep 17 00:00:00 2001
From: Markus Mayer <mmayer@broadcom.com>
Date: Mon, 7 Nov 2016 10:02:23 -0800
Subject: cpufreq: stats: New sysfs attribute for clearing statistics

Allow CPUfreq statistics to be cleared by writing anything to
/sys/.../cpufreq/stats/reset.

Signed-off-by: Markus Mayer <mmayer@broadcom.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpu-freq/cpufreq-stats.txt |  6 ++++++
 drivers/cpufreq/cpufreq_stats.c          | 22 ++++++++++++++++++++++
 include/linux/cpufreq.h                  |  4 ++++
 3 files changed, 32 insertions(+)

diff --git a/Documentation/cpu-freq/cpufreq-stats.txt b/Documentation/cpu-freq/cpufreq-stats.txt
index 8d9773f23550..3c355f6ad834 100644
--- a/Documentation/cpu-freq/cpufreq-stats.txt
+++ b/Documentation/cpu-freq/cpufreq-stats.txt
@@ -44,11 +44,17 @@ the stats driver insertion.
 total 0
 drwxr-xr-x  2 root root    0 May 14 16:06 .
 drwxr-xr-x  3 root root    0 May 14 15:58 ..
+--w-------  1 root root 4096 May 14 16:06 reset
 -r--r--r--  1 root root 4096 May 14 16:06 time_in_state
 -r--r--r--  1 root root 4096 May 14 16:06 total_trans
 -r--r--r--  1 root root 4096 May 14 16:06 trans_table
 --------------------------------------------------------------------------------
 
+-  reset
+Write-only attribute that can be used to reset the stat counters. This can be
+useful for evaluating system behaviour under different governors without the
+need for a reboot.
+
 -  time_in_state
 This gives the amount of time spent in each of the frequencies supported by
 this CPU. The cat output will have "<frequency> <time>" pair in each line, which
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 06d3abdffd3a..ac284e66839c 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -41,6 +41,18 @@ static int cpufreq_stats_update(struct cpufreq_stats *stats)
 	return 0;
 }
 
+static void cpufreq_stats_clear_table(struct cpufreq_stats *stats)
+{
+	unsigned int count = stats->max_state;
+
+	memset(stats->time_in_state, 0, count * sizeof(u64));
+#ifdef CONFIG_CPU_FREQ_STAT_DETAILS
+	memset(stats->trans_table, 0, count * count * sizeof(int));
+#endif
+	stats->last_time = get_jiffies_64();
+	stats->total_trans = 0;
+}
+
 static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf)
 {
 	return sprintf(buf, "%d\n", policy->stats->total_trans);
@@ -64,6 +76,14 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf)
 	return len;
 }
 
+static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf,
+			   size_t count)
+{
+	/* We don't care what is written to the attribute. */
+	cpufreq_stats_clear_table(policy->stats);
+	return count;
+}
+
 #ifdef CONFIG_CPU_FREQ_STAT_DETAILS
 static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf)
 {
@@ -113,10 +133,12 @@ cpufreq_freq_attr_ro(trans_table);
 
 cpufreq_freq_attr_ro(total_trans);
 cpufreq_freq_attr_ro(time_in_state);
+cpufreq_freq_attr_wo(reset);
 
 static struct attribute *default_attrs[] = {
 	&total_trans.attr,
 	&time_in_state.attr,
+	&reset.attr,
 #ifdef CONFIG_CPU_FREQ_STAT_DETAILS
 	&trans_table.attr,
 #endif
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 32dc0cbd51ca..40dc2e29f480 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -234,6 +234,10 @@ __ATTR(_name, _perm, show_##_name, NULL)
 static struct freq_attr _name =			\
 __ATTR(_name, 0644, show_##_name, store_##_name)
 
+#define cpufreq_freq_attr_wo(_name)		\
+static struct freq_attr _name =			\
+__ATTR(_name, 0200, NULL, store_##_name)
+
 struct global_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct kobject *kobj,
-- 
cgit v1.2.3


From dcd2ea410d8780951420f32d0d49782cc324e1e9 Mon Sep 17 00:00:00 2001
From: Robert Jarzmik <robert.jarzmik@free.fr>
Date: Mon, 31 Oct 2016 20:54:53 +0100
Subject: cpufreq: pxa: use generic platdev driver for device-tree

For device-tree based pxa25x and pxa27x platforms, cpufreq-dt driver is
doing the job as well as pxa2xx-cpufreq, so add these platforms to the
compatibility list.

This won't work for legacy non device-tree platforms where
pxa2xx-cpufreq is still required.

Signed-off-by: Robert Jarzmik <robert.jarzmik@free.fr>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index fba3f61f320c..fcc5bcfe57ef 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -37,6 +37,8 @@ static const struct of_device_id machines[] __initconst = {
 	{ .compatible = "fsl,imx7d", },
 
 	{ .compatible = "marvell,berlin", },
+	{ .compatible = "marvell,pxa250", },
+	{ .compatible = "marvell,pxa270", },
 
 	{ .compatible = "samsung,exynos3250", },
 	{ .compatible = "samsung,exynos4210", },
-- 
cgit v1.2.3


From 62a03defeabd58f74e07ca030d6c21e069d4d88e Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Thu, 20 Oct 2016 16:14:52 +0800
Subject: PM / hibernate: Verify the consistent of e820 memory map by md5
 digest

On some platforms, there is occasional panic triggered when
trying to resume from hibernation, a typical panic looks like:

"BUG: unable to handle kernel paging request at ffff880085894000
IP: [<ffffffff810c5dc2>] load_image_lzo+0x8c2/0xe70"

Investigation carried out by Lee Chun-Yi shows that this is because
e820 map has been changed by BIOS across hibernation, and one
of the page frames from suspend kernel is right located in restore
kernel's unmapped region, so panic comes out when accessing unmapped
kernel address.

In order to expose this issue earlier, the md5 hash of e820 map
is passed from suspend kernel to restore kernel, and the restore
kernel will terminate the resume process once it finds the md5
hash are not the same.

As the format of image header has been modified, the magic number
should also be adjusted as kernels with the same RESTORE_MAGIC have
to use the same header format and interpret all of the fields in
it in the same way.

If the suspend kernel is built without md5 support, and the restore
kernel has md5 support, then the latter will bypass the check process.
Vice versa the restore kernel will bypass the check if it does not
support md5 operation.

Note:
1. Without this patch applied, it is possible that BIOS has
   provided an inconsistent memory map, but the resume kernel is still
   able to restore the image anyway(e.g, E820_RAM region is the superset
   of the previous one), although the system might be unstable. So this
   patch tries to treat any inconsistent e820 as illegal.

2. Another case is, this patch replies on comparing the e820_saved, but
   currently the e820_save might not be strictly the same across
   hibernation, even if BIOS has provided consistent e820 map - In
   theory mptable might modify the BIOS-provided e820_saved dynamically
   in early_reserve_e820_mpc_new, which would allocate a buffer from
   E820_RAM, and marks it from E820_RAM to E820_RESERVED).
   This is a potential and rare case we need to deal with in OS in
   the future.

Suggested-by: Pavel Machek <pavel@ucw.cz>
Suggested-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Reviewed-by: Lee, Chun-Yi <jlee@suse.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/power/hibernate_64.c | 94 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 92 insertions(+), 2 deletions(-)

diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 9634557a5444..ded2e8272382 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -11,6 +11,10 @@
 #include <linux/gfp.h>
 #include <linux/smp.h>
 #include <linux/suspend.h>
+#include <linux/scatterlist.h>
+#include <linux/kdebug.h>
+
+#include <crypto/hash.h>
 
 #include <asm/init.h>
 #include <asm/proto.h>
@@ -177,14 +181,86 @@ int pfn_is_nosave(unsigned long pfn)
 	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
 }
 
+#define MD5_DIGEST_SIZE 16
+
 struct restore_data_record {
 	unsigned long jump_address;
 	unsigned long jump_address_phys;
 	unsigned long cr3;
 	unsigned long magic;
+	u8 e820_digest[MD5_DIGEST_SIZE];
 };
 
-#define RESTORE_MAGIC	0x123456789ABCDEF0UL
+#define RESTORE_MAGIC	0x23456789ABCDEF01UL
+
+#if IS_BUILTIN(CONFIG_CRYPTO_MD5)
+/**
+ * get_e820_md5 - calculate md5 according to given e820 map
+ *
+ * @map: the e820 map to be calculated
+ * @buf: the md5 result to be stored to
+ */
+static int get_e820_md5(struct e820map *map, void *buf)
+{
+	struct scatterlist sg;
+	struct crypto_ahash *tfm;
+	int size;
+	int ret = 0;
+
+	tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm))
+		return -ENOMEM;
+
+	{
+		AHASH_REQUEST_ON_STACK(req, tfm);
+		size = offsetof(struct e820map, map)
+			+ sizeof(struct e820entry) * map->nr_map;
+		ahash_request_set_tfm(req, tfm);
+		sg_init_one(&sg, (u8 *)map, size);
+		ahash_request_set_callback(req, 0, NULL, NULL);
+		ahash_request_set_crypt(req, &sg, buf, size);
+
+		if (crypto_ahash_digest(req))
+			ret = -EINVAL;
+		ahash_request_zero(req);
+	}
+	crypto_free_ahash(tfm);
+
+	return ret;
+}
+
+static void hibernation_e820_save(void *buf)
+{
+	get_e820_md5(e820_saved, buf);
+}
+
+static bool hibernation_e820_mismatch(void *buf)
+{
+	int ret;
+	u8 result[MD5_DIGEST_SIZE];
+
+	memset(result, 0, MD5_DIGEST_SIZE);
+	/* If there is no digest in suspend kernel, let it go. */
+	if (!memcmp(result, buf, MD5_DIGEST_SIZE))
+		return false;
+
+	ret = get_e820_md5(e820_saved, result);
+	if (ret)
+		return true;
+
+	return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false;
+}
+#else
+static void hibernation_e820_save(void *buf)
+{
+}
+
+static bool hibernation_e820_mismatch(void *buf)
+{
+	/* If md5 is not builtin for restore kernel, let it go. */
+	return false;
+}
+#endif
 
 /**
  *	arch_hibernation_header_save - populate the architecture specific part
@@ -201,6 +277,9 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size)
 	rdr->jump_address_phys = __pa_symbol(&restore_registers);
 	rdr->cr3 = restore_cr3;
 	rdr->magic = RESTORE_MAGIC;
+
+	hibernation_e820_save(rdr->e820_digest);
+
 	return 0;
 }
 
@@ -216,5 +295,16 @@ int arch_hibernation_header_restore(void *addr)
 	restore_jump_address = rdr->jump_address;
 	jump_address_phys = rdr->jump_address_phys;
 	restore_cr3 = rdr->cr3;
-	return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL;
+
+	if (rdr->magic != RESTORE_MAGIC) {
+		pr_crit("Unrecognized hibernate image header format!\n");
+		return -EINVAL;
+	}
+
+	if (hibernation_e820_mismatch(rdr->e820_digest)) {
+		pr_crit("Hibernate inconsistent memory map detected!\n");
+		return -ENODEV;
+	}
+
+	return 0;
 }
-- 
cgit v1.2.3


From 7f7a516ee30a931fa2d257b39f5ef7b9196c56fc Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 14 Nov 2016 10:31:11 -0800
Subject: cpufreq: intel_pstate: Use CPU load based algorithm for PM_MOBILE

Use get_target_pstate_use_cpu_load() to calculate target P-State for
devices, with the preferred power management profile in ACPI FADT
set to PM_MOBILE.

This may help in resolving some thermal issues caused by low sustained
cpu bound workloads. The current algorithm tend to over provision in this
case as it doesn't look at the CPU busyness.

Also included the fix from Arnd Bergmann <arnd@arndb.de> to solve compile
issue, when CONFIG_ACPI is not defined.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0837175395b9..f07e591c0e1f 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1771,6 +1771,19 @@ static void __init copy_pid_params(struct pstate_adjust_policy *policy)
 	pid_params.setpoint = policy->setpoint;
 }
 
+#ifdef CONFIG_ACPI
+static void intel_pstate_use_acpi_profile(void)
+{
+	if (acpi_gbl_FADT.preferred_profile == PM_MOBILE)
+		pstate_funcs.get_target_pstate =
+				get_target_pstate_use_cpu_load;
+}
+#else
+static void intel_pstate_use_acpi_profile(void)
+{
+}
+#endif
+
 static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
 {
 	pstate_funcs.get_max   = funcs->get_max;
@@ -1782,6 +1795,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
 	pstate_funcs.get_vid   = funcs->get_vid;
 	pstate_funcs.get_target_pstate = funcs->get_target_pstate;
 
+	intel_pstate_use_acpi_profile();
 }
 
 #ifdef CONFIG_ACPI
-- 
cgit v1.2.3


From c9a81e6864d48f81797397bbd65d299bf20c6148 Mon Sep 17 00:00:00 2001
From: Akshay Adiga <akshay.adiga@linux.vnet.ibm.com>
Date: Mon, 14 Nov 2016 17:29:27 +0530
Subject: cpufreq: powernv: Fix uninitialized lpstate_idx in
 gpstates_timer_handler()

lpstate_idx remains uninitialized in the case when elapsed_time
is greater than MAX_RAMP_DOWN_TIME.  At the end of rampdown the
global pstate should be equal to the local pstate.

Fixes: 20b15b766354 (cpufreq: powernv: Use PMCR to verify global and localpstate)
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Akshay Adiga <akshay.adiga@linux.vnet.ibm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index c82304b7a4c0..c5c5bc35eece 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -624,6 +624,7 @@ void gpstate_timer_handler(unsigned long data)
 
 	if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
 		gpstate_idx = pstate_to_idx(freq_data.pstate_id);
+		lpstate_idx = gpstate_idx;
 		reset_gpstates(policy);
 		gpstates->highest_lpstate_idx = gpstate_idx;
 	} else {
-- 
cgit v1.2.3


From d5f905a93c321929ad8609c2478c1a5cd00aefdf Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 14 Nov 2016 12:30:43 +0530
Subject: cpufreq: conservative: Rename get_freq_target() to get_freq_step()

What's returned from this function is the delta by which the frequency
must be increased or decreased and not the final frequency that should
be selected.

Name it properly to match its purpose. Also update the variables used to
store that value.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_conservative.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index fa5ece3915a1..0681fcff5aae 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -37,16 +37,16 @@ struct cs_dbs_tuners {
 #define DEF_SAMPLING_DOWN_FACTOR		(1)
 #define MAX_SAMPLING_DOWN_FACTOR		(10)
 
-static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners,
-					   struct cpufreq_policy *policy)
+static inline unsigned int get_freq_step(struct cs_dbs_tuners *cs_tuners,
+					 struct cpufreq_policy *policy)
 {
-	unsigned int freq_target = (cs_tuners->freq_step * policy->max) / 100;
+	unsigned int freq_step = (cs_tuners->freq_step * policy->max) / 100;
 
 	/* max freq cannot be less than 100. But who knows... */
-	if (unlikely(freq_target == 0))
-		freq_target = DEF_FREQUENCY_STEP;
+	if (unlikely(freq_step == 0))
+		freq_step = DEF_FREQUENCY_STEP;
 
-	return freq_target;
+	return freq_step;
 }
 
 /*
@@ -90,7 +90,7 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
 		if (requested_freq == policy->max)
 			goto out;
 
-		requested_freq += get_freq_target(cs_tuners, policy);
+		requested_freq += get_freq_step(cs_tuners, policy);
 		if (requested_freq > policy->max)
 			requested_freq = policy->max;
 
@@ -106,16 +106,16 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
 
 	/* Check for frequency decrease */
 	if (load < cs_tuners->down_threshold) {
-		unsigned int freq_target;
+		unsigned int freq_step;
 		/*
 		 * if we cannot reduce the frequency anymore, break out early
 		 */
 		if (requested_freq == policy->min)
 			goto out;
 
-		freq_target = get_freq_target(cs_tuners, policy);
-		if (requested_freq > freq_target)
-			requested_freq -= freq_target;
+		freq_step = get_freq_step(cs_tuners, policy);
+		if (requested_freq > freq_step)
+			requested_freq -= freq_step;
 		else
 			requested_freq = policy->min;
 
-- 
cgit v1.2.3


From 00bfe05889e91b5112893b001e4a47b0a0f8bdd7 Mon Sep 17 00:00:00 2001
From: Stratos Karafotis <stratosk@semaphore.gr>
Date: Wed, 16 Nov 2016 19:26:29 +0200
Subject: cpufreq: conservative: Decrease frequency faster for deferred updates

Conservative governor changes the CPU frequency in steps.
That means that if a CPU runs at max frequency, it will need several
sampling periods to return to min frequency when the workload
is finished.

If the update function that calculates the load and target frequency
is deferred, the governor might need even more time to decrease the
frequency.

This may have impact to power consumption and after all conservative
should decrease the frequency if there is no workload at every sampling
rate.

To resolve the above issue calculate the number of sampling periods
that the update is deferred. Considering that for each sampling period
conservative should drop the frequency by a freq_step because the
CPU was idle apply the proper subtraction to requested frequency.

Below, the kernel trace with and without this patch. First an
intensive workload is applied on a specific CPU. Then the workload
is removed and the CPU goes to idle.

WITHOUT

     <idle>-0     [007] dN..   620.329153: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-556   [007] ....   620.350857: cpu_frequency: state=1700000 cpu_id=7
kworker/7:2-556   [007] ....   620.370856: cpu_frequency: state=1900000 cpu_id=7
kworker/7:2-556   [007] ....   620.390854: cpu_frequency: state=2100000 cpu_id=7
kworker/7:2-556   [007] ....   620.411853: cpu_frequency: state=2200000 cpu_id=7
kworker/7:2-556   [007] ....   620.432854: cpu_frequency: state=2400000 cpu_id=7
kworker/7:2-556   [007] ....   620.453854: cpu_frequency: state=2600000 cpu_id=7
kworker/7:2-556   [007] ....   620.494856: cpu_frequency: state=2900000 cpu_id=7
kworker/7:2-556   [007] ....   620.515856: cpu_frequency: state=3100000 cpu_id=7
kworker/7:2-556   [007] ....   620.536858: cpu_frequency: state=3300000 cpu_id=7
kworker/7:2-556   [007] ....   620.557857: cpu_frequency: state=3401000 cpu_id=7
     <idle>-0     [007] d...   669.591363: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   669.591939: cpu_idle: state=4294967295 cpu_id=7
     <idle>-0     [007] d...   669.591980: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] dN..   669.591989: cpu_idle: state=4294967295 cpu_id=7
...
     <idle>-0     [007] d...   670.201224: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   670.221975: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-556   [007] ....   670.222016: cpu_frequency: state=3300000 cpu_id=7
     <idle>-0     [007] d...   670.222026: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   670.234964: cpu_idle: state=4294967295 cpu_id=7
...
     <idle>-0     [007] d...   670.801251: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   671.236046: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-556   [007] ....   671.236073: cpu_frequency: state=3100000 cpu_id=7
     <idle>-0     [007] d...   671.236112: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   671.393437: cpu_idle: state=4294967295 cpu_id=7
...
     <idle>-0     [007] d...   671.401277: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   671.404083: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-556   [007] ....   671.404111: cpu_frequency: state=2900000 cpu_id=7
     <idle>-0     [007] d...   671.404125: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   671.404974: cpu_idle: state=4294967295 cpu_id=7
...
     <idle>-0     [007] d...   671.501180: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   671.995414: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-556   [007] ....   671.995459: cpu_frequency: state=2800000 cpu_id=7
     <idle>-0     [007] d...   671.995469: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   671.996287: cpu_idle: state=4294967295 cpu_id=7
...
     <idle>-0     [007] d...   672.001305: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   672.078374: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-556   [007] ....   672.078410: cpu_frequency: state=2600000 cpu_id=7
     <idle>-0     [007] d...   672.078419: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   672.158020: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-556   [007] ....   672.158040: cpu_frequency: state=2400000 cpu_id=7
     <idle>-0     [007] d...   672.158044: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   672.160038: cpu_idle: state=4294967295 cpu_id=7
...
     <idle>-0     [007] d...   672.234557: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   672.237121: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-556   [007] ....   672.237174: cpu_frequency: state=2100000 cpu_id=7
     <idle>-0     [007] d...   672.237186: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   672.237778: cpu_idle: state=4294967295 cpu_id=7
...
     <idle>-0     [007] d...   672.267902: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   672.269860: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-556   [007] ....   672.269906: cpu_frequency: state=1900000 cpu_id=7
     <idle>-0     [007] d...   672.269914: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   672.271902: cpu_idle: state=4294967295 cpu_id=7
...
     <idle>-0     [007] d...   672.751342: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...   672.823056: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-556   [007] ....   672.823095: cpu_frequency: state=1600000 cpu_id=7

WITH

     <idle>-0     [007] dN..  4380.928009: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-399   [007] ....  4380.949767: cpu_frequency: state=2000000 cpu_id=7
kworker/7:2-399   [007] ....  4380.969765: cpu_frequency: state=2200000 cpu_id=7
kworker/7:2-399   [007] ....  4381.009766: cpu_frequency: state=2500000 cpu_id=7
kworker/7:2-399   [007] ....  4381.029767: cpu_frequency: state=2600000 cpu_id=7
kworker/7:2-399   [007] ....  4381.049769: cpu_frequency: state=2800000 cpu_id=7
kworker/7:2-399   [007] ....  4381.069769: cpu_frequency: state=3000000 cpu_id=7
kworker/7:2-399   [007] ....  4381.089771: cpu_frequency: state=3100000 cpu_id=7
kworker/7:2-399   [007] ....  4381.109772: cpu_frequency: state=3400000 cpu_id=7
kworker/7:2-399   [007] ....  4381.129773: cpu_frequency: state=3401000 cpu_id=7
     <idle>-0     [007] d...  4428.226159: cpu_idle: state=1 cpu_id=7
     <idle>-0     [007] d...  4428.226176: cpu_idle: state=4294967295 cpu_id=7
     <idle>-0     [007] d...  4428.226181: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...  4428.227177: cpu_idle: state=4294967295 cpu_id=7
...
     <idle>-0     [007] d...  4428.551640: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...  4428.649239: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-399   [007] ....  4428.649268: cpu_frequency: state=2800000 cpu_id=7
     <idle>-0     [007] d...  4428.649278: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...  4428.689856: cpu_idle: state=4294967295 cpu_id=7
...
     <idle>-0     [007] d...  4428.799542: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...  4428.801683: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-399   [007] ....  4428.801748: cpu_frequency: state=1700000 cpu_id=7
     <idle>-0     [007] d...  4428.801761: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...  4428.806545: cpu_idle: state=4294967295 cpu_id=7
...
     <idle>-0     [007] d...  4429.051880: cpu_idle: state=4 cpu_id=7
     <idle>-0     [007] d...  4429.086240: cpu_idle: state=4294967295 cpu_id=7
kworker/7:2-399   [007] ....  4429.086293: cpu_frequency: state=1600000 cpu_id=7

Without the patch the CPU dropped to min frequency after 3.2s
With the patch applied the CPU dropped to min frequency after 0.86s

Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_conservative.c | 22 +++++++++++++++++++---
 drivers/cpufreq/cpufreq_governor.c     | 12 +++++++++++-
 drivers/cpufreq/cpufreq_governor.h     |  1 +
 3 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 0681fcff5aae..a48b724768af 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -66,6 +66,7 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
 	struct dbs_data *dbs_data = policy_dbs->dbs_data;
 	struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
 	unsigned int load = dbs_update(policy);
+	unsigned int freq_step;
 
 	/*
 	 * break out if we 'cannot' reduce the speed as the user might
@@ -82,6 +83,23 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
 	if (requested_freq > policy->max || requested_freq < policy->min)
 		requested_freq = policy->cur;
 
+	freq_step = get_freq_step(cs_tuners, policy);
+
+	/*
+	 * Decrease requested_freq one freq_step for each idle period that
+	 * we didn't update the frequency.
+	 */
+	if (policy_dbs->idle_periods < UINT_MAX) {
+		unsigned int freq_steps = policy_dbs->idle_periods * freq_step;
+
+		if (requested_freq > freq_steps)
+			requested_freq -= freq_steps;
+		else
+			requested_freq = policy->min;
+
+		policy_dbs->idle_periods = UINT_MAX;
+	}
+
 	/* Check for frequency increase */
 	if (load > dbs_data->up_threshold) {
 		dbs_info->down_skip = 0;
@@ -90,7 +108,7 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
 		if (requested_freq == policy->max)
 			goto out;
 
-		requested_freq += get_freq_step(cs_tuners, policy);
+		requested_freq += freq_step;
 		if (requested_freq > policy->max)
 			requested_freq = policy->max;
 
@@ -106,14 +124,12 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
 
 	/* Check for frequency decrease */
 	if (load < cs_tuners->down_threshold) {
-		unsigned int freq_step;
 		/*
 		 * if we cannot reduce the frequency anymore, break out early
 		 */
 		if (requested_freq == policy->min)
 			goto out;
 
-		freq_step = get_freq_step(cs_tuners, policy);
 		if (requested_freq > freq_step)
 			requested_freq -= freq_step;
 		else
diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c
index 372947416b75..0196467280bd 100644
--- a/drivers/cpufreq/cpufreq_governor.c
+++ b/drivers/cpufreq/cpufreq_governor.c
@@ -117,7 +117,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
 	struct policy_dbs_info *policy_dbs = policy->governor_data;
 	struct dbs_data *dbs_data = policy_dbs->dbs_data;
 	unsigned int ignore_nice = dbs_data->ignore_nice_load;
-	unsigned int max_load = 0;
+	unsigned int max_load = 0, idle_periods = UINT_MAX;
 	unsigned int sampling_rate, io_busy, j;
 
 	/*
@@ -215,9 +215,19 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
 			j_cdbs->prev_load = load;
 		}
 
+		if (time_elapsed > 2 * sampling_rate) {
+			unsigned int periods = time_elapsed / sampling_rate;
+
+			if (periods < idle_periods)
+				idle_periods = periods;
+		}
+
 		if (load > max_load)
 			max_load = load;
 	}
+
+	policy_dbs->idle_periods = idle_periods;
+
 	return max_load;
 }
 EXPORT_SYMBOL_GPL(dbs_update);
diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index 9660cc6b4b39..f5717ca070cc 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -97,6 +97,7 @@ struct policy_dbs_info {
 	struct list_head list;
 	/* Multiplier for increasing sample delay temporarily. */
 	unsigned int rate_mult;
+	unsigned int idle_periods;	/* For conservative */
 	/* Status indicators */
 	bool is_shared;		/* This object is used by multiple CPUs */
 	bool work_in_progress;	/* Work is being queued up or in progress */
-- 
cgit v1.2.3


From 42d951c851f1d08bb12430a7e2e9f4657f7c395c Mon Sep 17 00:00:00 2001
From: Stratos Karafotis <stratosk@semaphore.gr>
Date: Wed, 16 Nov 2016 21:27:22 +0200
Subject: cpufreq: conservative: Fix comment explaining frequency updates

The original comment about the frequency increase to maximum is wrong.

Both increase and decrease happen at steps.

Signed-off-by: Stratos Karafotis <stratosk@semaphore.gr>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_conservative.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index a48b724768af..992f7c20760f 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -55,8 +55,8 @@ static inline unsigned int get_freq_step(struct cs_dbs_tuners *cs_tuners,
  * sampling_down_factor, we check, if current idle time is more than 80%
  * (default), then we try to decrease frequency
  *
- * Any frequency increase takes it to the maximum frequency. Frequency reduction
- * happens at minimum steps of 5% (default) of maximum frequency
+ * Frequency updates happen at minimum steps of 5% (default) of maximum
+ * frequency
  */
 static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
 {
-- 
cgit v1.2.3


From 8e2ddb03643eb9d0bc4926946d7ce0d308eef0a5 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 15 Nov 2016 13:53:20 +0530
Subject: cpufreq: schedutil: Avoid indented labels

Switch to the more common practice of writing labels.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 69e06898997d..8c4e1652e895 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -454,17 +454,17 @@ static int sugov_init(struct cpufreq_policy *policy)
 	if (ret)
 		goto fail;
 
- out:
+out:
 	mutex_unlock(&global_tunables_lock);
 
 	cpufreq_enable_fast_switch(policy);
 	return 0;
 
- fail:
+fail:
 	policy->governor_data = NULL;
 	sugov_tunables_free(tunables);
 
- free_sg_policy:
+free_sg_policy:
 	mutex_unlock(&global_tunables_lock);
 
 	sugov_policy_free(sg_policy);
-- 
cgit v1.2.3


From 4a71ce4348bb61740d411822357061f8bf870f4c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 15 Nov 2016 13:53:21 +0530
Subject: cpufreq: schedutil: enable fast switch earlier

The fast_switch_enabled flag will be used by both sugov_policy_alloc()
and sugov_policy_free() with a later patch.

Prepare for that by moving the calls to enable and disable it to the
beginning of sugov_init() and end of sugov_exit().

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 8c4e1652e895..68f21bb6bd44 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -416,9 +416,13 @@ static int sugov_init(struct cpufreq_policy *policy)
 	if (policy->governor_data)
 		return -EBUSY;
 
+	cpufreq_enable_fast_switch(policy);
+
 	sg_policy = sugov_policy_alloc(policy);
-	if (!sg_policy)
-		return -ENOMEM;
+	if (!sg_policy) {
+		ret = -ENOMEM;
+		goto disable_fast_switch;
+	}
 
 	mutex_lock(&global_tunables_lock);
 
@@ -456,8 +460,6 @@ static int sugov_init(struct cpufreq_policy *policy)
 
 out:
 	mutex_unlock(&global_tunables_lock);
-
-	cpufreq_enable_fast_switch(policy);
 	return 0;
 
 fail:
@@ -468,6 +470,10 @@ free_sg_policy:
 	mutex_unlock(&global_tunables_lock);
 
 	sugov_policy_free(sg_policy);
+
+disable_fast_switch:
+	cpufreq_disable_fast_switch(policy);
+
 	pr_err("initialization failed (error %d)\n", ret);
 	return ret;
 }
@@ -478,8 +484,6 @@ static void sugov_exit(struct cpufreq_policy *policy)
 	struct sugov_tunables *tunables = sg_policy->tunables;
 	unsigned int count;
 
-	cpufreq_disable_fast_switch(policy);
-
 	mutex_lock(&global_tunables_lock);
 
 	count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
@@ -490,6 +494,7 @@ static void sugov_exit(struct cpufreq_policy *policy)
 	mutex_unlock(&global_tunables_lock);
 
 	sugov_policy_free(sg_policy);
+	cpufreq_disable_fast_switch(policy);
 }
 
 static int sugov_start(struct cpufreq_policy *policy)
-- 
cgit v1.2.3


From 02a7b1ee3baa15a98b541d8cfd156bbe1a091c20 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 15 Nov 2016 13:53:22 +0530
Subject: cpufreq: schedutil: move slow path from workqueue to SCHED_FIFO task

If slow path frequency changes are conducted in a SCHED_OTHER context
then they may be delayed for some amount of time, including
indefinitely, when real time or deadline activity is taking place.

Move the slow path to a real time kernel thread. In the future the
thread should be made SCHED_DEADLINE. The RT priority is arbitrarily set
to 50 for now.

Hackbench results on ARM Exynos, dual core A15 platform for 10
iterations:

$ hackbench -s 100 -l 100 -g 10 -f 20

Before			After
---------------------------------
1.808			1.603
1.847			1.251
2.229			1.590
1.952			1.600
1.947			1.257
1.925			1.627
2.694			1.620
1.258			1.621
1.919			1.632
1.250			1.240

Average:

1.8829			1.5041

Based on initial work by Steve Muckle.

Signed-off-by: Steve Muckle <smuckle.linux@gmail.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 85 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 78 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 68f21bb6bd44..f165ba0f0766 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -12,11 +12,14 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/cpufreq.h>
+#include <linux/kthread.h>
 #include <linux/slab.h>
 #include <trace/events/power.h>
 
 #include "sched.h"
 
+#define SUGOV_KTHREAD_PRIORITY	50
+
 struct sugov_tunables {
 	struct gov_attr_set attr_set;
 	unsigned int rate_limit_us;
@@ -35,8 +38,10 @@ struct sugov_policy {
 
 	/* The next fields are only needed if fast switch cannot be used. */
 	struct irq_work irq_work;
-	struct work_struct work;
+	struct kthread_work work;
 	struct mutex work_lock;
+	struct kthread_worker worker;
+	struct task_struct *thread;
 	bool work_in_progress;
 
 	bool need_freq_update;
@@ -291,7 +296,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
 	raw_spin_unlock(&sg_policy->update_lock);
 }
 
-static void sugov_work(struct work_struct *work)
+static void sugov_work(struct kthread_work *work)
 {
 	struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
 
@@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work)
 	struct sugov_policy *sg_policy;
 
 	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
-	schedule_work_on(smp_processor_id(), &sg_policy->work);
+
+	/*
+	 * For Real Time and Deadline tasks, schedutil governor shoots the
+	 * frequency to maximum. And special care must be taken to ensure that
+	 * this kthread doesn't result in that.
+	 *
+	 * This is (mostly) guaranteed by the work_in_progress flag. The flag is
+	 * updated only at the end of the sugov_work() and before that schedutil
+	 * rejects all other frequency scaling requests.
+	 *
+	 * Though there is a very rare case where the RT thread yields right
+	 * after the work_in_progress flag is cleared. The effects of that are
+	 * neglected for now.
+	 */
+	kthread_queue_work(&sg_policy->worker, &sg_policy->work);
 }
 
 /************************** sysfs interface ************************/
@@ -372,7 +391,6 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
 
 	sg_policy->policy = policy;
 	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
-	INIT_WORK(&sg_policy->work, sugov_work);
 	mutex_init(&sg_policy->work_lock);
 	raw_spin_lock_init(&sg_policy->update_lock);
 	return sg_policy;
@@ -384,6 +402,51 @@ static void sugov_policy_free(struct sugov_policy *sg_policy)
 	kfree(sg_policy);
 }
 
+static int sugov_kthread_create(struct sugov_policy *sg_policy)
+{
+	struct task_struct *thread;
+	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+	struct cpufreq_policy *policy = sg_policy->policy;
+	int ret;
+
+	/* kthread only required for slow path */
+	if (policy->fast_switch_enabled)
+		return 0;
+
+	kthread_init_work(&sg_policy->work, sugov_work);
+	kthread_init_worker(&sg_policy->worker);
+	thread = kthread_create(kthread_worker_fn, &sg_policy->worker,
+				"sugov:%d",
+				cpumask_first(policy->related_cpus));
+	if (IS_ERR(thread)) {
+		pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread));
+		return PTR_ERR(thread);
+	}
+
+	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+	if (ret) {
+		kthread_stop(thread);
+		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		return ret;
+	}
+
+	sg_policy->thread = thread;
+	kthread_bind_mask(thread, policy->related_cpus);
+	wake_up_process(thread);
+
+	return 0;
+}
+
+static void sugov_kthread_stop(struct sugov_policy *sg_policy)
+{
+	/* kthread only required for slow path */
+	if (sg_policy->policy->fast_switch_enabled)
+		return;
+
+	kthread_flush_worker(&sg_policy->worker);
+	kthread_stop(sg_policy->thread);
+}
+
 static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
 {
 	struct sugov_tunables *tunables;
@@ -424,12 +487,16 @@ static int sugov_init(struct cpufreq_policy *policy)
 		goto disable_fast_switch;
 	}
 
+	ret = sugov_kthread_create(sg_policy);
+	if (ret)
+		goto free_sg_policy;
+
 	mutex_lock(&global_tunables_lock);
 
 	if (global_tunables) {
 		if (WARN_ON(have_governor_per_policy())) {
 			ret = -EINVAL;
-			goto free_sg_policy;
+			goto stop_kthread;
 		}
 		policy->governor_data = sg_policy;
 		sg_policy->tunables = global_tunables;
@@ -441,7 +508,7 @@ static int sugov_init(struct cpufreq_policy *policy)
 	tunables = sugov_tunables_alloc(sg_policy);
 	if (!tunables) {
 		ret = -ENOMEM;
-		goto free_sg_policy;
+		goto stop_kthread;
 	}
 
 	tunables->rate_limit_us = LATENCY_MULTIPLIER;
@@ -466,6 +533,9 @@ fail:
 	policy->governor_data = NULL;
 	sugov_tunables_free(tunables);
 
+stop_kthread:
+	sugov_kthread_stop(sg_policy);
+
 free_sg_policy:
 	mutex_unlock(&global_tunables_lock);
 
@@ -493,6 +563,7 @@ static void sugov_exit(struct cpufreq_policy *policy)
 
 	mutex_unlock(&global_tunables_lock);
 
+	sugov_kthread_stop(sg_policy);
 	sugov_policy_free(sg_policy);
 	cpufreq_disable_fast_switch(policy);
 }
@@ -541,7 +612,7 @@ static void sugov_stop(struct cpufreq_policy *policy)
 	synchronize_sched();
 
 	irq_work_sync(&sg_policy->irq_work);
-	cancel_work_sync(&sg_policy->work);
+	kthread_cancel_work_sync(&sg_policy->work);
 }
 
 static void sugov_limits(struct cpufreq_policy *policy)
-- 
cgit v1.2.3


From 21ef57297b15a49b0c4dd4e7135c1a08e9a29a1c Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Tue, 15 Nov 2016 13:53:23 +0530
Subject: cpufreq: schedutil: irq-work and mutex are only used in slow path

Execute the irq-work specific initialization/exit code only when the
fast path isn't available.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index f165ba0f0766..42a220e78f00 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -390,15 +390,12 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
 		return NULL;
 
 	sg_policy->policy = policy;
-	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
-	mutex_init(&sg_policy->work_lock);
 	raw_spin_lock_init(&sg_policy->update_lock);
 	return sg_policy;
 }
 
 static void sugov_policy_free(struct sugov_policy *sg_policy)
 {
-	mutex_destroy(&sg_policy->work_lock);
 	kfree(sg_policy);
 }
 
@@ -432,6 +429,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
 
 	sg_policy->thread = thread;
 	kthread_bind_mask(thread, policy->related_cpus);
+	init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+	mutex_init(&sg_policy->work_lock);
+
 	wake_up_process(thread);
 
 	return 0;
@@ -445,6 +445,7 @@ static void sugov_kthread_stop(struct sugov_policy *sg_policy)
 
 	kthread_flush_worker(&sg_policy->worker);
 	kthread_stop(sg_policy->thread);
+	mutex_destroy(&sg_policy->work_lock);
 }
 
 static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
@@ -611,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy)
 
 	synchronize_sched();
 
-	irq_work_sync(&sg_policy->irq_work);
-	kthread_cancel_work_sync(&sg_policy->work);
+	if (!policy->fast_switch_enabled) {
+		irq_work_sync(&sg_policy->irq_work);
+		kthread_cancel_work_sync(&sg_policy->work);
+	}
 }
 
 static void sugov_limits(struct cpufreq_policy *policy)
-- 
cgit v1.2.3


From 8a10c06a20ec8097a68fd7a4a1c0e285095b4d2f Mon Sep 17 00:00:00 2001
From: Denis Kirjanov <kda@linux-powerpc.org>
Date: Tue, 8 Nov 2016 05:39:28 -0500
Subject: cpufreq: powernv: Disable preemption while checking CPU throttling
 state

With preemption turned on we can read incorrect throttling state
while being switched to CPU on a different chip.

 BUG: using smp_processor_id() in preemptible [00000000] code: cat/7343
 caller is .powernv_cpufreq_throttle_check+0x2c/0x710
 CPU: 13 PID: 7343 Comm: cat Not tainted 4.8.0-rc5-dirty #1
 Call Trace:
 [c0000007d25b75b0] [c000000000971378] .dump_stack+0xe4/0x150 (unreliable)
 [c0000007d25b7640] [c0000000005162e4] .check_preemption_disabled+0x134/0x150
 [c0000007d25b76e0] [c0000000007b63ac] .powernv_cpufreq_throttle_check+0x2c/0x710
 [c0000007d25b7790] [c0000000007b6d18] .powernv_cpufreq_target_index+0x288/0x360
 [c0000007d25b7870] [c0000000007acee4] .__cpufreq_driver_target+0x394/0x8c0
 [c0000007d25b7920] [c0000000007b22ac] .cpufreq_set+0x7c/0xd0
 [c0000007d25b79b0] [c0000000007adf50] .store_scaling_setspeed+0x80/0xc0
 [c0000007d25b7a40] [c0000000007ae270] .store+0xa0/0x100
 [c0000007d25b7ae0] [c0000000003566e8] .sysfs_kf_write+0x88/0xb0
 [c0000007d25b7b70] [c0000000003553b8] .kernfs_fop_write+0x178/0x260
 [c0000007d25b7c10] [c0000000002ac3cc] .__vfs_write+0x3c/0x1c0
 [c0000007d25b7cf0] [c0000000002ad584] .vfs_write+0xc4/0x230
 [c0000007d25b7d90] [c0000000002aeef8] .SyS_write+0x58/0x100
 [c0000007d25b7e30] [c00000000000bfec] system_call+0x38/0xfc

Fixes: 09a972d16209 (cpufreq: powernv: Report cpu frequency throttling)
Reviewed-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Denis Kirjanov <kda@linux-powerpc.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/powernv-cpufreq.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c
index c5c5bc35eece..37671b545880 100644
--- a/drivers/cpufreq/powernv-cpufreq.c
+++ b/drivers/cpufreq/powernv-cpufreq.c
@@ -664,8 +664,14 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
 	if (unlikely(rebooting) && new_index != get_nominal_index())
 		return 0;
 
-	if (!throttled)
+	if (!throttled) {
+		/* we don't want to be preempted while
+		 * checking if the CPU frequency has been throttled
+		 */
+		preempt_disable();
 		powernv_cpufreq_throttle_check(NULL);
+		preempt_enable();
+	}
 
 	cur_msec = jiffies_to_msecs(get_jiffies_64());
 
-- 
cgit v1.2.3


From f0da898b464953157911913cd93eaedcb2c92407 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 16 Nov 2016 11:05:51 +0100
Subject: cpufreq: dt: Add support for r8a7743 and r8a7745

Add the compatible strings for supporting the generic cpufreq driver on
the Renesas RZ/G1M (r8a7743) and RZ/G1E (r8a7745) SoCs.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Simon Horman <horms+renesas@verge.net.au>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index fcc5bcfe57ef..0c73c7dbd587 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -55,6 +55,8 @@ static const struct of_device_id machines[] __initconst = {
 	{ .compatible = "renesas,r7s72100", },
 	{ .compatible = "renesas,r8a73a4", },
 	{ .compatible = "renesas,r8a7740", },
+	{ .compatible = "renesas,r8a7743", },
+	{ .compatible = "renesas,r8a7745", },
 	{ .compatible = "renesas,r8a7778", },
 	{ .compatible = "renesas,r8a7779", },
 	{ .compatible = "renesas,r8a7790", },
-- 
cgit v1.2.3


From 2f3f1a261c0f4827bda86009f0059aefbe30ed11 Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javier@osg.samsung.com>
Date: Wed, 19 Oct 2016 18:06:24 -0300
Subject: PM / devfreq: rk3399_dmc: Fix module autoload

If the driver is built as a module, autoload won't work because the module
alias information is not filled. So user-space can't match the registered
device with the corresponding module.

Export the module alias information using the MODULE_DEVICE_TABLE() macro.

Before this patch:

$ modinfo drivers/devfreq/rk3399_dmc.ko | grep alias
$

After this patch:

$ modinfo drivers/devfreq/rk3399_dmc.ko | grep alias
alias:          of:N*T*Crockchip,rk3399-dmcC*
alias:          of:N*T*Crockchip,rk3399-dmc

Signed-off-by: Javier Martinez Canillas <javier@osg.samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/rk3399_dmc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c
index e24b73d66659..77bd9d02e870 100644
--- a/drivers/devfreq/rk3399_dmc.c
+++ b/drivers/devfreq/rk3399_dmc.c
@@ -454,6 +454,7 @@ static const struct of_device_id rk3399dmc_devfreq_of_match[] = {
 	{ .compatible = "rockchip,rk3399-dmc" },
 	{ },
 };
+MODULE_DEVICE_TABLE(of, rk3399dmc_devfreq_of_match);
 
 static struct platform_driver rk3399_dmcfreq_driver = {
 	.probe	= rk3399_dmcfreq_probe,
-- 
cgit v1.2.3


From ca5c3b216fcbbfb1cfa30c82267c9ee5c8763069 Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javier@osg.samsung.com>
Date: Wed, 19 Oct 2016 18:06:25 -0300
Subject: PM / devfreq: exynos-nocp: Fix module autoload

If the driver is built as a module, autoload won't work because the module
alias information is not filled. So user-space can't match the registered
device with the corresponding module.

Export the module alias information using the MODULE_DEVICE_TABLE() macro.

Before this patch:

$ modinfo drivers/devfreq/event/exynos-nocp.ko | grep alias
$

After this patch:

$ modinfo drivers/devfreq/event/exynos-nocp.ko | grep alias
alias:          of:N*T*Csamsung,exynos5420-nocpC*
alias:          of:N*T*Csamsung,exynos5420-nocp

Signed-off-by: Javier Martinez Canillas <javier@osg.samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/event/exynos-nocp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/devfreq/event/exynos-nocp.c b/drivers/devfreq/event/exynos-nocp.c
index 49e712aca0c1..5c3e7b11e8a6 100644
--- a/drivers/devfreq/event/exynos-nocp.c
+++ b/drivers/devfreq/event/exynos-nocp.c
@@ -190,6 +190,7 @@ static const struct of_device_id exynos_nocp_id_match[] = {
 	{ .compatible = "samsung,exynos5420-nocp", },
 	{ /* sentinel */ },
 };
+MODULE_DEVICE_TABLE(of, exynos_nocp_id_match);
 
 static struct regmap_config exynos_nocp_regmap_config = {
 	.reg_bits = 32,
-- 
cgit v1.2.3


From dfd7c845ba981af4994b50c11d58fd230ca15a32 Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javier@osg.samsung.com>
Date: Wed, 19 Oct 2016 18:06:26 -0300
Subject: PM / devfreq: rockchip-dfi: Fix module autoload

If the driver is built as a module, autoload won't work because the module
alias information is not filled. So user-space can't match the registered
device with the corresponding module.

Export the module alias information using the MODULE_DEVICE_TABLE() macro.

Before this patch:

$ modinfo drivers/devfreq/event/rockchip-dfi.ko | grep alias
$

After this patch:

$ modinfo drivers/devfreq/event/rockchip-dfi.ko | grep alias
alias:          of:N*T*Crockchip,rk3399-dfiC*
alias:          of:N*T*Crockchip,rk3399-dfi

Signed-off-by: Javier Martinez Canillas <javier@osg.samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/event/rockchip-dfi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/devfreq/event/rockchip-dfi.c b/drivers/devfreq/event/rockchip-dfi.c
index 43fcc5a7f515..22b113363ffc 100644
--- a/drivers/devfreq/event/rockchip-dfi.c
+++ b/drivers/devfreq/event/rockchip-dfi.c
@@ -188,6 +188,7 @@ static const struct of_device_id rockchip_dfi_id_match[] = {
 	{ .compatible = "rockchip,rk3399-dfi" },
 	{ },
 };
+MODULE_DEVICE_TABLE(of, rockchip_dfi_id_match);
 
 static int rockchip_dfi_probe(struct platform_device *pdev)
 {
-- 
cgit v1.2.3


From 29e477f23598b850b08bae22d142bd3abf491248 Mon Sep 17 00:00:00 2001
From: Javier Martinez Canillas <javier@osg.samsung.com>
Date: Wed, 19 Oct 2016 18:06:27 -0300
Subject: PM / devfreq: exynos-ppmu: Fix module autoload

If the driver is built as a module, autoload won't work because the module
alias information is not filled. So user-space can't match the registered
device with the corresponding module.

Export the module alias information using the MODULE_DEVICE_TABLE() macro.

Before this patch:

$ modinfo drivers/devfreq/event/exynos-ppmu.ko | grep alias
$

After this patch:

$ modinfo drivers/devfreq/event/exynos-ppmu.ko | grep alias
alias:          of:N*T*Csamsung,exynos-ppmu-v2C*
alias:          of:N*T*Csamsung,exynos-ppmu-v2
alias:          of:N*T*Csamsung,exynos-ppmuC*
alias:          of:N*T*Csamsung,exynos-ppmu

Signed-off-by: Javier Martinez Canillas <javier@osg.samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/event/exynos-ppmu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c
index f55cf0eb2a66..a112034997eb 100644
--- a/drivers/devfreq/event/exynos-ppmu.c
+++ b/drivers/devfreq/event/exynos-ppmu.c
@@ -351,6 +351,7 @@ static const struct of_device_id exynos_ppmu_id_match[] = {
 	},
 	{ /* sentinel */ },
 };
+MODULE_DEVICE_TABLE(of, exynos_ppmu_id_match);
 
 static struct devfreq_event_ops *exynos_bus_get_ops(struct device_node *np)
 {
-- 
cgit v1.2.3


From f8dbe363ba01de9bebb023f499e7bf9c31f028ab Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@ingics.com>
Date: Fri, 21 Oct 2016 09:09:08 +0800
Subject: PM / devfreq: exynos-ppmu: ppmu_events array should not be NULL
 terminated

The rest of the code uses ARRAY_SIZE to count the number of entries in
ppmu_events array. The NULL terminated entry makes ARRAY_SIZE return
off-by-one value.

Signed-off-by: Axel Lin <axel.lin@ingics.com>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/event/exynos-ppmu.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c
index a112034997eb..958285d5d64b 100644
--- a/drivers/devfreq/event/exynos-ppmu.c
+++ b/drivers/devfreq/event/exynos-ppmu.c
@@ -90,8 +90,6 @@ struct __exynos_ppmu_events {
 	PPMU_EVENT(d1-cpu),
 	PPMU_EVENT(d1-general),
 	PPMU_EVENT(d1-rt),
-
-	{ /* sentinel */ },
 };
 
 static int exynos_ppmu_find_ppmu_id(struct devfreq_event_dev *edev)
-- 
cgit v1.2.3


From 6bbda2d4f83f7bfeb20fcf67670c47193d6ee6b0 Mon Sep 17 00:00:00 2001
From: Axel Lin <axel.lin@ingics.com>
Date: Fri, 21 Oct 2016 09:09:09 +0800
Subject: PM / devfreq: exynos-ppmu: Remove unused mutex from struct
 exynos_ppmu

The mutex is not used at all, remove it.

Signed-off-by: Axel Lin <axel.lin@ingics.com>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/event/exynos-ppmu.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c
index 958285d5d64b..107eb91a9415 100644
--- a/drivers/devfreq/event/exynos-ppmu.c
+++ b/drivers/devfreq/event/exynos-ppmu.c
@@ -15,7 +15,6 @@
 #include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/mutex.h>
 #include <linux/of_address.h>
 #include <linux/platform_device.h>
 #include <linux/suspend.h>
@@ -34,7 +33,6 @@ struct exynos_ppmu {
 	unsigned int num_events;
 
 	struct device *dev;
-	struct mutex lock;
 
 	struct exynos_ppmu_data ppmu;
 };
@@ -462,7 +460,6 @@ static int exynos_ppmu_probe(struct platform_device *pdev)
 	if (!info)
 		return -ENOMEM;
 
-	mutex_init(&info->lock);
 	info->dev = &pdev->dev;
 
 	/* Parse dt data to get resource */
-- 
cgit v1.2.3


From bafeb42bd80fb269d5cc396d28165818aae5994c Mon Sep 17 00:00:00 2001
From: MyungJoo Ham <myungjoo.ham@samsung.com>
Date: Wed, 9 Nov 2016 10:29:14 +0900
Subject: PM / devfreq: correct comment typo.

The function name in the comment was incorrect.

Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
---
 drivers/devfreq/devfreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index bf3ea7603a58..a324801d6a66 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -850,7 +850,7 @@ err_out:
 EXPORT_SYMBOL(devfreq_add_governor);
 
 /**
- * devfreq_remove_device() - Remove devfreq feature from a device.
+ * devfreq_remove_governor() - Remove devfreq feature from a device.
  * @governor:	the devfreq governor to be removed
  */
 int devfreq_remove_governor(struct devfreq_governor *governor)
-- 
cgit v1.2.3


From 927b75a628b1d80fef171420fbd694fc28d9b8e9 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Tue, 8 Nov 2016 18:13:27 +0900
Subject: PM / devfreq: rk3399_dmc: Use the resource-managed function to add
 devfreq dev

This patch uses the resource-managed to add the devfreq device.
This function will make it easy to handle the devfreq device.

- struct devfreq *devm_devfreq_add_device(struct device *dev,
				  struct devfreq_dev_profile *profile,
				  const char *governor_name,
				  void *data);

Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: MyungJoo Ham <myungjoo.ham@samsung.com>
---
 drivers/devfreq/rk3399_dmc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c
index 77bd9d02e870..5063ac1a5939 100644
--- a/drivers/devfreq/rk3399_dmc.c
+++ b/drivers/devfreq/rk3399_dmc.c
@@ -436,7 +436,7 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev)
 
 	rk3399_devfreq_dmc_profile.initial_freq = data->rate;
 
-	data->devfreq = devfreq_add_device(dev,
+	data->devfreq = devm_devfreq_add_device(dev,
 					   &rk3399_devfreq_dmc_profile,
 					   "simple_ondemand",
 					   &data->ondemand_data);
-- 
cgit v1.2.3


From d0ea59e188941417a9fb5898d894b3106a8ad313 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 17 Nov 2016 22:47:47 +0100
Subject: cpufreq: intel_pstate: Request P-states control from SMM if needed

Currently, intel_pstate is unable to control P-states on my
IvyBridge-based Acer Aspire S5, because they are controlled by SMM
on that machine by default and it is necessary to request OS control
of P-states from it via the SMI Command register exposed in the ACPI
FADT.  intel_pstate doesn't do that now, but acpi-cpufreq and other
cpufreq drivers for x86 platforms do.

Address this problem by making intel_pstate use the ACPI-defined
mechanism as well.  However, intel_pstate is not modular and it
doesn't need the module refcount tricks played by
acpi_processor_notify_smm(), so export the core of this function
to it as acpi_processor_pstate_control() and make it call that.
[The changes in processor_perflib.c related to this should not
make any functional difference for the acpi_processor_notify_smm()
users].

To be safe, only call acpi_processor_notify_smm() from intel_pstate
if ACPI _PPC support is enabled in it.

Suggested-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
---
 drivers/acpi/processor_perflib.c | 45 +++++++++++++++++++++++++---------------
 drivers/cpufreq/intel_pstate.c   | 13 ++++++++++++
 include/acpi/processor.h         |  1 +
 3 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index bb01dea39fdc..9c67faa034b5 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -465,11 +465,33 @@ int acpi_processor_get_performance_info(struct acpi_processor *pr)
 	return result;
 }
 EXPORT_SYMBOL_GPL(acpi_processor_get_performance_info);
-int acpi_processor_notify_smm(struct module *calling_module)
+
+int acpi_processor_pstate_control(void)
 {
 	acpi_status status;
-	static int is_done = 0;
 
+	if (!acpi_gbl_FADT.smi_command || !acpi_gbl_FADT.pstate_control)
+		return 0;
+
+	ACPI_DEBUG_PRINT((ACPI_DB_INFO,
+			  "Writing pstate_control [0x%x] to smi_command [0x%x]\n",
+			  acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
+
+	status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
+				    (u32)acpi_gbl_FADT.pstate_control, 8);
+	if (ACPI_SUCCESS(status))
+		return 1;
+
+	ACPI_EXCEPTION((AE_INFO, status,
+			"Failed to write pstate_control [0x%x] to smi_command [0x%x]",
+			acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
+	return -EIO;
+}
+
+int acpi_processor_notify_smm(struct module *calling_module)
+{
+	static int is_done = 0;
+	int result;
 
 	if (!(acpi_processor_ppc_status & PPC_REGISTERED))
 		return -EBUSY;
@@ -492,26 +514,15 @@ int acpi_processor_notify_smm(struct module *calling_module)
 
 	is_done = -EIO;
 
-	/* Can't write pstate_control to smi_command if either value is zero */
-	if ((!acpi_gbl_FADT.smi_command) || (!acpi_gbl_FADT.pstate_control)) {
+	result = acpi_processor_pstate_control();
+	if (!result) {
 		ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No SMI port or pstate_control\n"));
 		module_put(calling_module);
 		return 0;
 	}
-
-	ACPI_DEBUG_PRINT((ACPI_DB_INFO,
-			  "Writing pstate_control [0x%x] to smi_command [0x%x]\n",
-			  acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command));
-
-	status = acpi_os_write_port(acpi_gbl_FADT.smi_command,
-				    (u32) acpi_gbl_FADT.pstate_control, 8);
-	if (ACPI_FAILURE(status)) {
-		ACPI_EXCEPTION((AE_INFO, status,
-				"Failed to write pstate_control [0x%x] to "
-				"smi_command [0x%x]", acpi_gbl_FADT.pstate_control,
-				acpi_gbl_FADT.smi_command));
+	if (result < 0) {
 		module_put(calling_module);
-		return status;
+		return result;
 	}
 
 	/* Success. If there's no _PPC, we need to fear nothing, so
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index f07e591c0e1f..ec1664bf6ef0 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1909,9 +1909,20 @@ static bool __init intel_pstate_platform_pwr_mgmt_exists(void)
 
 	return false;
 }
+
+static void intel_pstate_request_control_from_smm(void)
+{
+	/*
+	 * It may be unsafe to request P-states control from SMM if _PPC support
+	 * has not been enabled.
+	 */
+	if (acpi_ppc)
+		acpi_processor_pstate_control();
+}
 #else /* CONFIG_ACPI not enabled */
 static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; }
 static inline bool intel_pstate_has_acpi_ppc(void) { return false; }
+static inline void intel_pstate_request_control_from_smm(void) {}
 #endif /* CONFIG_ACPI */
 
 static const struct x86_cpu_id hwp_support_ids[] __initconst = {
@@ -1963,6 +1974,8 @@ hwp_cpu_matched:
 	if (!hwp_active && hwp_only)
 		goto out;
 
+	intel_pstate_request_control_from_smm();
+
 	rc = cpufreq_register_driver(&intel_pstate_driver);
 	if (rc)
 		goto out;
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index f3db11c24654..dd0cb04046ce 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -249,6 +249,7 @@ extern int acpi_processor_register_performance(struct acpi_processor_performance
 					       *performance, unsigned int cpu);
 extern void acpi_processor_unregister_performance(unsigned int cpu);
 
+int acpi_processor_pstate_control(void);
 /* note: this locks both the calling module and the processor module
          if a _PPC object exists, rmmod is disallowed then */
 int acpi_processor_notify_smm(struct module *calling_module);
-- 
cgit v1.2.3


From 1d9174fbc55ec99ccbfcafa3de2528ef78a849aa Mon Sep 17 00:00:00 2001
From: Ulf Hansson <ulf.hansson@linaro.org>
Date: Thu, 13 Oct 2016 16:58:54 +0200
Subject: PM / Runtime: Defer resuming of the device in
 pm_runtime_force_resume()

When the pm_runtime_force_suspend|resume() helpers were invented, we still
had CONFIG_PM_RUNTIME and CONFIG_PM_SLEEP as separate Kconfig options.

To make sure these helpers worked for all combinations and without
introducing too much of complexity, the device was always resumed in
pm_runtime_force_resume().

More precisely, when CONFIG_PM_SLEEP was set and CONFIG_PM_RUNTIME was
unset, we needed to resume the device as the subsystem/driver couldn't
rely on using runtime PM to do it.

As the CONFIG_PM_RUNTIME option was merged into CONFIG_PM a while ago, it
removed this combination, of using CONFIG_PM_SLEEP without the earlier
CONFIG_PM_RUNTIME.

For this reason we can now rely on the subsystem/driver to use runtime PM
to resume the device, instead of forcing that to be done in all cases. In
other words, let's defer the runtime resume to a later point when it's
actually needed.

Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/runtime.c | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 60ebb04d8140..f0d863089345 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1489,6 +1489,16 @@ int pm_runtime_force_suspend(struct device *dev)
 	if (ret)
 		goto err;
 
+	/*
+	 * Increase the runtime PM usage count for the device's parent, in case
+	 * when we find the device being used when system suspend was invoked.
+	 * This informs pm_runtime_force_resume() to resume the parent
+	 * immediately, which is needed to be able to resume its children,
+	 * when not deferring the resume to be managed via runtime PM.
+	 */
+	if (dev->parent && atomic_read(&dev->power.usage_count) > 1)
+		pm_runtime_get_noresume(dev->parent);
+
 	pm_runtime_set_suspended(dev);
 	return 0;
 err:
@@ -1498,16 +1508,20 @@ err:
 EXPORT_SYMBOL_GPL(pm_runtime_force_suspend);
 
 /**
- * pm_runtime_force_resume - Force a device into resume state.
+ * pm_runtime_force_resume - Force a device into resume state if needed.
  * @dev: Device to resume.
  *
  * Prior invoking this function we expect the user to have brought the device
  * into low power state by a call to pm_runtime_force_suspend(). Here we reverse
- * those actions and brings the device into full power. We update the runtime PM
- * status and re-enables runtime PM.
+ * those actions and brings the device into full power, if it is expected to be
+ * used on system resume. To distinguish that, we check whether the runtime PM
+ * usage count is greater than 1 (the PM core increases the usage count in the
+ * system PM prepare phase), as that indicates a real user (such as a subsystem,
+ * driver, userspace, etc.) is using it. If that is the case, the device is
+ * expected to be used on system resume as well, so then we resume it. In the
+ * other case, we defer the resume to be managed via runtime PM.
  *
- * Typically this function may be invoked from a system resume callback to make
- * sure the device is put into full power state.
+ * Typically this function may be invoked from a system resume callback.
  */
 int pm_runtime_force_resume(struct device *dev)
 {
@@ -1524,6 +1538,17 @@ int pm_runtime_force_resume(struct device *dev)
 	if (!pm_runtime_status_suspended(dev))
 		goto out;
 
+	/*
+	 * Decrease the parent's runtime PM usage count, if we increased it
+	 * during system suspend in pm_runtime_force_suspend().
+	*/
+	if (atomic_read(&dev->power.usage_count) > 1) {
+		if (dev->parent)
+			pm_runtime_put_noidle(dev->parent);
+	} else {
+		goto out;
+	}
+
 	ret = pm_runtime_set_active(dev);
 	if (ret)
 		goto out;
-- 
cgit v1.2.3


From 001c76f05b01cc8ceb2098c9ff5de2609bec7f76 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 17 Nov 2016 23:34:17 +0100
Subject: cpufreq: intel_pstate: Generic governors support

There may be reasons to use generic cpufreq governors (eg. schedutil)
on Intel platforms instead of the intel_pstate driver's internal
governor.  However, that currently can only be done by disabling
intel_pstate altogether and using the acpi-cpufreq driver instead
of it, which is subject to limitations.

First of all, acpi-cpufreq only works on systems where the _PSS
object is present in the ACPI tables for all logical CPUs.  Second,
on those systems acpi-cpufreq will only use frequencies listed by
_PSS which may be suboptimal.  In particular, by convention, the
whole turbo range is represented in _PSS as a single P-state and
the frequency assigned to it is greater by 1 MHz than the greatest
non-turbo frequency listed by _PSS.  That may confuse governors to
use turbo frequencies less frequently which may lead to suboptimal
performance.

For this reason, make it possible to use the intel_pstate driver
with generic cpufreq governors as a "normal" cpufreq driver.  That
mode is enforced by adding intel_pstate=passive to the kernel
command line and cannot be disabled at run time.  In that mode,
intel_pstate provides a cpufreq driver interface including
the ->target() and ->fast_switch() callbacks and is listed in
scaling_driver as "intel_cpufreq".

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Doug Smythies <dsmythies@telus.net>
---
 Documentation/kernel-parameters.txt |   6 ++
 drivers/cpufreq/intel_pstate.c      | 194 +++++++++++++++++++++++++++++++-----
 2 files changed, 176 insertions(+), 24 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fa2336ec1f56..dfdd38ec149b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1760,6 +1760,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 		       disable
 		         Do not enable intel_pstate as the default
 		         scaling driver for the supported processors
+		       passive
+			 Use intel_pstate as a scaling driver, but configure it
+			 to work with generic cpufreq governors (instead of
+			 enabling its internal governor).  This mode cannot be
+			 used along with the hardware-managed P-states (HWP)
+			 feature.
 		       force
 			 Enable intel_pstate on systems that prohibit it by default
 			 in favor of acpi-cpufreq. Forcing the intel_pstate driver
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index ec1664bf6ef0..0d82bf320838 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -37,6 +37,8 @@
 #include <asm/cpufeature.h>
 #include <asm/intel-family.h>
 
+#define INTEL_CPUFREQ_TRANSITION_LATENCY	20000
+
 #define ATOM_RATIOS		0x66a
 #define ATOM_VIDS		0x66b
 #define ATOM_TURBO_RATIOS	0x66c
@@ -122,6 +124,8 @@ struct sample {
  * @scaling:		Scaling factor to  convert frequency to cpufreq
  *			frequency units
  * @turbo_pstate:	Max Turbo P state possible for this platform
+ * @max_freq:		@max_pstate frequency in cpufreq units
+ * @turbo_freq:		@turbo_pstate frequency in cpufreq units
  *
  * Stores the per cpu model P state limits and current P state.
  */
@@ -132,6 +136,8 @@ struct pstate_data {
 	int	max_pstate_physical;
 	int	scaling;
 	int	turbo_pstate;
+	unsigned int max_freq;
+	unsigned int turbo_freq;
 };
 
 /**
@@ -470,7 +476,7 @@ static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
 {
 }
 
-static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
+static inline int intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
 {
 }
 #endif
@@ -1225,6 +1231,8 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu)
 	cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical();
 	cpu->pstate.turbo_pstate = pstate_funcs.get_turbo();
 	cpu->pstate.scaling = pstate_funcs.get_scaling();
+	cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling;
+	cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling;
 
 	if (pstate_funcs.get_vid)
 		pstate_funcs.get_vid(cpu);
@@ -1363,15 +1371,19 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu)
 	return cpu->pstate.current_pstate - pid_calc(&cpu->pid, perf_scaled);
 }
 
-static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
+static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate)
 {
 	int max_perf, min_perf;
 
-	update_turbo_state();
-
 	intel_pstate_get_min_max(cpu, &min_perf, &max_perf);
 	pstate = clamp_t(int, pstate, min_perf, max_perf);
 	trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu);
+	return pstate;
+}
+
+static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate)
+{
+	pstate = intel_pstate_prepare_request(cpu, pstate);
 	if (pstate == cpu->pstate.current_pstate)
 		return;
 
@@ -1389,6 +1401,8 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu)
 	target_pstate = cpu->policy == CPUFREQ_POLICY_PERFORMANCE ?
 		cpu->pstate.turbo_pstate : pstate_funcs.get_target_pstate(cpu);
 
+	update_turbo_state();
+
 	intel_pstate_update_pstate(cpu, target_pstate);
 
 	sample = &cpu->sample;
@@ -1670,22 +1684,30 @@ static int intel_pstate_verify_policy(struct cpufreq_policy *policy)
 	return 0;
 }
 
+static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy)
+{
+	intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]);
+}
+
 static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
 {
-	int cpu_num = policy->cpu;
-	struct cpudata *cpu = all_cpu_data[cpu_num];
+	pr_debug("CPU %d exiting\n", policy->cpu);
 
-	pr_debug("CPU %d exiting\n", cpu_num);
+	intel_pstate_clear_update_util_hook(policy->cpu);
+	if (!hwp_active)
+		intel_cpufreq_stop_cpu(policy);
+}
 
-	intel_pstate_clear_update_util_hook(cpu_num);
+static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
+{
+	intel_pstate_exit_perf_limits(policy);
 
-	if (hwp_active)
-		return;
+	policy->fast_switch_possible = false;
 
-	intel_pstate_set_min_pstate(cpu);
+	return 0;
 }
 
-static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
+static int __intel_pstate_cpu_init(struct cpufreq_policy *policy)
 {
 	struct cpudata *cpu;
 	int rc;
@@ -1696,11 +1718,6 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
 
 	cpu = all_cpu_data[policy->cpu];
 
-	if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100)
-		policy->policy = CPUFREQ_POLICY_PERFORMANCE;
-	else
-		policy->policy = CPUFREQ_POLICY_POWERSAVE;
-
 	/*
 	 * We need sane value in the cpu->perf_limits, so inherit from global
 	 * perf_limits limits, which are seeded with values based on the
@@ -1720,20 +1737,30 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
 	policy->cpuinfo.max_freq *= cpu->pstate.scaling;
 
 	intel_pstate_init_acpi_perf_limits(policy);
-	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
 	cpumask_set_cpu(policy->cpu, policy->cpus);
 
+	policy->fast_switch_possible = true;
+
 	return 0;
 }
 
-static int intel_pstate_cpu_exit(struct cpufreq_policy *policy)
+static int intel_pstate_cpu_init(struct cpufreq_policy *policy)
 {
-	intel_pstate_exit_perf_limits(policy);
+	int ret = __intel_pstate_cpu_init(policy);
+
+	if (ret)
+		return ret;
+
+	policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
+	if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100)
+		policy->policy = CPUFREQ_POLICY_PERFORMANCE;
+	else
+		policy->policy = CPUFREQ_POLICY_POWERSAVE;
 
 	return 0;
 }
 
-static struct cpufreq_driver intel_pstate_driver = {
+static struct cpufreq_driver intel_pstate = {
 	.flags		= CPUFREQ_CONST_LOOPS,
 	.verify		= intel_pstate_verify_policy,
 	.setpolicy	= intel_pstate_set_policy,
@@ -1745,6 +1772,118 @@ static struct cpufreq_driver intel_pstate_driver = {
 	.name		= "intel_pstate",
 };
 
+static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy)
+{
+	struct cpudata *cpu = all_cpu_data[policy->cpu];
+	struct perf_limits *perf_limits = limits;
+
+	update_turbo_state();
+	policy->cpuinfo.max_freq = limits->turbo_disabled ?
+			cpu->pstate.max_freq : cpu->pstate.turbo_freq;
+
+	cpufreq_verify_within_cpu_limits(policy);
+
+	if (per_cpu_limits)
+		perf_limits = cpu->perf_limits;
+
+	intel_pstate_update_perf_limits(policy, perf_limits);
+
+	return 0;
+}
+
+static unsigned int intel_cpufreq_turbo_update(struct cpudata *cpu,
+					       struct cpufreq_policy *policy,
+					       unsigned int target_freq)
+{
+	unsigned int max_freq;
+
+	update_turbo_state();
+
+	max_freq = limits->no_turbo || limits->turbo_disabled ?
+			cpu->pstate.max_freq : cpu->pstate.turbo_freq;
+	policy->cpuinfo.max_freq = max_freq;
+	if (policy->max > max_freq)
+		policy->max = max_freq;
+
+	if (target_freq > max_freq)
+		target_freq = max_freq;
+
+	return target_freq;
+}
+
+static int intel_cpufreq_target(struct cpufreq_policy *policy,
+				unsigned int target_freq,
+				unsigned int relation)
+{
+	struct cpudata *cpu = all_cpu_data[policy->cpu];
+	struct cpufreq_freqs freqs;
+	int target_pstate;
+
+	freqs.old = policy->cur;
+	freqs.new = intel_cpufreq_turbo_update(cpu, policy, target_freq);
+
+	cpufreq_freq_transition_begin(policy, &freqs);
+	switch (relation) {
+	case CPUFREQ_RELATION_L:
+		target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling);
+		break;
+	case CPUFREQ_RELATION_H:
+		target_pstate = freqs.new / cpu->pstate.scaling;
+		break;
+	default:
+		target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling);
+		break;
+	}
+	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
+	if (target_pstate != cpu->pstate.current_pstate) {
+		cpu->pstate.current_pstate = target_pstate;
+		wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL,
+			      pstate_funcs.get_val(cpu, target_pstate));
+	}
+	cpufreq_freq_transition_end(policy, &freqs, false);
+
+	return 0;
+}
+
+static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
+					      unsigned int target_freq)
+{
+	struct cpudata *cpu = all_cpu_data[policy->cpu];
+	int target_pstate;
+
+	target_freq = intel_cpufreq_turbo_update(cpu, policy, target_freq);
+	target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling);
+	intel_pstate_update_pstate(cpu, target_pstate);
+	return target_freq;
+}
+
+static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
+{
+	int ret = __intel_pstate_cpu_init(policy);
+
+	if (ret)
+		return ret;
+
+	policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY;
+	/* This reflects the intel_pstate_get_cpu_pstates() setting. */
+	policy->cur = policy->cpuinfo.min_freq;
+
+	return 0;
+}
+
+static struct cpufreq_driver intel_cpufreq = {
+	.flags		= CPUFREQ_CONST_LOOPS,
+	.verify		= intel_cpufreq_verify_policy,
+	.target		= intel_cpufreq_target,
+	.fast_switch	= intel_cpufreq_fast_switch,
+	.init		= intel_cpufreq_cpu_init,
+	.exit		= intel_pstate_cpu_exit,
+	.stop_cpu	= intel_cpufreq_stop_cpu,
+	.name		= "intel_cpufreq",
+};
+
+static struct cpufreq_driver *intel_pstate_driver = &intel_pstate;
+
 static int no_load __initdata;
 static int no_hwp __initdata;
 static int hwp_only __initdata;
@@ -1976,7 +2115,7 @@ hwp_cpu_matched:
 
 	intel_pstate_request_control_from_smm();
 
-	rc = cpufreq_register_driver(&intel_pstate_driver);
+	rc = cpufreq_register_driver(intel_pstate_driver);
 	if (rc)
 		goto out;
 
@@ -1991,7 +2130,9 @@ out:
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		if (all_cpu_data[cpu]) {
-			intel_pstate_clear_update_util_hook(cpu);
+			if (intel_pstate_driver == &intel_pstate)
+				intel_pstate_clear_update_util_hook(cpu);
+
 			kfree(all_cpu_data[cpu]);
 		}
 	}
@@ -2007,8 +2148,13 @@ static int __init intel_pstate_setup(char *str)
 	if (!str)
 		return -EINVAL;
 
-	if (!strcmp(str, "disable"))
+	if (!strcmp(str, "disable")) {
 		no_load = 1;
+	} else if (!strcmp(str, "passive")) {
+		pr_info("Passive mode enabled\n");
+		intel_pstate_driver = &intel_cpufreq;
+		no_hwp = 1;
+	}
 	if (!strcmp(str, "no_hwp")) {
 		pr_info("HWP disabled\n");
 		no_hwp = 1;
-- 
cgit v1.2.3


From 182e36af0663a1050c42d234a5bf36f084f8c28b Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 18 Nov 2016 13:40:45 +0100
Subject: cpufreq: Avoid using inactive policies

There are two places in the cpufreq core in which low-level driver
callbacks may be invoked for an inactive cpufreq policy, which isn't
guaranteed to work in general.  Both are due to possible races with
CPU offline.

First, in cpufreq_get(), the policy may become inactive after
the check against policy->cpus in cpufreq_cpu_get() and before
policy->rwsem is acquired, in which case using it going forward may
not be correct.

Second, an analogous situation is possible in cpufreq_update_policy().

Avoid using inactive policies by adding policy_is_inactive() checks
to the code in the above places.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 6e6c1fb60fbc..ad3b319486bd 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1526,7 +1526,10 @@ unsigned int cpufreq_get(unsigned int cpu)
 
 	if (policy) {
 		down_read(&policy->rwsem);
-		ret_freq = __cpufreq_get(policy);
+
+		if (!policy_is_inactive(policy))
+			ret_freq = __cpufreq_get(policy);
+
 		up_read(&policy->rwsem);
 
 		cpufreq_cpu_put(policy);
@@ -2265,6 +2268,11 @@ int cpufreq_update_policy(unsigned int cpu)
 
 	down_write(&policy->rwsem);
 
+	if (policy_is_inactive(policy)) {
+		ret = -ENODEV;
+		goto unlock;
+	}
+
 	pr_debug("updating policy for CPU %u\n", cpu);
 	memcpy(&new_policy, policy, sizeof(*policy));
 	new_policy.min = policy->user_policy.min;
-- 
cgit v1.2.3


From bca5f557dcea50a32ba789c4b4a438af7f49a61f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 18 Nov 2016 13:57:54 +0100
Subject: ACPI / processor: Make acpi_processor_ppc_has_changed() void

The return value of acpi_processor_ppc_has_changed() is never used,
so make it void.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/acpi/processor_perflib.c | 10 ++++------
 include/acpi/processor.h         |  2 +-
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 9c67faa034b5..f0b4a981b8d3 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -157,7 +157,7 @@ static void acpi_processor_ppc_ost(acpi_handle handle, int status)
 				  status, NULL);
 }
 
-int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
+void acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
 {
 	int ret;
 
@@ -168,7 +168,7 @@ int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
 		 */
 		if (event_flag)
 			acpi_processor_ppc_ost(pr->handle, 1);
-		return 0;
+		return;
 	}
 
 	ret = acpi_processor_get_platform_limit(pr);
@@ -182,10 +182,8 @@ int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag)
 		else
 			acpi_processor_ppc_ost(pr->handle, 0);
 	}
-	if (ret < 0)
-		return (ret);
-	else
-		return cpufreq_update_policy(pr->id);
+	if (ret >= 0)
+		cpufreq_update_policy(pr->id);
 }
 
 int acpi_processor_get_bios_limit(int cpu, unsigned int *limit)
diff --git a/include/acpi/processor.h b/include/acpi/processor.h
index dd0cb04046ce..c1ba00fc4888 100644
--- a/include/acpi/processor.h
+++ b/include/acpi/processor.h
@@ -295,7 +295,7 @@ static inline void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx
 #ifdef CONFIG_CPU_FREQ
 void acpi_processor_ppc_init(void);
 void acpi_processor_ppc_exit(void);
-int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag);
+void acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag);
 extern int acpi_processor_get_bios_limit(int cpu, unsigned int *limit);
 #else
 static inline void acpi_processor_ppc_init(void)
-- 
cgit v1.2.3


From 30248feff5e5c6a01ade5e6126009e296ed8bd35 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Fri, 18 Nov 2016 13:59:21 +0100
Subject: cpufreq: Make cpufreq_update_policy() void

The return value of cpufreq_update_policy() is never used, so make
it void.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq.c | 21 +++++++--------------
 include/linux/cpufreq.h   |  2 +-
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index ad3b319486bd..cc475eff90b3 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2257,21 +2257,18 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
  *	Useful for policy notifiers which have different necessities
  *	at different times.
  */
-int cpufreq_update_policy(unsigned int cpu)
+void cpufreq_update_policy(unsigned int cpu)
 {
 	struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);
 	struct cpufreq_policy new_policy;
-	int ret;
 
 	if (!policy)
-		return -ENODEV;
+		return;
 
 	down_write(&policy->rwsem);
 
-	if (policy_is_inactive(policy)) {
-		ret = -ENODEV;
+	if (policy_is_inactive(policy))
 		goto unlock;
-	}
 
 	pr_debug("updating policy for CPU %u\n", cpu);
 	memcpy(&new_policy, policy, sizeof(*policy));
@@ -2283,24 +2280,20 @@ int cpufreq_update_policy(unsigned int cpu)
 	 * -> ask driver for current freq and notify governors about a change
 	 */
 	if (cpufreq_driver->get && !cpufreq_driver->setpolicy) {
-		if (cpufreq_suspended) {
-			ret = -EAGAIN;
+		if (cpufreq_suspended)
 			goto unlock;
-		}
+
 		new_policy.cur = cpufreq_update_current_freq(policy);
-		if (WARN_ON(!new_policy.cur)) {
-			ret = -EIO;
+		if (WARN_ON(!new_policy.cur))
 			goto unlock;
-		}
 	}
 
-	ret = cpufreq_set_policy(policy, &new_policy);
+	cpufreq_set_policy(policy, &new_policy);
 
 unlock:
 	up_write(&policy->rwsem);
 
 	cpufreq_cpu_put(policy);
-	return ret;
 }
 EXPORT_SYMBOL(cpufreq_update_policy);
 
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 40dc2e29f480..7e05c5e4e45c 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -175,7 +175,7 @@ void disable_cpufreq(void);
 
 u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy);
 int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
-int cpufreq_update_policy(unsigned int cpu);
+void cpufreq_update_policy(unsigned int cpu);
 bool have_governor_per_policy(void);
 struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
 void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
-- 
cgit v1.2.3


From 406e79385f3223d82272cf2be86bc95cd000a258 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Mon, 21 Nov 2016 22:45:40 +0100
Subject: PM / sleep: System sleep state selection interface rework

There are systems in which the platform doesn't support any special
sleep states, so suspend-to-idle (PM_SUSPEND_FREEZE) is the only
available system sleep state.  However, some user space frameworks
only use the "mem" and (sometimes) "standby" sleep state labels, so
the users of those systems need to modify user space in order to be
able to use system suspend at all and that may be a pain in practice.

Commit 0399d4db3edf (PM / sleep: Introduce command line argument for
sleep state enumeration) attempted to address this problem by adding
a command line argument to change the meaning of the "mem" string in
/sys/power/state to make it trigger suspend-to-idle (instead of
suspend-to-RAM).

However, there also are systems in which the platform does support
special sleep states, but suspend-to-idle is the preferred one anyway
(it even may save more energy than the platform-provided sleep states
in some cases) and the above commit doesn't help in those cases.

For this reason, rework the system sleep state selection interface
again (but preserve backwards compatibiliby).  Namely, add a new
sysfs file, /sys/power/mem_sleep, that will control the system
suspend mode triggered by writing "mem" to /sys/power/state (in
analogy with what /sys/power/disk does for hibernation).  Make it
select suspend-to-RAM ("deep" sleep) by default (if supported) and
fall back to suspend-to-idle ("s2idle") otherwise and add a new
command line argument, mem_sleep_default, allowing that default to
be overridden if need be.

At the same time, drop the relative_sleep_states command line
argument that doesn't make sense any more.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Mario Limonciello <mario.limonciello@dell.com>
---
 Documentation/ABI/testing/sysfs-power | 45 ++++++++++--------
 Documentation/kernel-parameters.txt   | 13 +++---
 Documentation/power/states.txt        | 60 +++++++++++++++---------
 kernel/power/main.c                   | 88 +++++++++++++++++++++++++++++++++--
 kernel/power/power.h                  |  6 ++-
 kernel/power/suspend.c                | 69 ++++++++++++++++-----------
 6 files changed, 201 insertions(+), 80 deletions(-)

diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power
index 50b368d490b5..f523e5a3ac33 100644
--- a/Documentation/ABI/testing/sysfs-power
+++ b/Documentation/ABI/testing/sysfs-power
@@ -7,30 +7,35 @@ Description:
 		subsystem.
 
 What:		/sys/power/state
-Date:		May 2014
+Date:		November 2016
 Contact:	Rafael J. Wysocki <rjw@rjwysocki.net>
 Description:
 		The /sys/power/state file controls system sleep states.
 		Reading from this file returns the available sleep state
-		labels, which may be "mem", "standby", "freeze" and "disk"
-		(hibernation).  The meanings of the first three labels depend on
-		the relative_sleep_states command line argument as follows:
-		 1) relative_sleep_states = 1
-		    "mem", "standby", "freeze" represent non-hibernation sleep
-		    states from the deepest ("mem", always present) to the
-		    shallowest ("freeze").  "standby" and "freeze" may or may
-		    not be present depending on the capabilities of the
-		    platform.  "freeze" can only be present if "standby" is
-		    present.
-		 2) relative_sleep_states = 0 (default)
-		    "mem" - "suspend-to-RAM", present if supported.
-		    "standby" - "power-on suspend", present if supported.
-		    "freeze" - "suspend-to-idle", always present.
-
-		Writing to this file one of these strings causes the system to
-		transition into the corresponding state, if available.  See
-		Documentation/power/states.txt for a description of what
-		"suspend-to-RAM", "power-on suspend" and "suspend-to-idle" mean.
+		labels, which may be "mem" (suspend), "standby" (power-on
+		suspend), "freeze" (suspend-to-idle) and "disk" (hibernation).
+
+		Writing one of the above strings to this file causes the system
+		to transition into the corresponding state, if available.
+
+		See Documentation/power/states.txt for more information.
+
+What:		/sys/power/mem_sleep
+Date:		November 2016
+Contact:	Rafael J. Wysocki <rjw@rjwysocki.net>
+Description:
+		The /sys/power/mem_sleep file controls the operating mode of
+		system suspend.  Reading from it returns the available modes
+		as "s2idle" (always present), "shallow" and "deep" (present if
+		supported).  The mode that will be used on subsequent attempts
+		to suspend the system (by writing "mem" to the /sys/power/state
+		file described above) is enclosed in square brackets.
+
+		Writing one of the above strings to this file causes the mode
+		represented by it to be used on subsequent attempts to suspend
+		the system.
+
+		See Documentation/power/states.txt for more information.
 
 What:		/sys/power/disk
 Date:		September 2006
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 37babf91f2cb..4131e169f97a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2325,6 +2325,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			memory contents and reserves bad memory
 			regions that are detected.
 
+	mem_sleep_default=	[SUSPEND] Default system suspend mode:
+			s2idle  - Suspend-To-Idle
+			shallow - Power-On Suspend or equivalent (if supported)
+			deep    - Suspend-To-RAM or equivalent (if supported)
+			See Documentation/power/states.txt.
+
 	meye.*=		[HW] Set MotionEye Camera parameters
 			See Documentation/video4linux/meye.txt.
 
@@ -3668,13 +3674,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			[KNL, SMP] Set scheduler's default relax_domain_level.
 			See Documentation/cgroup-v1/cpusets.txt.
 
-	relative_sleep_states=
-			[SUSPEND] Use sleep state labeling where the deepest
-			state available other than hibernation is always "mem".
-			Format: { "0" | "1" }
-			0 -- Traditional sleep state labels.
-			1 -- Relative sleep state labels.
-
 	reserve=	[KNL,BUGS] Force the kernel to ignore some iomem area
 
 	reservetop=	[X86-32]
diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt
index 50f3ef9177c1..008ecb588317 100644
--- a/Documentation/power/states.txt
+++ b/Documentation/power/states.txt
@@ -8,25 +8,41 @@ for each state.
 
 The states are represented by strings that can be read or written to the
 /sys/power/state file.  Those strings may be "mem", "standby", "freeze" and
-"disk", where the last one always represents hibernation (Suspend-To-Disk) and
-the meaning of the remaining ones depends on the relative_sleep_states command
-line argument.
-
-For relative_sleep_states=1, the strings "mem", "standby" and "freeze" label the
-available non-hibernation sleep states from the deepest to the shallowest,
-respectively.  In that case, "mem" is always present in /sys/power/state,
-because there is at least one non-hibernation sleep state in every system.  If
-the given system supports two non-hibernation sleep states, "standby" is present
-in /sys/power/state in addition to "mem".  If the system supports three
-non-hibernation sleep states, "freeze" will be present in /sys/power/state in
-addition to "mem" and "standby".
-
-For relative_sleep_states=0, which is the default, the following descriptions
-apply.
-
-state:		Suspend-To-Idle
+"disk", where the last three always represent Power-On Suspend (if supported),
+Suspend-To-Idle and hibernation (Suspend-To-Disk), respectively.
+
+The meaning of the "mem" string is controlled by the /sys/power/mem_sleep file.
+It contains strings representing the available modes of system suspend that may
+be triggered by writing "mem" to /sys/power/state.  These modes are "s2idle"
+(Suspend-To-Idle), "shallow" (Power-On Suspend) and "deep" (Suspend-To-RAM).
+The "s2idle" mode is always available, while the other ones are only available
+if supported by the platform (if not supported, the strings representing them
+are not present in /sys/power/mem_sleep).  The string representing the suspend
+mode to be used subsequently is enclosed in square brackets.  Writing one of
+the other strings present in /sys/power/mem_sleep to it causes the suspend mode
+to be used subsequently to change to the one represented by that string.
+
+Consequently, there are two ways to cause the system to go into the
+Suspend-To-Idle sleep state.  The first one is to write "freeze" directly to
+/sys/power/state.  The second one is to write "s2idle" to /sys/power/mem_sleep
+and then to wrtie "mem" to /sys/power/state.  Similarly, there are two ways
+to cause the system to go into the Power-On Suspend sleep state (the strings to
+write to the control files in that case are "standby" or "shallow" and "mem",
+respectively) if that state is supported by the platform.  In turn, there is
+only one way to cause the system to go into the Suspend-To-RAM state (write
+"deep" into /sys/power/mem_sleep and "mem" into /sys/power/state).
+
+The default suspend mode (ie. the one to be used without writing anything into
+/sys/power/mem_sleep) is either "deep" (if Suspend-To-RAM is supported) or
+"s2idle", but it can be overridden by the value of the "mem_sleep_default"
+parameter in the kernel command line.
+
+The properties of all of the sleep states are described below.
+
+
+State:		Suspend-To-Idle
 ACPI state:	S0
-Label:		"freeze"
+Label:		"s2idle" ("freeze")
 
 This state is a generic, pure software, light-weight, system sleep state.
 It allows more energy to be saved relative to runtime idle by freezing user
@@ -35,13 +51,13 @@ lower-power than available at run time), such that the processors can
 spend more time in their idle states.
 
 This state can be used for platforms without Power-On Suspend/Suspend-to-RAM
-support, or it can be used in addition to Suspend-to-RAM (memory sleep)
-to provide reduced resume latency.  It is always supported.
+support, or it can be used in addition to Suspend-to-RAM to provide reduced
+resume latency.  It is always supported.
 
 
 State:		Standby / Power-On Suspend
 ACPI State:	S1
-Label:		"standby"
+Label:		"shallow" ("standby")
 
 This state, if supported, offers moderate, though real, power savings, while
 providing a relatively low-latency transition back to a working system.  No
@@ -58,7 +74,7 @@ state.
 
 State:		Suspend-to-RAM
 ACPI State:	S3
-Label:		"mem"
+Label:		"deep"
 
 This state, if supported, offers significant power savings as everything in the
 system is put into a low-power state, except for memory, which should be placed
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 281a697fd458..d401c21136d1 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -78,6 +78,78 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
 
 power_attr(pm_async);
 
+#ifdef CONFIG_SUSPEND
+static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
+			      char *buf)
+{
+	char *s = buf;
+	suspend_state_t i;
+
+	for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
+		if (mem_sleep_states[i]) {
+			const char *label = mem_sleep_states[i];
+
+			if (mem_sleep_current == i)
+				s += sprintf(s, "[%s] ", label);
+			else
+				s += sprintf(s, "%s ", label);
+		}
+
+	/* Convert the last space to a newline if needed. */
+	if (s != buf)
+		*(s-1) = '\n';
+
+	return (s - buf);
+}
+
+static suspend_state_t decode_suspend_state(const char *buf, size_t n)
+{
+	suspend_state_t state;
+	char *p;
+	int len;
+
+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;
+
+	for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
+		const char *label = mem_sleep_states[state];
+
+		if (label && len == strlen(label) && !strncmp(buf, label, len))
+			return state;
+	}
+
+	return PM_SUSPEND_ON;
+}
+
+static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr,
+			       const char *buf, size_t n)
+{
+	suspend_state_t state;
+	int error;
+
+	error = pm_autosleep_lock();
+	if (error)
+		return error;
+
+	if (pm_autosleep_state() > PM_SUSPEND_ON) {
+		error = -EBUSY;
+		goto out;
+	}
+
+	state = decode_suspend_state(buf, n);
+	if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON)
+		mem_sleep_current = state;
+	else
+		error = -EINVAL;
+
+ out:
+	pm_autosleep_unlock();
+	return error ? error : n;
+}
+
+power_attr(mem_sleep);
+#endif /* CONFIG_SUSPEND */
+
 #ifdef CONFIG_PM_DEBUG
 int pm_test_level = TEST_NONE;
 
@@ -368,12 +440,16 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
 	}
 
 	state = decode_state(buf, n);
-	if (state < PM_SUSPEND_MAX)
+	if (state < PM_SUSPEND_MAX) {
+		if (state == PM_SUSPEND_MEM)
+			state = mem_sleep_current;
+
 		error = pm_suspend(state);
-	else if (state == PM_SUSPEND_MAX)
+	} else if (state == PM_SUSPEND_MAX) {
 		error = hibernate();
-	else
+	} else {
 		error = -EINVAL;
+	}
 
  out:
 	pm_autosleep_unlock();
@@ -485,6 +561,9 @@ static ssize_t autosleep_store(struct kobject *kobj,
 	    && strcmp(buf, "off") && strcmp(buf, "off\n"))
 		return -EINVAL;
 
+	if (state == PM_SUSPEND_MEM)
+		state = mem_sleep_current;
+
 	error = pm_autosleep_set_state(state);
 	return error ? error : n;
 }
@@ -602,6 +681,9 @@ static struct attribute * g[] = {
 #ifdef CONFIG_PM_SLEEP
 	&pm_async_attr.attr,
 	&wakeup_count_attr.attr,
+#ifdef CONFIG_SUSPEND
+	&mem_sleep_attr.attr,
+#endif
 #ifdef CONFIG_PM_AUTOSLEEP
 	&autosleep_attr.attr,
 #endif
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 56d1d0dedf76..1dfa0da827d3 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -189,11 +189,15 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *);
 
 #ifdef CONFIG_SUSPEND
 /* kernel/power/suspend.c */
-extern const char *pm_labels[];
+extern const char * const pm_labels[];
 extern const char *pm_states[];
+extern const char *mem_sleep_states[];
+extern suspend_state_t mem_sleep_current;
 
 extern int suspend_devices_and_enter(suspend_state_t state);
 #else /* !CONFIG_SUSPEND */
+#define mem_sleep_current	PM_SUSPEND_ON
+
 static inline int suspend_devices_and_enter(suspend_state_t state)
 {
 	return -ENOSYS;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6ccb08f57fcb..15e6baef5c73 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -32,8 +32,21 @@
 
 #include "power.h"
 
-const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
+const char * const pm_labels[] = {
+	[PM_SUSPEND_FREEZE] = "freeze",
+	[PM_SUSPEND_STANDBY] = "standby",
+	[PM_SUSPEND_MEM] = "mem",
+};
 const char *pm_states[PM_SUSPEND_MAX];
+static const char * const mem_sleep_labels[] = {
+	[PM_SUSPEND_FREEZE] = "s2idle",
+	[PM_SUSPEND_STANDBY] = "shallow",
+	[PM_SUSPEND_MEM] = "deep",
+};
+const char *mem_sleep_states[PM_SUSPEND_MAX];
+
+suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE;
+static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM;
 
 unsigned int pm_suspend_global_flags;
 EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
@@ -110,30 +123,32 @@ static bool valid_state(suspend_state_t state)
 	return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
 }
 
-/*
- * If this is set, the "mem" label always corresponds to the deepest sleep state
- * available, the "standby" label corresponds to the second deepest sleep state
- * available (if any), and the "freeze" label corresponds to the remaining
- * available sleep state (if there is one).
- */
-static bool relative_states;
-
 void __init pm_states_init(void)
 {
+	/* "mem" and "freeze" are always present in /sys/power/state. */
+	pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM];
+	pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE];
 	/*
-	 * freeze state should be supported even without any suspend_ops,
-	 * initialize pm_states accordingly here
+	 * Suspend-to-idle should be supported even without any suspend_ops,
+	 * initialize mem_sleep_states[] accordingly here.
 	 */
-	pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
+	mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE];
 }
 
-static int __init sleep_states_setup(char *str)
+static int __init mem_sleep_default_setup(char *str)
 {
-	relative_states = !strncmp(str, "1", 1);
+	suspend_state_t state;
+
+	for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++)
+		if (mem_sleep_labels[state] &&
+		    !strcmp(str, mem_sleep_labels[state])) {
+			mem_sleep_default = state;
+			break;
+		}
+
 	return 1;
 }
-
-__setup("relative_sleep_states=", sleep_states_setup);
+__setup("mem_sleep_default=", mem_sleep_default_setup);
 
 /**
  * suspend_set_ops - Set the global suspend method table.
@@ -141,21 +156,21 @@ __setup("relative_sleep_states=", sleep_states_setup);
  */
 void suspend_set_ops(const struct platform_suspend_ops *ops)
 {
-	suspend_state_t i;
-	int j = 0;
-
 	lock_system_sleep();
 
 	suspend_ops = ops;
-	for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
-		if (valid_state(i)) {
-			pm_states[i] = pm_labels[j++];
-		} else if (!relative_states) {
-			pm_states[i] = NULL;
-			j++;
-		}
 
-	pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
+	if (valid_state(PM_SUSPEND_STANDBY)) {
+		mem_sleep_states[PM_SUSPEND_STANDBY] = mem_sleep_labels[PM_SUSPEND_STANDBY];
+		pm_states[PM_SUSPEND_STANDBY] = pm_labels[PM_SUSPEND_STANDBY];
+		if (mem_sleep_default == PM_SUSPEND_STANDBY)
+			mem_sleep_current = PM_SUSPEND_STANDBY;
+	}
+	if (valid_state(PM_SUSPEND_MEM)) {
+		mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM];
+		if (mem_sleep_default == PM_SUSPEND_MEM)
+			mem_sleep_current = PM_SUSPEND_MEM;
+	}
 
 	unlock_system_sleep();
 }
-- 
cgit v1.2.3


From 08b98d3291652bdcd1029a059e39fbcae5ad93e2 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 17 Nov 2016 03:28:53 +0100
Subject: PM / sleep / ACPI: Use the ACPI_FADT_LOW_POWER_S0 flag

Modify the ACPI system sleep support setup code to select
suspend-to-idle as the default system sleep state if the
ACPI_FADT_LOW_POWER_S0 flag is set in the FADT and the
default sleep state was not selected from the kernel command
line.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Tested-by: Mario Limonciello <mario.limonciello@dell.com>
---
 Documentation/power/states.txt | 4 +++-
 drivers/acpi/sleep.c           | 8 ++++++++
 include/linux/suspend.h        | 2 ++
 kernel/power/suspend.c         | 4 ++--
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt
index 008ecb588317..8a39ce45d8a0 100644
--- a/Documentation/power/states.txt
+++ b/Documentation/power/states.txt
@@ -35,7 +35,9 @@ only one way to cause the system to go into the Suspend-To-RAM state (write
 The default suspend mode (ie. the one to be used without writing anything into
 /sys/power/mem_sleep) is either "deep" (if Suspend-To-RAM is supported) or
 "s2idle", but it can be overridden by the value of the "mem_sleep_default"
-parameter in the kernel command line.
+parameter in the kernel command line.  On some ACPI-based systems, depending on
+the information in the FADT, the default may be "s2idle" even if Suspend-To-RAM
+is supported.
 
 The properties of all of the sleep states are described below.
 
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index deb0ff78eba8..ce1855fd584b 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -691,6 +691,14 @@ static void acpi_sleep_suspend_setup(void)
 		if (acpi_sleep_state_supported(i))
 			sleep_states[i] = 1;
 
+	/*
+	 * Use suspend-to-idle by default if ACPI_FADT_LOW_POWER_S0 is set and
+	 * the default suspend mode was not selected from the command line.
+	 */
+	if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0 &&
+	    mem_sleep_default > PM_SUSPEND_MEM)
+		mem_sleep_default = PM_SUSPEND_FREEZE;
+
 	suspend_set_ops(old_suspend_ordering ?
 		&acpi_suspend_ops_old : &acpi_suspend_ops);
 	freeze_set_ops(&acpi_freeze_ops);
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index d9718378a8be..0c729c3c8549 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -194,6 +194,8 @@ struct platform_freeze_ops {
 };
 
 #ifdef CONFIG_SUSPEND
+extern suspend_state_t mem_sleep_default;
+
 /**
  * suspend_set_ops - set platform dependent suspend operations
  * @ops: The new suspend operations to set.
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 15e6baef5c73..f67ceb7768b8 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -46,7 +46,7 @@ static const char * const mem_sleep_labels[] = {
 const char *mem_sleep_states[PM_SUSPEND_MAX];
 
 suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE;
-static suspend_state_t mem_sleep_default = PM_SUSPEND_MEM;
+suspend_state_t mem_sleep_default = PM_SUSPEND_MAX;
 
 unsigned int pm_suspend_global_flags;
 EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
@@ -168,7 +168,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops)
 	}
 	if (valid_state(PM_SUSPEND_MEM)) {
 		mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM];
-		if (mem_sleep_default == PM_SUSPEND_MEM)
+		if (mem_sleep_default >= PM_SUSPEND_MEM)
 			mem_sleep_current = PM_SUSPEND_MEM;
 	}
 
-- 
cgit v1.2.3


From 46992d6b55b558ac4128c1f846de3cfddfa7cf7c Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 21 Nov 2016 16:33:19 -0800
Subject: cpufreq: intel_pstate: round up min_perf limits

In some use cases, user wants to enforce a minimum performance limit on
CPUs. But because of simple division the resultant performance is 100 MHz
less than the desired in some cases.

For example when the maximum frequency is 3.50 GHz, setting
scaling_min_frequency to 1.6 GHz always results in 1.5 GHz minimum. With
simple round up, the frequency can be set to 1.6 GHz to minimum in this
case. This round up is already done to max_policy_pct and max_perf, so do
the same for min_policy_pct and min_perf.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0d82bf320838..e5ef51d7be4e 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1583,8 +1583,8 @@ static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
 	if (policy->max == policy->min) {
 		limits->min_policy_pct = limits->max_policy_pct;
 	} else {
-		limits->min_policy_pct = (policy->min * 100) /
-						policy->cpuinfo.max_freq;
+		limits->min_policy_pct = DIV_ROUND_UP(policy->min * 100,
+						      policy->cpuinfo.max_freq);
 		limits->min_policy_pct = clamp_t(int, limits->min_policy_pct,
 						 0, 100);
 	}
@@ -1605,6 +1605,7 @@ static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
 	limits->min_perf = div_fp(limits->min_perf_pct, 100);
 	limits->max_perf = div_fp(limits->max_perf_pct, 100);
 	limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
+	limits->min_perf = round_up(limits->min_perf, FRAC_BITS);
 
 	mutex_unlock(&intel_pstate_limits_lock);
 
-- 
cgit v1.2.3


From d5dd33d9de0d50db7f3ba221f9c4e4f74e61a69d Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Mon, 21 Nov 2016 16:33:20 -0800
Subject: cpufreq: intel_pstate: increase precision of performance limits

Even with round up of limits->min_perf and limits->max_perf, in some
cases resultant performance is 100 MHz less than the desired.

For example when the maximum frequency is 3.50 GHz, setting
scaling_min_frequency to 2.3 GHz always results in 2.2 GHz minimum.

Currently the fixed floating point operation uses 8 bit precision for
calculating limits->min_perf and limits->max_perf. For some operations
in this driver the 14 bit precision is used. Using the 14 bit precision
also for calculating limits->min_perf and limits->max_perf, addresses
this issue.

Introduced fp_ext_toint() equivalent to fp_toint() and int_ext_tofp()
equivalent to int_tofp() with 14 bit precision.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index e5ef51d7be4e..7159dbde0160 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -54,6 +54,8 @@
 
 #define EXT_BITS 6
 #define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS)
+#define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS)
+#define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS)
 
 static inline int32_t mul_fp(int32_t x, int32_t y)
 {
@@ -351,9 +353,9 @@ static struct perf_limits performance_limits = {
 	.no_turbo = 0,
 	.turbo_disabled = 0,
 	.max_perf_pct = 100,
-	.max_perf = int_tofp(1),
+	.max_perf = int_ext_tofp(1),
 	.min_perf_pct = 100,
-	.min_perf = int_tofp(1),
+	.min_perf = int_ext_tofp(1),
 	.max_policy_pct = 100,
 	.max_sysfs_pct = 100,
 	.min_policy_pct = 0,
@@ -364,7 +366,7 @@ static struct perf_limits powersave_limits = {
 	.no_turbo = 0,
 	.turbo_disabled = 0,
 	.max_perf_pct = 100,
-	.max_perf = int_tofp(1),
+	.max_perf = int_ext_tofp(1),
 	.min_perf_pct = 0,
 	.min_perf = 0,
 	.max_policy_pct = 100,
@@ -776,7 +778,7 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
 				   limits->max_perf_pct);
 	limits->max_perf_pct = max(limits->min_perf_pct,
 				   limits->max_perf_pct);
-	limits->max_perf = div_fp(limits->max_perf_pct, 100);
+	limits->max_perf = div_ext_fp(limits->max_perf_pct, 100);
 
 	mutex_unlock(&intel_pstate_limits_lock);
 
@@ -804,7 +806,7 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
 				   limits->min_perf_pct);
 	limits->min_perf_pct = min(limits->max_perf_pct,
 				   limits->min_perf_pct);
-	limits->min_perf = div_fp(limits->min_perf_pct, 100);
+	limits->min_perf = div_ext_fp(limits->min_perf_pct, 100);
 
 	mutex_unlock(&intel_pstate_limits_lock);
 
@@ -1189,11 +1191,11 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max)
 	 * policy, or by cpu specific default values determined through
 	 * experimentation.
 	 */
-	max_perf_adj = fp_toint(max_perf * perf_limits->max_perf);
+	max_perf_adj = fp_ext_toint(max_perf * perf_limits->max_perf);
 	*max = clamp_t(int, max_perf_adj,
 			cpu->pstate.min_pstate, cpu->pstate.turbo_pstate);
 
-	min_perf = fp_toint(max_perf * perf_limits->min_perf);
+	min_perf = fp_ext_toint(max_perf * perf_limits->min_perf);
 	*min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf);
 }
 
@@ -1561,9 +1563,9 @@ static void intel_pstate_set_performance_limits(struct perf_limits *limits)
 	limits->no_turbo = 0;
 	limits->turbo_disabled = 0;
 	limits->max_perf_pct = 100;
-	limits->max_perf = int_tofp(1);
+	limits->max_perf = int_ext_tofp(1);
 	limits->min_perf_pct = 100;
-	limits->min_perf = int_tofp(1);
+	limits->min_perf = int_ext_tofp(1);
 	limits->max_policy_pct = 100;
 	limits->max_sysfs_pct = 100;
 	limits->min_policy_pct = 0;
@@ -1602,10 +1604,10 @@ static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
 	/* Make sure min_perf_pct <= max_perf_pct */
 	limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct);
 
-	limits->min_perf = div_fp(limits->min_perf_pct, 100);
-	limits->max_perf = div_fp(limits->max_perf_pct, 100);
-	limits->max_perf = round_up(limits->max_perf, FRAC_BITS);
-	limits->min_perf = round_up(limits->min_perf, FRAC_BITS);
+	limits->min_perf = div_ext_fp(limits->min_perf_pct, 100);
+	limits->max_perf = div_ext_fp(limits->max_perf_pct, 100);
+	limits->max_perf = round_up(limits->max_perf, EXT_FRAC_BITS);
+	limits->min_perf = round_up(limits->min_perf, EXT_FRAC_BITS);
 
 	mutex_unlock(&intel_pstate_limits_lock);
 
-- 
cgit v1.2.3


From a94e502c22b61072f808f53392a8433bc948b03d Mon Sep 17 00:00:00 2001
From: Sudeep Holla <Sudeep.Holla@arm.com>
Date: Thu, 10 Nov 2016 14:24:33 +0000
Subject: cpuidle: dt: assign ->enter_freeze to same as ->enter callback
 function

enter_freeze() callback is expected atleast to do the same as enter()
but it has to guarantee that interrupts aren't enabled at any point
in its execution, as the tick is frozen.

CPUs execute ->enter_freeze with the local tick or entire timekeeping
suspended, so it must not re-enable interrupts at any point (even
temporarily) or attempt to change states of clock event devices.

It will be called when the system goes to suspend-to-idle and will
reduce power usage because CPUs won't be awaken for unnecessary IRQs
(i.e. woken up only on IRQs from "wakeup sources")

We can reuse the same code for both the enter() and enter_freeze()
callbacks as along as they don't re-enable interrupts. Only "coupled"
cpuidle mechanism enables interrupts and doing that with timekeeping
suspended is generally not safe.

Since this generic DT based idle driver doesn't support "coupled"
states, it is safe to assume that the interrupts are not re-enabled.

This patch assign enter_freeze to same as enter callback function which
helps to save power without any intermittent spurious wakeups from
suspend-to-idle.

Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Tested-by: Andy Gross <andy.gross@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/dt_idle_states.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c
index a5c111b67f37..ffca4fc0061d 100644
--- a/drivers/cpuidle/dt_idle_states.c
+++ b/drivers/cpuidle/dt_idle_states.c
@@ -38,6 +38,12 @@ static int init_state_node(struct cpuidle_state *idle_state,
 	 * state enter function.
 	 */
 	idle_state->enter = match_id->data;
+	/*
+	 * Since this is not a "coupled" state, it's safe to assume interrupts
+	 * won't be enabled when it exits allowing the tick to be frozen
+	 * safely. So enter() can be also enter_freeze() callback.
+	 */
+	idle_state->enter_freeze = match_id->data;
 
 	err = of_property_read_u32(state_node, "wakeup-latency-us",
 				   &idle_state->exit_latency);
-- 
cgit v1.2.3


From d06e622d3d9206e6a2cc45a0f9a3256da8773ff4 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 24 Nov 2016 13:51:11 +0530
Subject: cpufreq: schedutil: Rectify comment in sugov_irq_work() function

This patch rectifies a comment present in sugov_irq_work() function to
follow proper grammar.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 kernel/sched/cpufreq_schedutil.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 42a220e78f00..fd4659313640 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -315,15 +315,15 @@ static void sugov_irq_work(struct irq_work *irq_work)
 	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
 
 	/*
-	 * For Real Time and Deadline tasks, schedutil governor shoots the
-	 * frequency to maximum. And special care must be taken to ensure that
-	 * this kthread doesn't result in that.
+	 * For RT and deadline tasks, the schedutil governor shoots the
+	 * frequency to maximum. Special care must be taken to ensure that this
+	 * kthread doesn't result in the same behavior.
 	 *
 	 * This is (mostly) guaranteed by the work_in_progress flag. The flag is
-	 * updated only at the end of the sugov_work() and before that schedutil
-	 * rejects all other frequency scaling requests.
+	 * updated only at the end of the sugov_work() function and before that
+	 * the schedutil governor rejects all other frequency scaling requests.
 	 *
-	 * Though there is a very rare case where the RT thread yields right
+	 * There is a very rare case though, where the RT thread yields right
 	 * after the work_in_progress flag is cleared. The effects of that are
 	 * neglected for now.
 	 */
-- 
cgit v1.2.3


From bed5ab6375cd556d93661bbcea0d18c109c50df1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 22 Nov 2016 21:15:58 +0000
Subject: powercap/intel_rapl: Add missing domain data update on hotplug

The domain data of packages is only updated at init time, but new packages
created by hotplug miss that treatment.

Add it there and remove the global update at init time, because it's now
obsolete.

The more interesting question is why rapl_update_domain_data() exists at
all as nothing ever uses that data.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 42 ++++++++++++++++++------------------------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 243b233ff31b..94b9901d192f 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -1164,24 +1164,20 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
 };
 MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
 
-/* read once for all raw primitive data for all packages, domains */
-static void rapl_update_domain_data(void)
+/* Read once for all raw primitive data for domains */
+static void rapl_update_domain_data(struct rapl_package *rp)
 {
 	int dmn, prim;
 	u64 val;
-	struct rapl_package *rp;
 
-	list_for_each_entry(rp, &rapl_packages, plist) {
-		for (dmn = 0; dmn < rp->nr_domains; dmn++) {
-			pr_debug("update package %d domain %s data\n", rp->id,
-				rp->domains[dmn].name);
-			/* exclude non-raw primitives */
-			for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++)
-				if (!rapl_read_data_raw(&rp->domains[dmn], prim,
-								rpi[prim].unit,
-								&val))
-					rp->domains[dmn].rdd.primitives[prim] =
-									val;
+	for (dmn = 0; dmn < rp->nr_domains; dmn++) {
+		pr_debug("update package %d domain %s data\n", rp->id,
+			 rp->domains[dmn].name);
+		/* exclude non-raw primitives */
+		for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) {
+			if (!rapl_read_data_raw(&rp->domains[dmn], prim,
+						rpi[prim].unit, &val))
+				rp->domains[dmn].rdd.primitives[prim] =	val;
 		}
 	}
 
@@ -1234,10 +1230,12 @@ static int rapl_unregister_powercap(void)
 static int rapl_package_register_powercap(struct rapl_package *rp)
 {
 	struct rapl_domain *rd;
-	int ret = 0;
 	char dev_name[17]; /* max domain name = 7 + 1 + 8 for int + 1 for null*/
 	struct powercap_zone *power_zone = NULL;
-	int nr_pl;
+	int nr_pl, ret;;
+
+	/* Update the domain data of the new package */
+	rapl_update_domain_data(rp);
 
 	/* first we register package domain as the parent zone*/
 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
@@ -1257,8 +1255,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
 			if (IS_ERR(power_zone)) {
 				pr_debug("failed to register package, %d\n",
 					rp->id);
-				ret = PTR_ERR(power_zone);
-				goto exit_package;
+				return PTR_ERR(power_zone);
 			}
 			/* track parent zone in per package/socket data */
 			rp->power_zone = power_zone;
@@ -1268,8 +1265,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
 	}
 	if (!power_zone) {
 		pr_err("no package domain found, unknown topology!\n");
-		ret = -ENODEV;
-		goto exit_package;
+		return -ENODEV;
 	}
 	/* now register domains as children of the socket/package*/
 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
@@ -1290,9 +1286,8 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
 			goto err_cleanup;
 		}
 	}
+	return 0;
 
-exit_package:
-	return ret;
 err_cleanup:
 	/* clean up previously initialized domains within the package if we
 	 * failed after the first domain setup.
@@ -1357,8 +1352,7 @@ static int rapl_register_powercap(void)
 		pr_debug("failed to register powercap control_type.\n");
 		return PTR_ERR(control_type);
 	}
-	/* read the initial data */
-	rapl_update_domain_data();
+
 	list_for_each_entry(rp, &rapl_packages, plist)
 		if (rapl_package_register_powercap(rp))
 			goto err_cleanup_package;
-- 
cgit v1.2.3


From a74f43675790ae55b908940652b4b04b236d8f18 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 22 Nov 2016 21:15:59 +0000
Subject: powercap/intel_rapl: Propagate error code when registration fails

If rapl_package_register_powercap() fails in rapl_add_package() the
function happily returns 0.

Capture the error code and propagate it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 94b9901d192f..79b1e041f2ec 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -1563,9 +1563,8 @@ static void rapl_remove_package(struct rapl_package *rp)
 /* called from CPU hotplug notifier, hotplug lock held */
 static int rapl_add_package(int cpu)
 {
-	int ret = 0;
-	int phy_package_id;
 	struct rapl_package *rp;
+	int ret, phy_package_id;
 
 	phy_package_id = topology_physical_package_id(cpu);
 	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
@@ -1583,10 +1582,11 @@ static int rapl_add_package(int cpu)
 		ret = -ENODEV;
 		goto err_free_package;
 	}
-	if (!rapl_package_register_powercap(rp)) {
+	ret = rapl_package_register_powercap(rp);
+	if (!ret) {
 		INIT_LIST_HEAD(&rp->plist);
 		list_add(&rp->plist, &rapl_packages);
-		return ret;
+		return 0;
 	}
 
 err_free_package:
-- 
cgit v1.2.3


From 5e4dc7915979a5a1ca9551659f0fdcd116a25b8f Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 22 Nov 2016 21:16:00 +0000
Subject: powercap/intel rapl: Convert to hotplug state machine

Install the callbacks via the state machine as a first step. The init/exit
code is a duplicate of the hotplug code. This is cleaned up in a
consecutive patch.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 94 ++++++++++++++++++++++---------------------
 1 file changed, 49 insertions(+), 45 deletions(-)

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 79b1e041f2ec..de5cf22371f5 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -1603,50 +1603,48 @@ err_free_package:
  * associated domains. Cooling devices are handled accordingly at
  * per-domain level.
  */
-static int rapl_cpu_callback(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
+static int rapl_cpu_online(unsigned int cpu)
+{
+	struct rapl_package *rp;
+	int phy_package_id;
+
+	phy_package_id = topology_physical_package_id(cpu);
+
+	rp = find_package_by_id(phy_package_id);
+	if (rp)
+		rp->nr_cpus++;
+	else
+		rapl_add_package(cpu);
+	return 0;
+}
+
+static int rapl_cpu_down_prep(unsigned int cpu)
 {
-	unsigned long cpu = (unsigned long)hcpu;
 	int phy_package_id;
 	struct rapl_package *rp;
 	int lead_cpu;
 
 	phy_package_id = topology_physical_package_id(cpu);
-	switch (action) {
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		rp = find_package_by_id(phy_package_id);
-		if (rp)
-			++rp->nr_cpus;
-		else
-			rapl_add_package(cpu);
-		break;
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-		rp = find_package_by_id(phy_package_id);
-		if (!rp)
-			break;
-		if (--rp->nr_cpus == 0)
-			rapl_remove_package(rp);
-		else if (cpu == rp->lead_cpu) {
-			/* choose another active cpu in the package */
-			lead_cpu = cpumask_any_but(topology_core_cpumask(cpu), cpu);
-			if (lead_cpu < nr_cpu_ids)
-				rp->lead_cpu = lead_cpu;
-			else /* should never go here */
-				pr_err("no active cpu available for package %d\n",
-					phy_package_id);
+	rp = find_package_by_id(phy_package_id);
+	if (!rp)
+		return 0;
+	if (--rp->nr_cpus == 0) {
+		rapl_remove_package(rp);
+	} else if (cpu == rp->lead_cpu) {
+		/* choose another active cpu in the package */
+		lead_cpu = cpumask_any_but(topology_core_cpumask(cpu), cpu);
+		if (lead_cpu < nr_cpu_ids) {
+			rp->lead_cpu = lead_cpu;
+		} else {
+			/* should never go here */
+			pr_err("no active cpu available for package %d\n",
+			       phy_package_id);
 		}
 	}
-
-	return NOTIFY_OK;
+	return 0;
 }
 
-static struct notifier_block rapl_cpu_notifier = {
-	.notifier_call = rapl_cpu_callback,
-};
+static enum cpuhp_state pcap_rapl_online;
 
 static int __init rapl_init(void)
 {
@@ -1663,36 +1661,42 @@ static int __init rapl_init(void)
 
 	rapl_defaults = (struct rapl_defaults *)id->driver_data;
 
-	cpu_notifier_register_begin();
-
 	/* prevent CPU hotplug during detection */
 	get_online_cpus();
 	ret = rapl_detect_topology();
 	if (ret)
-		goto done;
+		goto err;
 
 	if (rapl_register_powercap()) {
-		rapl_cleanup_data();
 		ret = -ENODEV;
-		goto done;
+		goto err_cleanup;
 	}
-	__register_hotcpu_notifier(&rapl_cpu_notifier);
-done:
+	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+					"powercap/rapl:online",
+					rapl_cpu_online, rapl_cpu_down_prep);
+	if (ret < 0)
+		goto err_unreg;
+	pcap_rapl_online = ret;
 	put_online_cpus();
-	cpu_notifier_register_done();
+	return 0;
+
+err_unreg:
+	rapl_unregister_powercap();
 
+err_cleanup:
+	rapl_cleanup_data();
+err:
+	put_online_cpus();
 	return ret;
 }
 
 static void __exit rapl_exit(void)
 {
-	cpu_notifier_register_begin();
 	get_online_cpus();
-	__unregister_hotcpu_notifier(&rapl_cpu_notifier);
+	cpuhp_remove_state(pcap_rapl_online);
 	rapl_unregister_powercap();
 	rapl_cleanup_data();
 	put_online_cpus();
-	cpu_notifier_register_done();
 }
 
 module_init(rapl_init);
-- 
cgit v1.2.3


From 58705069204cc8afa0d7b759e81b61147e973de9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 22 Nov 2016 21:16:02 +0000
Subject: powercap/intel_rapl: Cleanup duplicated init code

The whole init/exit code is a duplicate of the cpuhotplug code. So we can
just let the hotplug code do the actual work of setting up and tearing down
the domains.

This also restores the package hardware when a package is removed during
hotplug instead of leaving it in the last configured state and only reset
it when the driver is removed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 234 +++++++++---------------------------------
 1 file changed, 46 insertions(+), 188 deletions(-)

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index de5cf22371f5..76a00696a5ce 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -275,18 +275,6 @@ static struct rapl_package *find_package_by_id(int id)
 	return NULL;
 }
 
-/* caller must hold cpu hotplug lock */
-static void rapl_cleanup_data(void)
-{
-	struct rapl_package *p, *tmp;
-
-	list_for_each_entry_safe(p, tmp, &rapl_packages, plist) {
-		kfree(p->domains);
-		list_del(&p->plist);
-		kfree(p);
-	}
-}
-
 static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw)
 {
 	struct rapl_domain *rd;
@@ -976,10 +964,20 @@ static void package_power_limit_irq_save(struct rapl_package *rp)
 	smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1);
 }
 
-static void power_limit_irq_restore_cpu(void *info)
+/*
+ * Restore per package power limit interrupt enable state. Called from cpu
+ * hotplug code on package removal.
+ */
+static void package_power_limit_irq_restore(struct rapl_package *rp)
 {
-	u32 l, h = 0;
-	struct rapl_package *rp = (struct rapl_package *)info;
+	u32 l, h;
+
+	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
+		return;
+
+	/* irq enable state not saved, nothing to restore */
+	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
+		return;
 
 	rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h);
 
@@ -991,19 +989,6 @@ static void power_limit_irq_restore_cpu(void *info)
 	wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
 }
 
-/* restore per package power limit interrupt enable state */
-static void package_power_limit_irq_restore(struct rapl_package *rp)
-{
-	if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN))
-		return;
-
-	/* irq enable state not saved, nothing to restore */
-	if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED))
-		return;
-
-	smp_call_function_single(rp->lead_cpu, power_limit_irq_restore_cpu, rp, 1);
-}
-
 static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
 {
 	int nr_powerlimit = find_nr_power_limit(rd);
@@ -1183,48 +1168,14 @@ static void rapl_update_domain_data(struct rapl_package *rp)
 
 }
 
-static int rapl_unregister_powercap(void)
+static void rapl_unregister_powercap(void)
 {
-	struct rapl_package *rp;
-	struct rapl_domain *rd, *rd_package = NULL;
-
-	/* unregister all active rapl packages from the powercap layer,
-	 * hotplug lock held
-	 */
-	list_for_each_entry(rp, &rapl_packages, plist) {
-		package_power_limit_irq_restore(rp);
-
-		for (rd = rp->domains; rd < rp->domains + rp->nr_domains;
-		     rd++) {
-			pr_debug("remove package, undo power limit on %d: %s\n",
-				rp->id, rd->name);
-			rapl_write_data_raw(rd, PL1_ENABLE, 0);
-			rapl_write_data_raw(rd, PL1_CLAMP, 0);
-			if (find_nr_power_limit(rd) > 1) {
-				rapl_write_data_raw(rd, PL2_ENABLE, 0);
-				rapl_write_data_raw(rd, PL2_CLAMP, 0);
-			}
-			if (rd->id == RAPL_DOMAIN_PACKAGE) {
-				rd_package = rd;
-				continue;
-			}
-			powercap_unregister_zone(control_type, &rd->power_zone);
-		}
-		/* do the package zone last */
-		if (rd_package)
-			powercap_unregister_zone(control_type,
-						&rd_package->power_zone);
-	}
-
 	if (platform_rapl_domain) {
 		powercap_unregister_zone(control_type,
 					 &platform_rapl_domain->power_zone);
 		kfree(platform_rapl_domain);
 	}
-
 	powercap_unregister_control_type(control_type);
-
-	return 0;
 }
 
 static int rapl_package_register_powercap(struct rapl_package *rp)
@@ -1289,7 +1240,8 @@ static int rapl_package_register_powercap(struct rapl_package *rp)
 	return 0;
 
 err_cleanup:
-	/* clean up previously initialized domains within the package if we
+	/*
+	 * Clean up previously initialized domains within the package if we
 	 * failed after the first domain setup.
 	 */
 	while (--rd >= rp->domains) {
@@ -1300,7 +1252,7 @@ err_cleanup:
 	return ret;
 }
 
-static int rapl_register_psys(void)
+static int __init rapl_register_psys(void)
 {
 	struct rapl_domain *rd;
 	struct powercap_zone *power_zone;
@@ -1341,39 +1293,14 @@ static int rapl_register_psys(void)
 	return 0;
 }
 
-static int rapl_register_powercap(void)
+static int __init rapl_register_powercap(void)
 {
-	struct rapl_domain *rd;
-	struct rapl_package *rp;
-	int ret = 0;
-
 	control_type = powercap_register_control_type(NULL, "intel-rapl", NULL);
 	if (IS_ERR(control_type)) {
 		pr_debug("failed to register powercap control_type.\n");
 		return PTR_ERR(control_type);
 	}
-
-	list_for_each_entry(rp, &rapl_packages, plist)
-		if (rapl_package_register_powercap(rp))
-			goto err_cleanup_package;
-
-	/* Don't bail out if PSys is not supported */
-	rapl_register_psys();
-
-	return ret;
-
-err_cleanup_package:
-	/* clean up previously initialized packages */
-	list_for_each_entry_continue_reverse(rp, &rapl_packages, plist) {
-		for (rd = rp->domains; rd < rp->domains + rp->nr_domains;
-		     rd++) {
-			pr_debug("unregister zone/package %d, %s domain\n",
-				rp->id, rd->name);
-			powercap_unregister_zone(control_type, &rd->power_zone);
-		}
-	}
-
-	return ret;
+	return 0;
 }
 
 static int rapl_check_domain(int cpu, int domain)
@@ -1446,9 +1373,8 @@ static void rapl_detect_powerlimit(struct rapl_domain *rd)
  */
 static int rapl_detect_domains(struct rapl_package *rp, int cpu)
 {
-	int i;
-	int ret = 0;
 	struct rapl_domain *rd;
+	int i;
 
 	for (i = 0; i < RAPL_DOMAIN_MAX; i++) {
 		/* use physical package id to read counters */
@@ -1460,84 +1386,20 @@ static int rapl_detect_domains(struct rapl_package *rp, int cpu)
 	rp->nr_domains = bitmap_weight(&rp->domain_map,	RAPL_DOMAIN_MAX);
 	if (!rp->nr_domains) {
 		pr_debug("no valid rapl domains found in package %d\n", rp->id);
-		ret = -ENODEV;
-		goto done;
+		return -ENODEV;
 	}
 	pr_debug("found %d domains on package %d\n", rp->nr_domains, rp->id);
 
 	rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain),
 			GFP_KERNEL);
-	if (!rp->domains) {
-		ret = -ENOMEM;
-		goto done;
-	}
+	if (!rp->domains)
+		return -ENOMEM;
+
 	rapl_init_domains(rp);
 
 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++)
 		rapl_detect_powerlimit(rd);
 
-
-
-done:
-	return ret;
-}
-
-static bool is_package_new(int package)
-{
-	struct rapl_package *rp;
-
-	/* caller prevents cpu hotplug, there will be no new packages added
-	 * or deleted while traversing the package list, no need for locking.
-	 */
-	list_for_each_entry(rp, &rapl_packages, plist)
-		if (package == rp->id)
-			return false;
-
-	return true;
-}
-
-/* RAPL interface can be made of a two-level hierarchy: package level and domain
- * level. We first detect the number of packages then domains of each package.
- * We have to consider the possiblity of CPU online/offline due to hotplug and
- * other scenarios.
- */
-static int rapl_detect_topology(void)
-{
-	int i;
-	int phy_package_id;
-	struct rapl_package *new_package, *rp;
-
-	for_each_online_cpu(i) {
-		phy_package_id = topology_physical_package_id(i);
-		if (is_package_new(phy_package_id)) {
-			new_package = kzalloc(sizeof(*rp), GFP_KERNEL);
-			if (!new_package) {
-				rapl_cleanup_data();
-				return -ENOMEM;
-			}
-			/* add the new package to the list */
-			new_package->id = phy_package_id;
-			new_package->nr_cpus = 1;
-			/* use the first active cpu of the package to access */
-			new_package->lead_cpu = i;
-			/* check if the package contains valid domains */
-			if (rapl_detect_domains(new_package, i) ||
-				rapl_defaults->check_unit(new_package, i)) {
-				kfree(new_package->domains);
-				kfree(new_package);
-				/* free up the packages already initialized */
-				rapl_cleanup_data();
-				return -ENODEV;
-			}
-			INIT_LIST_HEAD(&new_package->plist);
-			list_add(&new_package->plist, &rapl_packages);
-		} else {
-			rp = find_package_by_id(phy_package_id);
-			if (rp)
-				++rp->nr_cpus;
-		}
-	}
-
 	return 0;
 }
 
@@ -1546,12 +1408,21 @@ static void rapl_remove_package(struct rapl_package *rp)
 {
 	struct rapl_domain *rd, *rd_package = NULL;
 
+	package_power_limit_irq_restore(rp);
+
 	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
+		rapl_write_data_raw(rd, PL1_ENABLE, 0);
+		rapl_write_data_raw(rd, PL1_CLAMP, 0);
+		if (find_nr_power_limit(rd) > 1) {
+			rapl_write_data_raw(rd, PL2_ENABLE, 0);
+			rapl_write_data_raw(rd, PL2_CLAMP, 0);
+		}
 		if (rd->id == RAPL_DOMAIN_PACKAGE) {
 			rd_package = rd;
 			continue;
 		}
-		pr_debug("remove package %d, %s domain\n", rp->id, rd->name);
+		pr_debug("remove package, undo power limit on %d: %s\n",
+			 rp->id, rd->name);
 		powercap_unregister_zone(control_type, &rd->power_zone);
 	}
 	/* do parent zone last */
@@ -1611,11 +1482,11 @@ static int rapl_cpu_online(unsigned int cpu)
 	phy_package_id = topology_physical_package_id(cpu);
 
 	rp = find_package_by_id(phy_package_id);
-	if (rp)
+	if (rp) {
 		rp->nr_cpus++;
-	else
-		rapl_add_package(cpu);
-	return 0;
+		return 0;
+	}
+	return rapl_add_package(cpu);
 }
 
 static int rapl_cpu_down_prep(unsigned int cpu)
@@ -1648,8 +1519,8 @@ static enum cpuhp_state pcap_rapl_online;
 
 static int __init rapl_init(void)
 {
-	int ret = 0;
 	const struct x86_cpu_id *id;
+	int ret;
 
 	id = x86_match_cpu(rapl_ids);
 	if (!id) {
@@ -1661,42 +1532,29 @@ static int __init rapl_init(void)
 
 	rapl_defaults = (struct rapl_defaults *)id->driver_data;
 
-	/* prevent CPU hotplug during detection */
-	get_online_cpus();
-	ret = rapl_detect_topology();
+	ret = rapl_register_powercap();
 	if (ret)
-		goto err;
+		return ret;
 
-	if (rapl_register_powercap()) {
-		ret = -ENODEV;
-		goto err_cleanup;
-	}
-	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
-					"powercap/rapl:online",
-					rapl_cpu_online, rapl_cpu_down_prep);
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online",
+				rapl_cpu_online, rapl_cpu_down_prep);
 	if (ret < 0)
 		goto err_unreg;
 	pcap_rapl_online = ret;
-	put_online_cpus();
+
+	/* Don't bail out if PSys is not supported */
+	rapl_register_psys();
 	return 0;
 
 err_unreg:
 	rapl_unregister_powercap();
-
-err_cleanup:
-	rapl_cleanup_data();
-err:
-	put_online_cpus();
 	return ret;
 }
 
 static void __exit rapl_exit(void)
 {
-	get_online_cpus();
 	cpuhp_remove_state(pcap_rapl_online);
 	rapl_unregister_powercap();
-	rapl_cleanup_data();
-	put_online_cpus();
 }
 
 module_init(rapl_init);
-- 
cgit v1.2.3


From b4005e9278a4e62819fb16ba4bc3430ca650d0ab Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 22 Nov 2016 21:16:05 +0000
Subject: powercap/intel_rapl: Track active CPUs internally

The ability of the CPU hotplug code to stop online/offline at each step
makes it necessary to track the activated CPUs in a package directly,
because outerwise a CPU offline callback can find CPUs which have already
executed the offline callback, but are not yet marked offline in the
topology mask. That could make such a CPU the package leader and in case
that CPU goes fully offline leave the package lead orphaned.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 59 ++++++++++++++++++-------------------------
 1 file changed, 24 insertions(+), 35 deletions(-)

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 76a00696a5ce..65ed88a28030 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -189,14 +189,13 @@ struct rapl_package {
 	unsigned int time_unit;
 	struct rapl_domain *domains; /* array of domains, sized at runtime */
 	struct powercap_zone *power_zone; /* keep track of parent zone */
-	int nr_cpus; /* active cpus on the package, topology info is lost during
-		      * cpu hotplug. so we have to track ourselves.
-		      */
 	unsigned long power_limit_irq; /* keep track of package power limit
 					* notify interrupt enable status.
 					*/
 	struct list_head plist;
 	int lead_cpu; /* one active cpu per package for access */
+	/* Track active cpus */
+	struct cpumask cpumask;
 };
 
 struct rapl_defaults {
@@ -1432,19 +1431,17 @@ static void rapl_remove_package(struct rapl_package *rp)
 }
 
 /* called from CPU hotplug notifier, hotplug lock held */
-static int rapl_add_package(int cpu)
+static struct rapl_package *rapl_add_package(int cpu, int pkgid)
 {
 	struct rapl_package *rp;
-	int ret, phy_package_id;
+	int ret;
 
-	phy_package_id = topology_physical_package_id(cpu);
 	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
 	if (!rp)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	/* add the new package to the list */
-	rp->id = phy_package_id;
-	rp->nr_cpus = 1;
+	rp->id = pkgid;
 	rp->lead_cpu = cpu;
 
 	/* check if the package contains valid domains */
@@ -1457,14 +1454,13 @@ static int rapl_add_package(int cpu)
 	if (!ret) {
 		INIT_LIST_HEAD(&rp->plist);
 		list_add(&rp->plist, &rapl_packages);
-		return 0;
+		return rp;
 	}
 
 err_free_package:
 	kfree(rp->domains);
 	kfree(rp);
-
-	return ret;
+	return ERR_PTR(ret);
 }
 
 /* Handles CPU hotplug on multi-socket systems.
@@ -1476,42 +1472,35 @@ err_free_package:
  */
 static int rapl_cpu_online(unsigned int cpu)
 {
+	int pkgid = topology_physical_package_id(cpu);
 	struct rapl_package *rp;
-	int phy_package_id;
-
-	phy_package_id = topology_physical_package_id(cpu);
 
-	rp = find_package_by_id(phy_package_id);
-	if (rp) {
-		rp->nr_cpus++;
-		return 0;
+	rp = find_package_by_id(pkgid);
+	if (!rp) {
+		rp = rapl_add_package(cpu, pkgid);
+		if (IS_ERR(rp))
+			return PTR_ERR(rp);
 	}
-	return rapl_add_package(cpu);
+	cpumask_set_cpu(cpu, &rp->cpumask);
+	return 0;
 }
 
 static int rapl_cpu_down_prep(unsigned int cpu)
 {
-	int phy_package_id;
+	int pkgid = topology_physical_package_id(cpu);
 	struct rapl_package *rp;
 	int lead_cpu;
 
-	phy_package_id = topology_physical_package_id(cpu);
-	rp = find_package_by_id(phy_package_id);
+	rp = find_package_by_id(pkgid);
 	if (!rp)
 		return 0;
-	if (--rp->nr_cpus == 0) {
+
+	cpumask_clear_cpu(cpu, &rp->cpumask);
+	lead_cpu = cpumask_first(&rp->cpumask);
+	if (lead_cpu >= nr_cpu_ids)
 		rapl_remove_package(rp);
-	} else if (cpu == rp->lead_cpu) {
-		/* choose another active cpu in the package */
-		lead_cpu = cpumask_any_but(topology_core_cpumask(cpu), cpu);
-		if (lead_cpu < nr_cpu_ids) {
-			rp->lead_cpu = lead_cpu;
-		} else {
-			/* should never go here */
-			pr_err("no active cpu available for package %d\n",
-			       phy_package_id);
-		}
-	}
+	else if (rp->lead_cpu == cpu)
+		rp->lead_cpu = lead_cpu;
 	return 0;
 }
 
-- 
cgit v1.2.3


From ed61390bff3ad43166a2552651b09ebd95dd1da5 Mon Sep 17 00:00:00 2001
From: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Date: Tue, 22 Nov 2016 16:08:06 +1100
Subject: cpuidle/powernv: staticise powernv_idle_driver

powernv_idle_driver isn't exported, it can be made static. Found by sparse.

Signed-off-by: Andrew Donnellan <andrew.donnellan@au1.ibm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/cpuidle-powernv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 7fe442ca38f4..0835a37a5f3a 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -22,7 +22,7 @@
 
 #define POWERNV_THRESHOLD_LATENCY_NS 200000
 
-struct cpuidle_driver powernv_idle_driver = {
+static struct cpuidle_driver powernv_idle_driver = {
 	.name             = "powernv_idle",
 	.owner            = THIS_MODULE,
 };
-- 
cgit v1.2.3


From 8442885fca09b2d26375b9fe507759879a6f661e Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Thu, 24 Nov 2016 16:07:10 -0800
Subject: cpufreq: intel_pstate: Set EPP/EPB to 0 in performance mode

When user has selected performance policy, then set the EPP (Energy
Performance Preference) or EPB (Energy Performance Bias) to maximum
performance mode.

Also when user switch back to powersave, then restore EPP/EPB to last
EPP/EPB value before entering performance mode. If user has not changed
EPP/EPB manually then it will be power on default value.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 106 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 7159dbde0160..0b90a63de46d 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -249,6 +249,9 @@ struct perf_limits {
  *			when per cpu controls are enforced
  * @acpi_perf_data:	Stores ACPI perf information read from _PSS
  * @valid_pss_table:	Set to true for valid ACPI _PSS entries found
+ * @epp_saved:		Last saved HWP energy performance preference
+ *			(EPP) or energy performance bias (EPB)
+ * @epp_policy:		Last saved policy used to set EPP/EPB
  *
  * This structure stores per CPU instance data for all CPUs.
  */
@@ -276,6 +279,8 @@ struct cpudata {
 	bool valid_pss_table;
 #endif
 	unsigned int iowait_boost;
+	s16 epp_saved;
+	s16 epp_policy;
 };
 
 static struct cpudata **all_cpu_data;
@@ -574,6 +579,48 @@ static inline void update_turbo_state(void)
 		 cpu->pstate.max_pstate == cpu->pstate.turbo_pstate);
 }
 
+static s16 intel_pstate_get_epb(struct cpudata *cpu_data)
+{
+	u64 epb;
+	int ret;
+
+	if (!static_cpu_has(X86_FEATURE_EPB))
+		return -ENXIO;
+
+	ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+	if (ret)
+		return (s16)ret;
+
+	return (s16)(epb & 0x0f);
+}
+
+static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data)
+{
+	s16 epp;
+
+	if (static_cpu_has(X86_FEATURE_HWP_EPP))
+		epp = (hwp_req_data >> 24) & 0xff;
+	else
+		/* When there is no EPP present, HWP uses EPB settings */
+		epp = intel_pstate_get_epb(cpu_data);
+
+	return epp;
+}
+
+static void intel_pstate_set_epb(int cpu, s16 pref)
+{
+	u64 epb;
+
+	if (!static_cpu_has(X86_FEATURE_EPB))
+		return;
+
+	if (rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb))
+		return;
+
+	epb = (epb & ~0x0f) | pref;
+	wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb);
+}
+
 static void intel_pstate_hwp_set(const struct cpumask *cpumask)
 {
 	int min, hw_min, max, hw_max, cpu, range, adj_range;
@@ -582,6 +629,8 @@ static void intel_pstate_hwp_set(const struct cpumask *cpumask)
 
 	for_each_cpu(cpu, cpumask) {
 		int max_perf_pct, min_perf_pct;
+		struct cpudata *cpu_data = all_cpu_data[cpu];
+		s16 epp;
 
 		if (per_cpu_limits)
 			perf_limits = all_cpu_data[cpu]->perf_limits;
@@ -610,6 +659,48 @@ static void intel_pstate_hwp_set(const struct cpumask *cpumask)
 
 		value &= ~HWP_MAX_PERF(~0L);
 		value |= HWP_MAX_PERF(max);
+
+		if (cpu_data->epp_policy == cpu_data->policy)
+			goto skip_epp;
+
+		cpu_data->epp_policy = cpu_data->policy;
+
+		if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
+			epp = intel_pstate_get_epp(cpu_data, value);
+			/* If EPP read was failed, then don't try to write */
+			if (epp < 0) {
+				cpu_data->epp_saved = epp;
+				goto skip_epp;
+			}
+
+			cpu_data->epp_saved = epp;
+
+			epp = 0;
+		} else {
+			/* skip setting EPP, when saved value is invalid */
+			if (cpu_data->epp_saved < 0)
+				goto skip_epp;
+
+			/*
+			 * No need to restore EPP when it is not zero. This
+			 * means:
+			 *  - Policy is not changed
+			 *  - user has manually changed
+			 *  - Error reading EPB
+			 */
+			epp = intel_pstate_get_epp(cpu_data, value);
+			if (epp)
+				goto skip_epp;
+
+			epp = cpu_data->epp_saved;
+		}
+		if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
+			value &= ~GENMASK_ULL(31, 24);
+			value |= (u64)epp << 24;
+		} else {
+			intel_pstate_set_epb(cpu, epp);
+		}
+skip_epp:
 		wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value);
 	}
 }
@@ -622,6 +713,17 @@ static int intel_pstate_hwp_set_policy(struct cpufreq_policy *policy)
 	return 0;
 }
 
+static int intel_pstate_resume(struct cpufreq_policy *policy)
+{
+	if (!hwp_active)
+		return 0;
+
+	all_cpu_data[policy->cpu]->epp_policy = 0;
+	all_cpu_data[policy->cpu]->epp_saved = -EINVAL;
+
+	return intel_pstate_hwp_set_policy(policy);
+}
+
 static void intel_pstate_hwp_set_online_cpus(void)
 {
 	get_online_cpus();
@@ -872,6 +974,8 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata)
 		wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00);
 
 	wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
+	cpudata->epp_policy = 0;
+	cpudata->epp_saved = -EINVAL;
 }
 
 static int atom_get_min_pstate(void)
@@ -1767,7 +1871,7 @@ static struct cpufreq_driver intel_pstate = {
 	.flags		= CPUFREQ_CONST_LOOPS,
 	.verify		= intel_pstate_verify_policy,
 	.setpolicy	= intel_pstate_set_policy,
-	.resume		= intel_pstate_hwp_set_policy,
+	.resume		= intel_pstate_resume,
 	.get		= intel_pstate_get,
 	.init		= intel_pstate_cpu_init,
 	.exit		= intel_pstate_cpu_exit,
-- 
cgit v1.2.3


From 7a3ba767f6bf9d98f1c823417908719af4a22a65 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 25 Nov 2016 17:50:20 +0100
Subject: cpufreq: intel_pstate: fix intel_pstate_exit_perf_limits() prototype
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The addition of the generic governor support marked the
intel_pstate_exit_perf_limits as inline(), which fixed a warning,
but it introduced another warning:

drivers/cpufreq/intel_pstate.c: In function ‘intel_pstate_exit_perf_limits’:
drivers/cpufreq/intel_pstate.c:483:1: error: no return statement in function returning non-void [-Werror=return-type]

This changes it back to a 'void' return type, and changes the
corresponding intel_pstate_init_acpi_perf_limits() function to
be inline as well for consistency.

Fixes: 001c76f05b01 (cpufreq: intel_pstate: Generic governors support)
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 0b90a63de46d..c20da85d7713 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -479,11 +479,11 @@ static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
 }
 
 #else
-static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
+static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
 {
 }
 
-static inline int intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
+static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy)
 {
 }
 #endif
-- 
cgit v1.2.3


From 4d66ddf28dd6e0582eeca0266651d1f5f5db431e Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 28 Nov 2016 10:51:02 +0100
Subject: cpufreq: acpi-cpufreq: Convert to hotplug state machine

Install the callbacks via the state machine.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/acpi-cpufreq.c | 92 +++++++++++++++++++++---------------------
 1 file changed, 45 insertions(+), 47 deletions(-)

diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 297e9128fe9f..52b25f2356f2 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -536,46 +536,33 @@ static void free_acpi_perf_data(void)
 	free_percpu(acpi_perf_data);
 }
 
-static int boost_notify(struct notifier_block *nb, unsigned long action,
-		      void *hcpu)
+static int cpufreq_boost_online(unsigned int cpu)
 {
-	unsigned cpu = (long)hcpu;
 	const struct cpumask *cpumask;
 
 	cpumask = get_cpu_mask(cpu);
-
 	/*
-	 * Clear the boost-disable bit on the CPU_DOWN path so that
-	 * this cpu cannot block the remaining ones from boosting. On
-	 * the CPU_UP path we simply keep the boost-disable flag in
-	 * sync with the current global state.
+	 * On the CPU_UP path we simply keep the boost-disable flag
+	 * in sync with the current global state.
 	 */
+	boost_set_msrs(acpi_cpufreq_driver.boost_enabled, cpumask);
+	return 0;
+}
 
-	switch (action) {
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		boost_set_msrs(acpi_cpufreq_driver.boost_enabled, cpumask);
-		break;
-
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-		boost_set_msrs(1, cpumask);
-		break;
+static int cpufreq_boost_down_prep(unsigned int cpu)
+{
+	const struct cpumask *cpumask;
 
-	default:
-		break;
-	}
+	cpumask = get_cpu_mask(cpu);
 
-	return NOTIFY_OK;
+	/*
+	 * Clear the boost-disable bit on the CPU_DOWN path so that
+	 * this cpu cannot block the remaining ones from boosting.
+	 */
+	boost_set_msrs(1, cpumask);
+	return 0;
 }
 
-
-static struct notifier_block boost_nb = {
-	.notifier_call          = boost_notify,
-};
-
 /*
  * acpi_cpufreq_early_init - initialize ACPI P-States library
  *
@@ -922,37 +909,48 @@ static struct cpufreq_driver acpi_cpufreq_driver = {
 	.attr		= acpi_cpufreq_attr,
 };
 
+static enum cpuhp_state acpi_cpufreq_online;
+
 static void __init acpi_cpufreq_boost_init(void)
 {
-	if (boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA)) {
-		msrs = msrs_alloc();
-
-		if (!msrs)
-			return;
+	int ret;
 
-		acpi_cpufreq_driver.set_boost = set_boost;
-		acpi_cpufreq_driver.boost_enabled = boost_state(0);
+	if (!(boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA)))
+		return;
 
-		cpu_notifier_register_begin();
+	msrs = msrs_alloc();
 
-		/* Force all MSRs to the same value */
-		boost_set_msrs(acpi_cpufreq_driver.boost_enabled,
-			       cpu_online_mask);
+	if (!msrs)
+		return;
 
-		__register_cpu_notifier(&boost_nb);
+	acpi_cpufreq_driver.set_boost = set_boost;
+	acpi_cpufreq_driver.boost_enabled = boost_state(0);
 
-		cpu_notifier_register_done();
+	/*
+	 * This calls the online callback on all online cpu and forces all
+	 * MSRs to the same value.
+	 */
+	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cpufreq/acpi:online",
+				cpufreq_boost_online, cpufreq_boost_down_prep);
+	if (ret < 0) {
+		pr_err("acpi_cpufreq: failed to register hotplug callbacks\n");
+		msrs_free(msrs);
+		msrs = NULL;
+		return;
 	}
+	acpi_cpufreq_online = ret;
 }
 
 static void acpi_cpufreq_boost_exit(void)
 {
-	if (msrs) {
-		unregister_cpu_notifier(&boost_nb);
+	if (!msrs)
+		return;
 
-		msrs_free(msrs);
-		msrs = NULL;
-	}
+	if (acpi_cpufreq_online >= 0)
+		cpuhp_remove_state_nocalls(acpi_cpufreq_online);
+
+	msrs_free(msrs);
+	msrs = NULL;
 }
 
 static int __init acpi_cpufreq_init(void)
-- 
cgit v1.2.3


From a3605c46e0c05b117e5710d72128f74b779c2e76 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 28 Nov 2016 10:52:03 +0100
Subject: cpufreq: acpi-cpufreq: drop rdmsr_on_cpus() usage

The online / pre_down callback is invoked on the target CPU since commit
1cf4f629d9d2 ("cpu/hotplug: Move online calls to hotplugged cpu") which means
for the hotplug callback we can use rmdsrl() instead of rdmsr_on_cpus().

This leaves us with set_boost() as the only user which still needs to
read/write the MSR on different CPUs. There is no point in doing that
update on all cpus with the read modify write magic via per cpu data. We
simply can issue a function call on all online CPUs which also means that we
need half that many IPIs.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/acpi-cpufreq.c | 59 ++++++++++++++----------------------------
 1 file changed, 20 insertions(+), 39 deletions(-)

diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c
index 52b25f2356f2..3a98702b7445 100644
--- a/drivers/cpufreq/acpi-cpufreq.c
+++ b/drivers/cpufreq/acpi-cpufreq.c
@@ -84,7 +84,6 @@ static inline struct acpi_processor_performance *to_perf_data(struct acpi_cpufre
 static struct cpufreq_driver acpi_cpufreq_driver;
 
 static unsigned int acpi_pstate_strict;
-static struct msr __percpu *msrs;
 
 static bool boost_state(unsigned int cpu)
 {
@@ -104,11 +103,10 @@ static bool boost_state(unsigned int cpu)
 	return false;
 }
 
-static void boost_set_msrs(bool enable, const struct cpumask *cpumask)
+static int boost_set_msr(bool enable)
 {
-	u32 cpu;
 	u32 msr_addr;
-	u64 msr_mask;
+	u64 msr_mask, val;
 
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
@@ -120,26 +118,31 @@ static void boost_set_msrs(bool enable, const struct cpumask *cpumask)
 		msr_mask = MSR_K7_HWCR_CPB_DIS;
 		break;
 	default:
-		return;
+		return -EINVAL;
 	}
 
-	rdmsr_on_cpus(cpumask, msr_addr, msrs);
+	rdmsrl(msr_addr, val);
 
-	for_each_cpu(cpu, cpumask) {
-		struct msr *reg = per_cpu_ptr(msrs, cpu);
-		if (enable)
-			reg->q &= ~msr_mask;
-		else
-			reg->q |= msr_mask;
-	}
+	if (enable)
+		val &= ~msr_mask;
+	else
+		val |= msr_mask;
 
-	wrmsr_on_cpus(cpumask, msr_addr, msrs);
+	wrmsrl(msr_addr, val);
+	return 0;
+}
+
+static void boost_set_msr_each(void *p_en)
+{
+	bool enable = (bool) p_en;
+
+	boost_set_msr(enable);
 }
 
 static int set_boost(int val)
 {
 	get_online_cpus();
-	boost_set_msrs(val, cpu_online_mask);
+	on_each_cpu(boost_set_msr_each, (void *)(long)val, 1);
 	put_online_cpus();
 	pr_debug("Core Boosting %sabled.\n", val ? "en" : "dis");
 
@@ -538,29 +541,20 @@ static void free_acpi_perf_data(void)
 
 static int cpufreq_boost_online(unsigned int cpu)
 {
-	const struct cpumask *cpumask;
-
-	cpumask = get_cpu_mask(cpu);
 	/*
 	 * On the CPU_UP path we simply keep the boost-disable flag
 	 * in sync with the current global state.
 	 */
-	boost_set_msrs(acpi_cpufreq_driver.boost_enabled, cpumask);
-	return 0;
+	return boost_set_msr(acpi_cpufreq_driver.boost_enabled);
 }
 
 static int cpufreq_boost_down_prep(unsigned int cpu)
 {
-	const struct cpumask *cpumask;
-
-	cpumask = get_cpu_mask(cpu);
-
 	/*
 	 * Clear the boost-disable bit on the CPU_DOWN path so that
 	 * this cpu cannot block the remaining ones from boosting.
 	 */
-	boost_set_msrs(1, cpumask);
-	return 0;
+	return boost_set_msr(1);
 }
 
 /*
@@ -918,11 +912,6 @@ static void __init acpi_cpufreq_boost_init(void)
 	if (!(boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA)))
 		return;
 
-	msrs = msrs_alloc();
-
-	if (!msrs)
-		return;
-
 	acpi_cpufreq_driver.set_boost = set_boost;
 	acpi_cpufreq_driver.boost_enabled = boost_state(0);
 
@@ -934,8 +923,6 @@ static void __init acpi_cpufreq_boost_init(void)
 				cpufreq_boost_online, cpufreq_boost_down_prep);
 	if (ret < 0) {
 		pr_err("acpi_cpufreq: failed to register hotplug callbacks\n");
-		msrs_free(msrs);
-		msrs = NULL;
 		return;
 	}
 	acpi_cpufreq_online = ret;
@@ -943,14 +930,8 @@ static void __init acpi_cpufreq_boost_init(void)
 
 static void acpi_cpufreq_boost_exit(void)
 {
-	if (!msrs)
-		return;
-
 	if (acpi_cpufreq_online >= 0)
 		cpuhp_remove_state_nocalls(acpi_cpufreq_online);
-
-	msrs_free(msrs);
-	msrs = NULL;
 }
 
 static int __init acpi_cpufreq_init(void)
-- 
cgit v1.2.3


From cb43f81b8489dcb87555e16c17453f0a9fa690f2 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Mon, 28 Nov 2016 13:53:11 -0800
Subject: powercap/intel_rapl: fix and tidy up error handling

Commit e1399ba20eee ("powercap / RAPL: handle missing MSRs") added
contraint_to_pl() function to return index into an array. But it
can potentially return -EINVAL if powercap layer sends an out of
range constraint ID. This patch adds sanity check.

Unnecessary RAPL domain pointer check is removed since it must be
initialized before calling rapl_unit_xlate().

Fixes: e1399ba20eee ("powercap / RAPL: handle missing MSRs")
Reported-by: Odzioba, Lukasz <lukasz.odzioba@intel.com>
Reported-by: Koss, Marcin <marcin.koss@intel.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index 65ed88a28030..f6937d0b742e 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -429,6 +429,7 @@ static int contraint_to_pl(struct rapl_domain *rd, int cid)
 			return i;
 		}
 	}
+	pr_err("Cannot find matching power limit for constraint %d\n", cid);
 
 	return -EINVAL;
 }
@@ -444,6 +445,10 @@ static int set_power_limit(struct powercap_zone *power_zone, int cid,
 	get_online_cpus();
 	rd = power_zone_to_rapl_domain(power_zone);
 	id = contraint_to_pl(rd, cid);
+	if (id < 0) {
+		ret = id;
+		goto set_exit;
+	}
 
 	rp = rd->rp;
 
@@ -483,6 +488,11 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
 	get_online_cpus();
 	rd = power_zone_to_rapl_domain(power_zone);
 	id = contraint_to_pl(rd, cid);
+	if (id < 0) {
+		ret = id;
+		goto get_exit;
+	}
+
 	switch (rd->rpl[id].prim_id) {
 	case PL1_ENABLE:
 		prim = POWER_LIMIT1;
@@ -499,6 +509,7 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
 	else
 		*data = val;
 
+get_exit:
 	put_online_cpus();
 
 	return ret;
@@ -514,6 +525,10 @@ static int set_time_window(struct powercap_zone *power_zone, int cid,
 	get_online_cpus();
 	rd = power_zone_to_rapl_domain(power_zone);
 	id = contraint_to_pl(rd, cid);
+	if (id < 0) {
+		ret = id;
+		goto set_time_exit;
+	}
 
 	switch (rd->rpl[id].prim_id) {
 	case PL1_ENABLE:
@@ -525,6 +540,8 @@ static int set_time_window(struct powercap_zone *power_zone, int cid,
 	default:
 		ret = -EINVAL;
 	}
+
+set_time_exit:
 	put_online_cpus();
 	return ret;
 }
@@ -539,6 +556,10 @@ static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data)
 	get_online_cpus();
 	rd = power_zone_to_rapl_domain(power_zone);
 	id = contraint_to_pl(rd, cid);
+	if (id < 0) {
+		ret = id;
+		goto get_time_exit;
+	}
 
 	switch (rd->rpl[id].prim_id) {
 	case PL1_ENABLE:
@@ -553,6 +574,8 @@ static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data)
 	}
 	if (!ret)
 		*data = val;
+
+get_time_exit:
 	put_online_cpus();
 
 	return ret;
@@ -694,7 +717,7 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
 	case ENERGY_UNIT:
 		scale = ENERGY_UNIT_SCALE;
 		/* per domain unit takes precedence */
-		if (rd && rd->domain_energy_unit)
+		if (rd->domain_energy_unit)
 			units = rd->domain_energy_unit;
 		else
 			units = rp->energy_unit;
-- 
cgit v1.2.3


From bb8313b603eb8fd52de48a079bfcd72dcab2ef1e Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Mon, 28 Nov 2016 23:03:04 -0800
Subject: cpuidle: Allow enforcing deepest idle state selection

When idle injection is used to cap power, we need to override the
governor's choice of idle states.

For this reason, make it possible the deepest idle state selection to
be enforced by setting a flag on a given CPU to achieve the maximum
potential power draw reduction.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
[ rjw: Subject & changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/cpuidle.c | 13 ++++++++++++-
 include/linux/cpuidle.h   |  7 ++++++-
 kernel/sched/idle.c       | 13 ++++++++-----
 3 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index c73207abb5a4..afc005b917fe 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -97,7 +97,17 @@ static int find_deepest_state(struct cpuidle_driver *drv,
 	return ret;
 }
 
-#ifdef CONFIG_SUSPEND
+/* Set the current cpu to use the deepest idle state, override governors */
+void cpuidle_use_deepest_state(bool enable)
+{
+	struct cpuidle_device *dev;
+
+	preempt_disable();
+	dev = cpuidle_get_device();
+	dev->use_deepest_state = enable;
+	preempt_enable();
+}
+
 /**
  * cpuidle_find_deepest_state - Find the deepest available idle state.
  * @drv: cpuidle driver for the given CPU.
@@ -109,6 +119,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
 	return find_deepest_state(drv, dev, UINT_MAX, 0, false);
 }
 
+#ifdef CONFIG_SUSPEND
 static void enter_freeze_proper(struct cpuidle_driver *drv,
 				struct cpuidle_device *dev, int index)
 {
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 15deea449edc..da346f2817a8 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -74,6 +74,7 @@ struct cpuidle_driver_kobj;
 struct cpuidle_device {
 	unsigned int		registered:1;
 	unsigned int		enabled:1;
+	unsigned int		use_deepest_state:1;
 	unsigned int		cpu;
 
 	int			last_residency;
@@ -192,11 +193,12 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver(
 static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; }
 #endif
 
-#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND)
+#ifdef CONFIG_CPU_IDLE
 extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
 				      struct cpuidle_device *dev);
 extern int cpuidle_enter_freeze(struct cpuidle_driver *drv,
 				struct cpuidle_device *dev);
+extern void cpuidle_use_deepest_state(bool enable);
 #else
 static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
 					     struct cpuidle_device *dev)
@@ -204,6 +206,9 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
 static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv,
 				       struct cpuidle_device *dev)
 {return -ENODEV; }
+static inline void cpuidle_use_deepest_state(bool enable)
+{
+}
 #endif
 
 /* kernel/sched/idle.c */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1d8718d5300d..513e4dfeeae7 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -164,11 +164,14 @@ static void cpuidle_idle_call(void)
 	 * timekeeping to prevent timer interrupts from kicking us out of idle
 	 * until a proper wakeup interrupt happens.
 	 */
-	if (idle_should_freeze()) {
-		entered_state = cpuidle_enter_freeze(drv, dev);
-		if (entered_state > 0) {
-			local_irq_enable();
-			goto exit_idle;
+
+	if (idle_should_freeze() || dev->use_deepest_state) {
+		if (idle_should_freeze()) {
+			entered_state = cpuidle_enter_freeze(drv, dev);
+			if (entered_state > 0) {
+				local_irq_enable();
+				goto exit_idle;
+			}
 		}
 
 		next_state = cpuidle_find_deepest_state(drv, dev);
-- 
cgit v1.2.3


From c1de45ca831acee9b72c9320dde447edafadb43f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 28 Nov 2016 23:03:05 -0800
Subject: sched/idle: Add support for tasks that inject idle

Idle injection drivers such as Intel powerclamp and ACPI PAD drivers use
realtime tasks to take control of CPU then inject idle. There are two
issues with this approach:

 1. Low efficiency: injected idle task is treated as busy so sched ticks
    do not stop during injected idle period, the result of these
    unwanted wakeups can be ~20% loss in power savings.

 2. Idle accounting: injected idle time is presented to user as busy.

This patch addresses the issues by introducing a new PF_IDLE flag which
allows any given task to be treated as idle task while the flag is set.
Therefore, idle injection tasks can run through the normal flow of NOHZ
idle enter/exit to get the correct accounting as well as tick stop when
possible.

The implication is that idle task is then no longer limited to PID == 0.

Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/cpu.h   |   2 +
 include/linux/sched.h |   3 +-
 kernel/fork.c         |   2 +-
 kernel/sched/core.c   |   1 +
 kernel/sched/idle.c   | 162 +++++++++++++++++++++++++++++++-------------------
 5 files changed, 107 insertions(+), 63 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index b886dc17f2f3..ac0efae38072 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -245,6 +245,8 @@ void arch_cpu_idle_dead(void);
 int cpu_report_state(int cpu);
 int cpu_check_up_prepare(int cpu);
 void cpu_set_state_online(int cpu);
+void play_idle(unsigned long duration_ms);
+
 #ifdef CONFIG_HOTPLUG_CPU
 bool cpu_wait_death(unsigned int cpu, int seconds);
 bool cpu_report_death(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 348f51b0ec92..114c7fcb6af6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2254,6 +2254,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 /*
  * Per process flags
  */
+#define PF_IDLE		0x00000002	/* I am an IDLE thread */
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
@@ -2609,7 +2610,7 @@ extern struct task_struct *idle_task(int cpu);
  */
 static inline bool is_idle_task(const struct task_struct *p)
 {
-	return p->pid == 0;
+	return !!(p->flags & PF_IDLE);
 }
 extern struct task_struct *curr_task(int cpu);
 extern void ia64_set_curr_task(int cpu, struct task_struct *p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 623259fc794d..5074b2f0827b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1537,7 +1537,7 @@ static __latent_entropy struct task_struct *copy_process(
 		goto bad_fork_cleanup_count;
 
 	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
-	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE);
 	p->flags |= PF_FORKNOEXEC;
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 94732d1ab00a..63b3a8a49884 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5285,6 +5285,7 @@ void init_idle(struct task_struct *idle, int cpu)
 	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
+	idle->flags |= PF_IDLE;
 
 	kasan_unpoison_task_stack(idle);
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 513e4dfeeae7..6a4bae0a649d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -205,76 +205,65 @@ exit_idle:
  *
  * Called with polling cleared.
  */
-static void cpu_idle_loop(void)
+static void do_idle(void)
 {
-	int cpu = smp_processor_id();
-
-	while (1) {
-		/*
-		 * If the arch has a polling bit, we maintain an invariant:
-		 *
-		 * Our polling bit is clear if we're not scheduled (i.e. if
-		 * rq->curr != rq->idle).  This means that, if rq->idle has
-		 * the polling bit set, then setting need_resched is
-		 * guaranteed to cause the cpu to reschedule.
-		 */
-
-		__current_set_polling();
-		quiet_vmstat();
-		tick_nohz_idle_enter();
+	/*
+	 * If the arch has a polling bit, we maintain an invariant:
+	 *
+	 * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
+	 * rq->idle). This means that, if rq->idle has the polling bit set,
+	 * then setting need_resched is guaranteed to cause the CPU to
+	 * reschedule.
+	 */
 
-		while (!need_resched()) {
-			check_pgt_cache();
-			rmb();
+	__current_set_polling();
+	tick_nohz_idle_enter();
 
-			if (cpu_is_offline(cpu)) {
-				cpuhp_report_idle_dead();
-				arch_cpu_idle_dead();
-			}
+	while (!need_resched()) {
+		check_pgt_cache();
+		rmb();
 
-			local_irq_disable();
-			arch_cpu_idle_enter();
-
-			/*
-			 * In poll mode we reenable interrupts and spin.
-			 *
-			 * Also if we detected in the wakeup from idle
-			 * path that the tick broadcast device expired
-			 * for us, we don't want to go deep idle as we
-			 * know that the IPI is going to arrive right
-			 * away
-			 */
-			if (cpu_idle_force_poll || tick_check_broadcast_expired())
-				cpu_idle_poll();
-			else
-				cpuidle_idle_call();
-
-			arch_cpu_idle_exit();
+		if (cpu_is_offline(smp_processor_id())) {
+			cpuhp_report_idle_dead();
+			arch_cpu_idle_dead();
 		}
 
-		/*
-		 * Since we fell out of the loop above, we know
-		 * TIF_NEED_RESCHED must be set, propagate it into
-		 * PREEMPT_NEED_RESCHED.
-		 *
-		 * This is required because for polling idle loops we will
-		 * not have had an IPI to fold the state for us.
-		 */
-		preempt_set_need_resched();
-		tick_nohz_idle_exit();
-		__current_clr_polling();
+		local_irq_disable();
+		arch_cpu_idle_enter();
 
 		/*
-		 * We promise to call sched_ttwu_pending and reschedule
-		 * if need_resched is set while polling is set.  That
-		 * means that clearing polling needs to be visible
-		 * before doing these things.
+		 * In poll mode we reenable interrupts and spin. Also if we
+		 * detected in the wakeup from idle path that the tick
+		 * broadcast device expired for us, we don't want to go deep
+		 * idle as we know that the IPI is going to arrive right away.
 		 */
-		smp_mb__after_atomic();
-
-		sched_ttwu_pending();
-		schedule_preempt_disabled();
+		if (cpu_idle_force_poll || tick_check_broadcast_expired())
+			cpu_idle_poll();
+		else
+			cpuidle_idle_call();
+		arch_cpu_idle_exit();
 	}
+
+	/*
+	 * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
+	 * be set, propagate it into PREEMPT_NEED_RESCHED.
+	 *
+	 * This is required because for polling idle loops we will not have had
+	 * an IPI to fold the state for us.
+	 */
+	preempt_set_need_resched();
+	tick_nohz_idle_exit();
+	__current_clr_polling();
+
+	/*
+	 * We promise to call sched_ttwu_pending() and reschedule if
+	 * need_resched() is set while polling is set. That means that clearing
+	 * polling needs to be visible before doing these things.
+	 */
+	smp_mb__after_atomic();
+
+	sched_ttwu_pending();
+	schedule_preempt_disabled();
 }
 
 bool cpu_in_idle(unsigned long pc)
@@ -283,6 +272,56 @@ bool cpu_in_idle(unsigned long pc)
 		pc < (unsigned long)__cpuidle_text_end;
 }
 
+struct idle_timer {
+	struct hrtimer timer;
+	int done;
+};
+
+static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
+{
+	struct idle_timer *it = container_of(timer, struct idle_timer, timer);
+
+	WRITE_ONCE(it->done, 1);
+	set_tsk_need_resched(current);
+
+	return HRTIMER_NORESTART;
+}
+
+void play_idle(unsigned long duration_ms)
+{
+	struct idle_timer it;
+
+	/*
+	 * Only FIFO tasks can disable the tick since they don't need the forced
+	 * preemption.
+	 */
+	WARN_ON_ONCE(current->policy != SCHED_FIFO);
+	WARN_ON_ONCE(current->nr_cpus_allowed != 1);
+	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
+	WARN_ON_ONCE(!duration_ms);
+
+	rcu_sleep_check();
+	preempt_disable();
+	current->flags |= PF_IDLE;
+	cpuidle_use_deepest_state(true);
+
+	it.done = 0;
+	hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	it.timer.function = idle_inject_timer_fn;
+	hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED);
+
+	while (!READ_ONCE(it.done))
+		do_idle();
+
+	cpuidle_use_deepest_state(false);
+	current->flags &= ~PF_IDLE;
+
+	preempt_fold_need_resched();
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(play_idle);
+
 void cpu_startup_entry(enum cpuhp_state state)
 {
 	/*
@@ -302,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state)
 #endif
 	arch_cpu_idle_prepare();
 	cpuhp_online_idle(state);
-	cpu_idle_loop();
+	while (1)
+		do_idle();
 }
-- 
cgit v1.2.3


From 14f3f7d8cbceedab17f16cf301414fa3384117fe Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Mon, 28 Nov 2016 13:44:49 -0800
Subject: thermal/intel_powerclamp: Remove duplicated code that starts the
 kthread

This patch removes code duplication. It does not modify
the functionality.

Signed-off-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/intel_powerclamp.c | 45 +++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 0e4dc0afcfd2..63657d193db5 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -508,10 +508,27 @@ static void poll_pkg_cstate(struct work_struct *dummy)
 		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
 }
 
+static void start_power_clamp_thread(unsigned long cpu)
+{
+	struct task_struct **p = per_cpu_ptr(powerclamp_thread, cpu);
+	struct task_struct *thread;
+
+	thread = kthread_create_on_node(clamp_thread,
+					(void *) cpu,
+					cpu_to_node(cpu),
+					"kidle_inject/%ld", cpu);
+	if (IS_ERR(thread))
+		return;
+
+	/* bind to cpu here */
+	kthread_bind(thread, cpu);
+	wake_up_process(thread);
+	*p = thread;
+}
+
 static int start_power_clamp(void)
 {
 	unsigned long cpu;
-	struct task_struct *thread;
 
 	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
 	/* prevent cpu hotplug */
@@ -527,20 +544,7 @@ static int start_power_clamp(void)
 
 	/* start one thread per online cpu */
 	for_each_online_cpu(cpu) {
-		struct task_struct **p =
-			per_cpu_ptr(powerclamp_thread, cpu);
-
-		thread = kthread_create_on_node(clamp_thread,
-						(void *) cpu,
-						cpu_to_node(cpu),
-						"kidle_inject/%ld", cpu);
-		/* bind to cpu here */
-		if (likely(!IS_ERR(thread))) {
-			kthread_bind(thread, cpu);
-			wake_up_process(thread);
-			*p = thread;
-		}
-
+		start_power_clamp_thread(cpu);
 	}
 	put_online_cpus();
 
@@ -572,7 +576,6 @@ static int powerclamp_cpu_callback(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
 {
 	unsigned long cpu = (unsigned long)hcpu;
-	struct task_struct *thread;
 	struct task_struct **percpu_thread =
 		per_cpu_ptr(powerclamp_thread, cpu);
 
@@ -581,15 +584,7 @@ static int powerclamp_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_ONLINE:
-		thread = kthread_create_on_node(clamp_thread,
-						(void *) cpu,
-						cpu_to_node(cpu),
-						"kidle_inject/%lu", cpu);
-		if (likely(!IS_ERR(thread))) {
-			kthread_bind(thread, cpu);
-			wake_up_process(thread);
-			*percpu_thread = thread;
-		}
+		start_power_clamp_thread(cpu);
 		/* prefer BSP as controlling CPU */
 		if (cpu == 0) {
 			control_cpu = 0;
-- 
cgit v1.2.3


From 8d962ac7f396bc83fb381469521c27aed7b70f84 Mon Sep 17 00:00:00 2001
From: Petr Mladek <pmladek@suse.com>
Date: Mon, 28 Nov 2016 13:44:50 -0800
Subject: thermal/intel_powerclamp: Convert the kthread to kthread worker API

Kthreads are currently implemented as an infinite loop. Each
has its own variant of checks for terminating, freezing,
awakening. In many cases it is unclear to say in which state
it is and sometimes it is done a wrong way.

The plan is to convert kthreads into kthread_worker or workqueues
API. It allows to split the functionality into separate operations.
It helps to make a better structure. Also it defines a clean state
where no locks are taken, IRQs blocked, the kthread might sleep
or even be safely migrated.

The kthread worker API is useful when we want to have a dedicated
single thread for the work. It helps to make sure that it is
available when needed. Also it allows a better control, e.g.
define a scheduling priority.

This patch converts the intel powerclamp kthreads into the kthread
worker because they need to have a good control over the assigned
CPUs.

IMHO, the most natural way is to split one cycle into two works.
First one does some balancing and let the CPU work normal
way for some time. The second work checks what the CPU has done
in the meantime and put it into C-state to reach the required
idle time ratio. The delay between the two works is achieved
by the delayed kthread work.

The two works have to share some data that used to be local
variables of the single kthread function. This is achieved
by the new per-CPU struct kthread_worker_data. It might look
as a complication. On the other hand, the long original kthread
function was not nice either.

The patch tries to avoid extra init and cleanup works. All the
actions might be done outside the thread. They are moved
to the functions that create or destroy the worker. Especially,
I checked that the timers are assigned to the right CPU.

The two works are queuing each other. It makes it a bit tricky to
break it when we want to stop the worker. We use the global and
per-worker "clamping" variables to make sure that the re-queuing
eventually stops. We also cancel the works to make it faster.
Note that the canceling is not reliable because the handling
of the two variables and queuing is not synchronized via a lock.
But it is not a big deal because it is just an optimization.
The job is stopped faster than before in most cases.

Signed-off-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/intel_powerclamp.c | 292 +++++++++++++++++++++----------------
 1 file changed, 170 insertions(+), 122 deletions(-)

diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 63657d193db5..a94f7c849a4e 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -86,11 +86,27 @@ static unsigned int control_cpu; /* The cpu assigned to collect stat and update
 				  */
 static bool clamping;
 
+static const struct sched_param sparam = {
+	.sched_priority = MAX_USER_RT_PRIO / 2,
+};
+struct powerclamp_worker_data {
+	struct kthread_worker *worker;
+	struct kthread_work balancing_work;
+	struct kthread_delayed_work idle_injection_work;
+	struct timer_list wakeup_timer;
+	unsigned int cpu;
+	unsigned int count;
+	unsigned int guard;
+	unsigned int window_size_now;
+	unsigned int target_ratio;
+	unsigned int duration_jiffies;
+	bool clamping;
+};
 
-static struct task_struct * __percpu *powerclamp_thread;
+static struct powerclamp_worker_data * __percpu worker_data;
 static struct thermal_cooling_device *cooling_dev;
 static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
-					   * clamping thread
+					   * clamping kthread worker
 					   */
 
 static unsigned int duration;
@@ -368,103 +384,104 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
 	return set_target_ratio + guard <= current_ratio;
 }
 
-static int clamp_thread(void *arg)
+static void clamp_balancing_func(struct kthread_work *work)
 {
-	int cpunr = (unsigned long)arg;
-	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
-	static const struct sched_param param = {
-		.sched_priority = MAX_USER_RT_PRIO/2,
-	};
-	unsigned int count = 0;
-	unsigned int target_ratio;
+	struct powerclamp_worker_data *w_data;
+	int sleeptime;
+	unsigned long target_jiffies;
+	unsigned int compensated_ratio;
+	int interval; /* jiffies to sleep for each attempt */
 
-	set_bit(cpunr, cpu_clamping_mask);
-	set_freezable();
-	init_timer_on_stack(&wakeup_timer);
-	sched_setscheduler(current, SCHED_FIFO, &param);
-
-	while (true == clamping && !kthread_should_stop() &&
-		cpu_online(cpunr)) {
-		int sleeptime;
-		unsigned long target_jiffies;
-		unsigned int guard;
-		unsigned int compensated_ratio;
-		int interval; /* jiffies to sleep for each attempt */
-		unsigned int duration_jiffies = msecs_to_jiffies(duration);
-		unsigned int window_size_now;
-
-		try_to_freeze();
-		/*
-		 * make sure user selected ratio does not take effect until
-		 * the next round. adjust target_ratio if user has changed
-		 * target such that we can converge quickly.
-		 */
-		target_ratio = set_target_ratio;
-		guard = 1 + target_ratio/20;
-		window_size_now = window_size;
-		count++;
+	w_data = container_of(work, struct powerclamp_worker_data,
+			      balancing_work);
 
-		/*
-		 * systems may have different ability to enter package level
-		 * c-states, thus we need to compensate the injected idle ratio
-		 * to achieve the actual target reported by the HW.
-		 */
-		compensated_ratio = target_ratio +
-			get_compensation(target_ratio);
-		if (compensated_ratio <= 0)
-			compensated_ratio = 1;
-		interval = duration_jiffies * 100 / compensated_ratio;
-
-		/* align idle time */
-		target_jiffies = roundup(jiffies, interval);
-		sleeptime = target_jiffies - jiffies;
-		if (sleeptime <= 0)
-			sleeptime = 1;
-		schedule_timeout_interruptible(sleeptime);
-		/*
-		 * only elected controlling cpu can collect stats and update
-		 * control parameters.
-		 */
-		if (cpunr == control_cpu && !(count%window_size_now)) {
-			should_skip =
-				powerclamp_adjust_controls(target_ratio,
-							guard, window_size_now);
-			smp_mb();
-		}
+	/*
+	 * make sure user selected ratio does not take effect until
+	 * the next round. adjust target_ratio if user has changed
+	 * target such that we can converge quickly.
+	 */
+	w_data->target_ratio = READ_ONCE(set_target_ratio);
+	w_data->guard = 1 + w_data->target_ratio / 20;
+	w_data->window_size_now = window_size;
+	w_data->duration_jiffies = msecs_to_jiffies(duration);
+	w_data->count++;
+
+	/*
+	 * systems may have different ability to enter package level
+	 * c-states, thus we need to compensate the injected idle ratio
+	 * to achieve the actual target reported by the HW.
+	 */
+	compensated_ratio = w_data->target_ratio +
+		get_compensation(w_data->target_ratio);
+	if (compensated_ratio <= 0)
+		compensated_ratio = 1;
+	interval = w_data->duration_jiffies * 100 / compensated_ratio;
+
+	/* align idle time */
+	target_jiffies = roundup(jiffies, interval);
+	sleeptime = target_jiffies - jiffies;
+	if (sleeptime <= 0)
+		sleeptime = 1;
+
+	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
+		kthread_queue_delayed_work(w_data->worker,
+					   &w_data->idle_injection_work,
+					   sleeptime);
+}
+
+static void clamp_idle_injection_func(struct kthread_work *work)
+{
+	struct powerclamp_worker_data *w_data;
+	unsigned long target_jiffies;
+
+	w_data = container_of(work, struct powerclamp_worker_data,
+			      idle_injection_work.work);
+
+	/*
+	 * only elected controlling cpu can collect stats and update
+	 * control parameters.
+	 */
+	if (w_data->cpu == control_cpu &&
+	    !(w_data->count % w_data->window_size_now)) {
+		should_skip =
+			powerclamp_adjust_controls(w_data->target_ratio,
+						   w_data->guard,
+						   w_data->window_size_now);
+		smp_mb();
+	}
 
-		if (should_skip)
-			continue;
+	if (should_skip)
+		goto balance;
+
+	target_jiffies = jiffies + w_data->duration_jiffies;
+	mod_timer(&w_data->wakeup_timer, target_jiffies);
+	if (unlikely(local_softirq_pending()))
+		goto balance;
+	/*
+	 * stop tick sched during idle time, interrupts are still
+	 * allowed. thus jiffies are updated properly.
+	 */
+	preempt_disable();
+	/* mwait until target jiffies is reached */
+	while (time_before(jiffies, target_jiffies)) {
+		unsigned long ecx = 1;
+		unsigned long eax = target_mwait;
 
-		target_jiffies = jiffies + duration_jiffies;
-		mod_timer(&wakeup_timer, target_jiffies);
-		if (unlikely(local_softirq_pending()))
-			continue;
 		/*
-		 * stop tick sched during idle time, interrupts are still
-		 * allowed. thus jiffies are updated properly.
+		 * REVISIT: may call enter_idle() to notify drivers who
+		 * can save power during cpu idle. same for exit_idle()
 		 */
-		preempt_disable();
-		/* mwait until target jiffies is reached */
-		while (time_before(jiffies, target_jiffies)) {
-			unsigned long ecx = 1;
-			unsigned long eax = target_mwait;
-
-			/*
-			 * REVISIT: may call enter_idle() to notify drivers who
-			 * can save power during cpu idle. same for exit_idle()
-			 */
-			local_touch_nmi();
-			stop_critical_timings();
-			mwait_idle_with_hints(eax, ecx);
-			start_critical_timings();
-			atomic_inc(&idle_wakeup_counter);
-		}
-		preempt_enable();
+		local_touch_nmi();
+		stop_critical_timings();
+		mwait_idle_with_hints(eax, ecx);
+		start_critical_timings();
+		atomic_inc(&idle_wakeup_counter);
 	}
-	del_timer_sync(&wakeup_timer);
-	clear_bit(cpunr, cpu_clamping_mask);
+	preempt_enable();
 
-	return 0;
+balance:
+	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
+		kthread_queue_work(w_data->worker, &w_data->balancing_work);
 }
 
 /*
@@ -508,22 +525,58 @@ static void poll_pkg_cstate(struct work_struct *dummy)
 		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
 }
 
-static void start_power_clamp_thread(unsigned long cpu)
+static void start_power_clamp_worker(unsigned long cpu)
 {
-	struct task_struct **p = per_cpu_ptr(powerclamp_thread, cpu);
-	struct task_struct *thread;
-
-	thread = kthread_create_on_node(clamp_thread,
-					(void *) cpu,
-					cpu_to_node(cpu),
-					"kidle_inject/%ld", cpu);
-	if (IS_ERR(thread))
+	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
+	struct kthread_worker *worker;
+
+	worker = kthread_create_worker_on_cpu(cpu, KTW_FREEZABLE,
+					      "kidle_inject/%ld", cpu);
+	if (IS_ERR(worker))
 		return;
 
-	/* bind to cpu here */
-	kthread_bind(thread, cpu);
-	wake_up_process(thread);
-	*p = thread;
+	w_data->worker = worker;
+	w_data->count = 0;
+	w_data->cpu = cpu;
+	w_data->clamping = true;
+	set_bit(cpu, cpu_clamping_mask);
+	setup_timer(&w_data->wakeup_timer, noop_timer, 0);
+	sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
+	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
+	kthread_init_delayed_work(&w_data->idle_injection_work,
+				  clamp_idle_injection_func);
+	kthread_queue_work(w_data->worker, &w_data->balancing_work);
+}
+
+static void stop_power_clamp_worker(unsigned long cpu)
+{
+	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
+
+	if (!w_data->worker)
+		return;
+
+	w_data->clamping = false;
+	/*
+	 * Make sure that all works that get queued after this point see
+	 * the clamping disabled. The counter part is not needed because
+	 * there is an implicit memory barrier when the queued work
+	 * is proceed.
+	 */
+	smp_wmb();
+	kthread_cancel_work_sync(&w_data->balancing_work);
+	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
+	/*
+	 * The balancing work still might be queued here because
+	 * the handling of the "clapming" variable, cancel, and queue
+	 * operations are not synchronized via a lock. But it is not
+	 * a big deal. The balancing work is fast and destroy kthread
+	 * will wait for it.
+	 */
+	del_timer_sync(&w_data->wakeup_timer);
+	clear_bit(w_data->cpu, cpu_clamping_mask);
+	kthread_destroy_worker(w_data->worker);
+
+	w_data->worker = NULL;
 }
 
 static int start_power_clamp(void)
@@ -542,9 +595,9 @@ static int start_power_clamp(void)
 	clamping = true;
 	schedule_delayed_work(&poll_pkg_cstate_work, 0);
 
-	/* start one thread per online cpu */
+	/* start one kthread worker per online cpu */
 	for_each_online_cpu(cpu) {
-		start_power_clamp_thread(cpu);
+		start_power_clamp_worker(cpu);
 	}
 	put_online_cpus();
 
@@ -554,20 +607,17 @@ static int start_power_clamp(void)
 static void end_power_clamp(void)
 {
 	int i;
-	struct task_struct *thread;
 
-	clamping = false;
 	/*
-	 * make clamping visible to other cpus and give per cpu clamping threads
-	 * sometime to exit, or gets killed later.
+	 * Block requeuing in all the kthread workers. They will flush and
+	 * stop faster.
 	 */
-	smp_mb();
-	msleep(20);
+	clamping = false;
 	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
 		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
-			pr_debug("clamping thread for cpu %d alive, kill\n", i);
-			thread = *per_cpu_ptr(powerclamp_thread, i);
-			kthread_stop(thread);
+			pr_debug("clamping worker for cpu %d alive, destroy\n",
+				 i);
+			stop_power_clamp_worker(i);
 		}
 	}
 }
@@ -576,15 +626,13 @@ static int powerclamp_cpu_callback(struct notifier_block *nfb,
 				unsigned long action, void *hcpu)
 {
 	unsigned long cpu = (unsigned long)hcpu;
-	struct task_struct **percpu_thread =
-		per_cpu_ptr(powerclamp_thread, cpu);
 
 	if (false == clamping)
 		goto exit_ok;
 
 	switch (action) {
 	case CPU_ONLINE:
-		start_power_clamp_thread(cpu);
+		start_power_clamp_worker(cpu);
 		/* prefer BSP as controlling CPU */
 		if (cpu == 0) {
 			control_cpu = 0;
@@ -595,7 +643,7 @@ static int powerclamp_cpu_callback(struct notifier_block *nfb,
 		if (test_bit(cpu, cpu_clamping_mask)) {
 			pr_err("cpu %lu dead but powerclamping thread is not\n",
 				cpu);
-			kthread_stop(*percpu_thread);
+			stop_power_clamp_worker(cpu);
 		}
 		if (cpu == control_cpu) {
 			control_cpu = smp_processor_id();
@@ -759,8 +807,8 @@ static int __init powerclamp_init(void)
 	window_size = 2;
 	register_hotcpu_notifier(&powerclamp_cpu_notifier);
 
-	powerclamp_thread = alloc_percpu(struct task_struct *);
-	if (!powerclamp_thread) {
+	worker_data = alloc_percpu(struct powerclamp_worker_data);
+	if (!worker_data) {
 		retval = -ENOMEM;
 		goto exit_unregister;
 	}
@@ -780,7 +828,7 @@ static int __init powerclamp_init(void)
 	return 0;
 
 exit_free_thread:
-	free_percpu(powerclamp_thread);
+	free_percpu(worker_data);
 exit_unregister:
 	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
 exit_free:
@@ -793,7 +841,7 @@ static void __exit powerclamp_exit(void)
 {
 	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
 	end_power_clamp();
-	free_percpu(powerclamp_thread);
+	free_percpu(worker_data);
 	thermal_cooling_device_unregister(cooling_dev);
 	kfree(cpu_clamping_mask);
 
-- 
cgit v1.2.3


From cb91fef1b71954e3edc79fb4171b43f6aa2028c7 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 28 Nov 2016 13:44:51 -0800
Subject: thermal/intel_powerclamp: Convert to CPU hotplug state

This is a conversation to the new hotplug state machine with
the difference that CPU_DEAD becomes CPU_PREDOWN.

At the same time it makes the handling of the two states symmetrical.
stop_power_clamp_worker() is called unconditionally and the controversial
error message is removed.

Finally, the hotplug state callbacks are removed after the powerclamping
is stopped to avoid a potential race.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
[pmladek@suse.com: Fixed the possible race in powerclamp_exit()]
Signed-off-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/intel_powerclamp.c | 73 +++++++++++++++++++-------------------
 1 file changed, 36 insertions(+), 37 deletions(-)

diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index a94f7c849a4e..c99af71c0ae7 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -43,7 +43,6 @@
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
-#include <linux/freezer.h>
 #include <linux/cpu.h>
 #include <linux/thermal.h>
 #include <linux/slab.h>
@@ -530,8 +529,7 @@ static void start_power_clamp_worker(unsigned long cpu)
 	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
 	struct kthread_worker *worker;
 
-	worker = kthread_create_worker_on_cpu(cpu, KTW_FREEZABLE,
-					      "kidle_inject/%ld", cpu);
+	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu);
 	if (IS_ERR(worker))
 		return;
 
@@ -622,43 +620,35 @@ static void end_power_clamp(void)
 	}
 }
 
-static int powerclamp_cpu_callback(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
+static int powerclamp_cpu_online(unsigned int cpu)
 {
-	unsigned long cpu = (unsigned long)hcpu;
+	if (clamping == false)
+		return 0;
+	start_power_clamp_worker(cpu);
+	/* prefer BSP as controlling CPU */
+	if (cpu == 0) {
+		control_cpu = 0;
+		smp_mb();
+	}
+	return 0;
+}
 
-	if (false == clamping)
-		goto exit_ok;
+static int powerclamp_cpu_predown(unsigned int cpu)
+{
+	if (clamping == false)
+		return 0;
 
-	switch (action) {
-	case CPU_ONLINE:
-		start_power_clamp_worker(cpu);
-		/* prefer BSP as controlling CPU */
-		if (cpu == 0) {
-			control_cpu = 0;
-			smp_mb();
-		}
-		break;
-	case CPU_DEAD:
-		if (test_bit(cpu, cpu_clamping_mask)) {
-			pr_err("cpu %lu dead but powerclamping thread is not\n",
-				cpu);
-			stop_power_clamp_worker(cpu);
-		}
-		if (cpu == control_cpu) {
-			control_cpu = smp_processor_id();
-			smp_mb();
-		}
-	}
+	stop_power_clamp_worker(cpu);
+	if (cpu != control_cpu)
+		return 0;
 
-exit_ok:
-	return NOTIFY_OK;
+	control_cpu = cpumask_first(cpu_online_mask);
+	if (control_cpu == cpu)
+		control_cpu = cpumask_next(cpu, cpu_online_mask);
+	smp_mb();
+	return 0;
 }
 
-static struct notifier_block powerclamp_cpu_notifier = {
-	.notifier_call = powerclamp_cpu_callback,
-};
-
 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
 				 unsigned long *state)
 {
@@ -788,6 +778,8 @@ file_error:
 	debugfs_remove_recursive(debug_dir);
 }
 
+static enum cpuhp_state hp_state;
+
 static int __init powerclamp_init(void)
 {
 	int retval;
@@ -805,7 +797,14 @@ static int __init powerclamp_init(void)
 
 	/* set default limit, maybe adjusted during runtime based on feedback */
 	window_size = 2;
-	register_hotcpu_notifier(&powerclamp_cpu_notifier);
+	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+					   "thermal/intel_powerclamp:online",
+					   powerclamp_cpu_online,
+					   powerclamp_cpu_predown);
+	if (retval < 0)
+		goto exit_free;
+
+	hp_state = retval;
 
 	worker_data = alloc_percpu(struct powerclamp_worker_data);
 	if (!worker_data) {
@@ -830,7 +829,7 @@ static int __init powerclamp_init(void)
 exit_free_thread:
 	free_percpu(worker_data);
 exit_unregister:
-	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
+	cpuhp_remove_state_nocalls(hp_state);
 exit_free:
 	kfree(cpu_clamping_mask);
 	return retval;
@@ -839,8 +838,8 @@ module_init(powerclamp_init);
 
 static void __exit powerclamp_exit(void)
 {
-	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
 	end_power_clamp();
+	cpuhp_remove_state_nocalls(hp_state);
 	free_percpu(worker_data);
 	thermal_cooling_device_unregister(cooling_dev);
 	kfree(cpu_clamping_mask);
-- 
cgit v1.2.3


From feb6cd6a0f9f7d214351624d79e408cb2af91631 Mon Sep 17 00:00:00 2001
From: Jacob Pan <jacob.jun.pan@linux.intel.com>
Date: Mon, 28 Nov 2016 13:44:52 -0800
Subject: thermal/intel_powerclamp: stop sched tick in forced idle

With the introduction of play_idle(), idle injection kthread can
go through the normal idle task processing to get correct accounting
and turn off scheduler tick when possible.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/thermal/intel_powerclamp.c | 35 +----------------------------------
 1 file changed, 1 insertion(+), 34 deletions(-)

diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index c99af71c0ae7..6d93f1d65dba 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -92,7 +92,6 @@ struct powerclamp_worker_data {
 	struct kthread_worker *worker;
 	struct kthread_work balancing_work;
 	struct kthread_delayed_work idle_injection_work;
-	struct timer_list wakeup_timer;
 	unsigned int cpu;
 	unsigned int count;
 	unsigned int guard;
@@ -277,11 +276,6 @@ static u64 pkg_state_counter(void)
 	return count;
 }
 
-static void noop_timer(unsigned long foo)
-{
-	/* empty... just the fact that we get the interrupt wakes us up */
-}
-
 static unsigned int get_compensation(int ratio)
 {
 	unsigned int comp = 0;
@@ -431,7 +425,6 @@ static void clamp_balancing_func(struct kthread_work *work)
 static void clamp_idle_injection_func(struct kthread_work *work)
 {
 	struct powerclamp_worker_data *w_data;
-	unsigned long target_jiffies;
 
 	w_data = container_of(work, struct powerclamp_worker_data,
 			      idle_injection_work.work);
@@ -452,31 +445,7 @@ static void clamp_idle_injection_func(struct kthread_work *work)
 	if (should_skip)
 		goto balance;
 
-	target_jiffies = jiffies + w_data->duration_jiffies;
-	mod_timer(&w_data->wakeup_timer, target_jiffies);
-	if (unlikely(local_softirq_pending()))
-		goto balance;
-	/*
-	 * stop tick sched during idle time, interrupts are still
-	 * allowed. thus jiffies are updated properly.
-	 */
-	preempt_disable();
-	/* mwait until target jiffies is reached */
-	while (time_before(jiffies, target_jiffies)) {
-		unsigned long ecx = 1;
-		unsigned long eax = target_mwait;
-
-		/*
-		 * REVISIT: may call enter_idle() to notify drivers who
-		 * can save power during cpu idle. same for exit_idle()
-		 */
-		local_touch_nmi();
-		stop_critical_timings();
-		mwait_idle_with_hints(eax, ecx);
-		start_critical_timings();
-		atomic_inc(&idle_wakeup_counter);
-	}
-	preempt_enable();
+	play_idle(jiffies_to_msecs(w_data->duration_jiffies));
 
 balance:
 	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
@@ -538,7 +507,6 @@ static void start_power_clamp_worker(unsigned long cpu)
 	w_data->cpu = cpu;
 	w_data->clamping = true;
 	set_bit(cpu, cpu_clamping_mask);
-	setup_timer(&w_data->wakeup_timer, noop_timer, 0);
 	sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
 	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
 	kthread_init_delayed_work(&w_data->idle_injection_work,
@@ -570,7 +538,6 @@ static void stop_power_clamp_worker(unsigned long cpu)
 	 * a big deal. The balancing work is fast and destroy kthread
 	 * will wait for it.
 	 */
-	del_timer_sync(&w_data->wakeup_timer);
 	clear_bit(w_data->cpu, cpu_clamping_mask);
 	kthread_destroy_worker(w_data->worker);
 
-- 
cgit v1.2.3


From ee061da8d9dfc30ced06f25c18694cffa70eac1e Mon Sep 17 00:00:00 2001
From: Andrew Lutomirski <luto@kernel.org>
Date: Tue, 29 Nov 2016 17:11:50 -0800
Subject: PM / QoS: Improve sysfs pm_qos_latency_tolerance validation

Negative values are special.  Don't let users write them directly.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/sysfs.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index a7b46798c81d..33b4b902741a 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -263,7 +263,11 @@ static ssize_t pm_qos_latency_tolerance_store(struct device *dev,
 	s32 value;
 	int ret;
 
-	if (kstrtos32(buf, 0, &value)) {
+	if (kstrtos32(buf, 0, &value) == 0) {
+		/* Users can't write negative values directly */
+		if (value < 0)
+			return -EINVAL;
+	} else {
 		if (!strcmp(buf, "auto") || !strcmp(buf, "auto\n"))
 			value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT;
 		else if (!strcmp(buf, "any") || !strcmp(buf, "any\n"))
-- 
cgit v1.2.3


From 80a6f7c79b7822726a096ce9e01cc00a1eacc2c4 Mon Sep 17 00:00:00 2001
From: Andrew Lutomirski <luto@kernel.org>
Date: Tue, 29 Nov 2016 17:11:51 -0800
Subject: PM / QoS: Fix writing 'auto' to pm_qos_latency_tolerance_us

If it was already 'auto', then writing 'auto' again would
incorrectly fail.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/qos.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 7f3646e459cb..6a1f2c7e01ea 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -856,7 +856,10 @@ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val)
 		struct dev_pm_qos_request *req;
 
 		if (val < 0) {
-			ret = -EINVAL;
+			if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT)
+				ret = 0;
+			else
+				ret = -EINVAL;
 			goto out;
 		}
 		req = kzalloc(sizeof(*req), GFP_KERNEL);
-- 
cgit v1.2.3


From 034e7906211c18c230ef4da43a1c44796dd5b95e Mon Sep 17 00:00:00 2001
From: Andrew Lutomirski <luto@kernel.org>
Date: Tue, 29 Nov 2016 17:11:52 -0800
Subject: PM / QoS: Export dev_pm_qos_update_user_latency_tolerance

nvme wants a module parameter that overrides the default latency
tolerance.  This makes it easy for nvme to reflect that default in
sysfs.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/qos.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c
index 6a1f2c7e01ea..58fcc758334e 100644
--- a/drivers/base/power/qos.c
+++ b/drivers/base/power/qos.c
@@ -886,6 +886,7 @@ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val)
 	mutex_unlock(&dev_pm_qos_mtx);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance);
 
 /**
  * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace
-- 
cgit v1.2.3


From 34994692216b5af2f05858324c3b4863cbaf41cf Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 30 Nov 2016 13:24:56 +0100
Subject: PM / Domains: Do not print PM domain add error message if
 EPROBE_DEFER

EPROBE_DEFER is not an error, hence printing an error message like

    renesas_irqc e61c0000.interrupt-controller: failed to add to PM domain always-on: -517

may confuse the user.

Suppress the error message in case of EPROBE_DEFER to fix this.

Reported-by: Yoshihiro Shimoda <yoshihiro.shimoda.uh@renesas.com>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Simon Horman <horms+renesas@verge.net.au>
Acked-by: Kevin Hilman <khilman@baylibre.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/domain.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 7b4d41ff9506..46e7f6052c9b 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -2031,8 +2031,9 @@ int genpd_dev_pm_attach(struct device *dev)
 	mutex_unlock(&gpd_list_lock);
 
 	if (ret < 0) {
-		dev_err(dev, "failed to add to PM domain %s: %d",
-			pd->name, ret);
+		if (ret != -EPROBE_DEFER)
+			dev_err(dev, "failed to add to PM domain %s: %d",
+				pd->name, ret);
 		goto out;
 	}
 
-- 
cgit v1.2.3


From 91291d9ad92faa65a56a9a19d658d8049b78d3d4 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Wed, 30 Nov 2016 16:21:25 +0530
Subject: PM / OPP: Pass opp_table to dev_pm_opp_put_regulator()

Joonyoung Shim reported an interesting problem on his ARM octa-core
Odoroid-XU3 platform. During system suspend, dev_pm_opp_put_regulator()
was failing for a struct device for which dev_pm_opp_set_regulator() is
called earlier.

This happened because an earlier call to
dev_pm_opp_of_cpumask_remove_table() function (from cpufreq-dt.c file)
removed all the entries from opp_table->dev_list apart from the last CPU
device in the cpumask of CPUs sharing the OPP.

But both dev_pm_opp_set_regulator() and dev_pm_opp_put_regulator()
routines get CPU device for the first CPU in the cpumask. And so the OPP
core failed to find the OPP table for the struct device.

This patch attempts to fix this problem by returning a pointer to the
opp_table from dev_pm_opp_set_regulator() and using that as the
parameter to dev_pm_opp_put_regulator(). This ensures that the
dev_pm_opp_put_regulator() doesn't fail to find the opp table.

Note that similar design problem also exists with other
dev_pm_opp_put_*() APIs, but those aren't used currently by anyone and
so we don't need to update them for now.

Cc: 4.4+ <stable@vger.kernel.org> # 4.4+
Reported-by: Joonyoung Shim <jy0922.shim@samsung.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
[ Viresh: Wrote commit log and tested on exynos 5250 ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c | 22 ++++++----------------
 drivers/cpufreq/cpufreq-dt.c  | 12 ++++++++----
 include/linux/pm_opp.h        | 11 ++++++-----
 3 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index 4c7c6da7a989..2824d3a5e9f0 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -1316,7 +1316,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
  * that this function is *NOT* called under RCU protection or in contexts where
  * mutex cannot be locked.
  */
-int dev_pm_opp_set_regulator(struct device *dev, const char *name)
+struct opp_table *dev_pm_opp_set_regulator(struct device *dev, const char *name)
 {
 	struct opp_table *opp_table;
 	struct regulator *reg;
@@ -1354,20 +1354,20 @@ int dev_pm_opp_set_regulator(struct device *dev, const char *name)
 	opp_table->regulator = reg;
 
 	mutex_unlock(&opp_table_lock);
-	return 0;
+	return opp_table;
 
 err:
 	_remove_opp_table(opp_table);
 unlock:
 	mutex_unlock(&opp_table_lock);
 
-	return ret;
+	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator);
 
 /**
  * dev_pm_opp_put_regulator() - Releases resources blocked for regulator
- * @dev: Device for which regulator was set.
+ * @opp_table: OPP table returned from dev_pm_opp_set_regulator().
  *
  * Locking: The internal opp_table and opp structures are RCU protected.
  * Hence this function internally uses RCU updater strategy with mutex locks
@@ -1375,22 +1375,12 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator);
  * that this function is *NOT* called under RCU protection or in contexts where
  * mutex cannot be locked.
  */
-void dev_pm_opp_put_regulator(struct device *dev)
+void dev_pm_opp_put_regulator(struct opp_table *opp_table)
 {
-	struct opp_table *opp_table;
-
 	mutex_lock(&opp_table_lock);
 
-	/* Check for existing table for 'dev' first */
-	opp_table = _find_opp_table(dev);
-	if (IS_ERR(opp_table)) {
-		dev_err(dev, "Failed to find opp_table: %ld\n",
-			PTR_ERR(opp_table));
-		goto unlock;
-	}
-
 	if (IS_ERR(opp_table->regulator)) {
-		dev_err(dev, "%s: Doesn't have regulator set\n", __func__);
+		pr_err("%s: Doesn't have regulator set\n", __func__);
 		goto unlock;
 	}
 
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 5c07ae05d69a..4d3ec92cbabf 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -28,6 +28,7 @@
 #include "cpufreq-dt.h"
 
 struct private_data {
+	struct opp_table *opp_table;
 	struct device *cpu_dev;
 	struct thermal_cooling_device *cdev;
 	const char *reg_name;
@@ -143,6 +144,7 @@ static int resources_available(void)
 static int cpufreq_init(struct cpufreq_policy *policy)
 {
 	struct cpufreq_frequency_table *freq_table;
+	struct opp_table *opp_table = NULL;
 	struct private_data *priv;
 	struct device *cpu_dev;
 	struct clk *cpu_clk;
@@ -186,8 +188,9 @@ static int cpufreq_init(struct cpufreq_policy *policy)
 	 */
 	name = find_supply_name(cpu_dev);
 	if (name) {
-		ret = dev_pm_opp_set_regulator(cpu_dev, name);
-		if (ret) {
+		opp_table = dev_pm_opp_set_regulator(cpu_dev, name);
+		if (IS_ERR(opp_table)) {
+			ret = PTR_ERR(opp_table);
 			dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n",
 				policy->cpu, ret);
 			goto out_put_clk;
@@ -237,6 +240,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
 	}
 
 	priv->reg_name = name;
+	priv->opp_table = opp_table;
 
 	ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table);
 	if (ret) {
@@ -285,7 +289,7 @@ out_free_priv:
 out_free_opp:
 	dev_pm_opp_of_cpumask_remove_table(policy->cpus);
 	if (name)
-		dev_pm_opp_put_regulator(cpu_dev);
+		dev_pm_opp_put_regulator(opp_table);
 out_put_clk:
 	clk_put(cpu_clk);
 
@@ -300,7 +304,7 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
 	dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
 	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
 	if (priv->reg_name)
-		dev_pm_opp_put_regulator(priv->cpu_dev);
+		dev_pm_opp_put_regulator(priv->opp_table);
 
 	clk_put(policy->clk);
 	kfree(priv);
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index bca26157f5b6..f6bc76501912 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -19,6 +19,7 @@
 
 struct dev_pm_opp;
 struct device;
+struct opp_table;
 
 enum dev_pm_opp_event {
 	OPP_EVENT_ADD, OPP_EVENT_REMOVE, OPP_EVENT_ENABLE, OPP_EVENT_DISABLE,
@@ -62,8 +63,8 @@ int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
 void dev_pm_opp_put_supported_hw(struct device *dev);
 int dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct device *dev);
-int dev_pm_opp_set_regulator(struct device *dev, const char *name);
-void dev_pm_opp_put_regulator(struct device *dev);
+struct opp_table *dev_pm_opp_set_regulator(struct device *dev, const char *name);
+void dev_pm_opp_put_regulator(struct opp_table *opp_table);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
 int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask);
 int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask);
@@ -170,12 +171,12 @@ static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 
 static inline void dev_pm_opp_put_prop_name(struct device *dev) {}
 
-static inline int dev_pm_opp_set_regulator(struct device *dev, const char *name)
+static inline struct opp_table *dev_pm_opp_set_regulator(struct device *dev, const char *name)
 {
-	return -ENOTSUPP;
+	return ERR_PTR(-ENOTSUPP);
 }
 
-static inline void dev_pm_opp_put_regulator(struct device *dev) {}
+static inline void dev_pm_opp_put_regulator(struct opp_table *opp_table) {}
 
 static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 {
-- 
cgit v1.2.3


From ab83805667603e60177afec241a63139602bbd47 Mon Sep 17 00:00:00 2001
From: Baoyou Xie <baoyou.xie@linaro.org>
Date: Wed, 30 Nov 2016 15:35:29 +0800
Subject: cpufreq: dt: Add support for zx296718

Add the compatible string for supporting the generic cpufreq driver on
the ZTE's zx296718 SoC.

Signed-off-by: Baoyou Xie <baoyou.xie@linaro.org>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq-dt-platdev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c
index 0c73c7dbd587..bc97b6a4b1cf 100644
--- a/drivers/cpufreq/cpufreq-dt-platdev.c
+++ b/drivers/cpufreq/cpufreq-dt-platdev.c
@@ -94,6 +94,8 @@ static const struct of_device_id machines[] __initconst = {
 
 	{ .compatible = "xlnx,zynq-7000", },
 
+	{ .compatible = "zte,zx296718", },
+
 	{ }
 };
 
-- 
cgit v1.2.3


From bd9089162d8002666f734dd18552c7297118fec3 Mon Sep 17 00:00:00 2001
From: Piotr Luc <piotr.luc@intel.com>
Date: Thu, 13 Oct 2016 17:31:04 +0200
Subject: powercap / RAPL: Add Knights Mill CPUID

Add Knights Mill (KNM) to the list of CPUIDs supported by intel_rapl

Signed-off-by: Piotr Luc <piotr.luc@intel.com>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/powercap/intel_rapl.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c
index f6937d0b742e..9a25110c4a46 100644
--- a/drivers/powercap/intel_rapl.c
+++ b/drivers/powercap/intel_rapl.c
@@ -1167,6 +1167,7 @@ static const struct x86_cpu_id rapl_ids[] __initconst = {
 	RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON,	rapl_defaults_core),
 
 	RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL,	rapl_defaults_hsw_server),
+	RAPL_CPU(INTEL_FAM6_XEON_PHI_KNM,	rapl_defaults_hsw_server),
 	{}
 };
 MODULE_DEVICE_TABLE(x86cpu, rapl_ids);
-- 
cgit v1.2.3


From 5e7ec268fd48d63cfd0e3a9be6c6443f01673bd4 Mon Sep 17 00:00:00 2001
From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Date: Tue, 25 Oct 2016 17:11:39 +0300
Subject: x86/intel_idle: Add CPU model 0x4a (Atom Z34xx series)

Add CPU ID for Atom Z34xx processors. Datasheets indicate support for this,
detailed information about potential quirks or limitations are missing, though.
So we just reuse the definition from official BSP code.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/idle/intel_idle.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 4466a2f969d7..5ded9b22b015 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -724,6 +724,50 @@ static struct cpuidle_state atom_cstates[] = {
 	{
 		.enter = NULL }
 };
+static struct cpuidle_state tangier_cstates[] = {
+	{
+		.name = "C1-TNG",
+		.desc = "MWAIT 0x00",
+		.flags = MWAIT2flg(0x00),
+		.exit_latency = 1,
+		.target_residency = 4,
+		.enter = &intel_idle,
+		.enter_freeze = intel_idle_freeze, },
+	{
+		.name = "C4-TNG",
+		.desc = "MWAIT 0x30",
+		.flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 100,
+		.target_residency = 400,
+		.enter = &intel_idle,
+		.enter_freeze = intel_idle_freeze, },
+	{
+		.name = "C6-TNG",
+		.desc = "MWAIT 0x52",
+		.flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 140,
+		.target_residency = 560,
+		.enter = &intel_idle,
+		.enter_freeze = intel_idle_freeze, },
+	{
+		.name = "C7-TNG",
+		.desc = "MWAIT 0x60",
+		.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 1200,
+		.target_residency = 4000,
+		.enter = &intel_idle,
+		.enter_freeze = intel_idle_freeze, },
+	{
+		.name = "C9-TNG",
+		.desc = "MWAIT 0x64",
+		.flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED,
+		.exit_latency = 10000,
+		.target_residency = 20000,
+		.enter = &intel_idle,
+		.enter_freeze = intel_idle_freeze, },
+	{
+		.enter = NULL }
+};
 static struct cpuidle_state avn_cstates[] = {
 	{
 		.name = "C1-AVN",
@@ -978,6 +1022,10 @@ static const struct idle_cpu idle_cpu_atom = {
 	.state_table = atom_cstates,
 };
 
+static const struct idle_cpu idle_cpu_tangier = {
+	.state_table = tangier_cstates,
+};
+
 static const struct idle_cpu idle_cpu_lincroft = {
 	.state_table = atom_cstates,
 	.auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE,
@@ -1066,6 +1114,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	ICPU(INTEL_FAM6_SANDYBRIDGE_X,		idle_cpu_snb),
 	ICPU(INTEL_FAM6_ATOM_CEDARVIEW,		idle_cpu_atom),
 	ICPU(INTEL_FAM6_ATOM_SILVERMONT1,	idle_cpu_byt),
+	ICPU(INTEL_FAM6_ATOM_MERRIFIELD,	idle_cpu_tangier),
 	ICPU(INTEL_FAM6_ATOM_AIRMONT,		idle_cpu_cht),
 	ICPU(INTEL_FAM6_IVYBRIDGE,		idle_cpu_ivb),
 	ICPU(INTEL_FAM6_IVYBRIDGE_X,		idle_cpu_ivt),
-- 
cgit v1.2.3


From a2c1bc645e87346150516b3abf1933ed29d0f48b Mon Sep 17 00:00:00 2001
From: Piotr Luc <piotr.luc@intel.com>
Date: Thu, 13 Oct 2016 17:30:58 +0200
Subject: x86/intel_idle: Add Knights Mill CPUID

Add Knights Mill (KNM) to the list of CPUIDs supported by intel_idle.

Signed-off-by: Piotr Luc <piotr.luc@intel.com>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/idle/intel_idle.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 5ded9b22b015..d6377c16b3f8 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1133,6 +1133,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
 	ICPU(INTEL_FAM6_KABYLAKE_DESKTOP,	idle_cpu_skl),
 	ICPU(INTEL_FAM6_SKYLAKE_X,		idle_cpu_skx),
 	ICPU(INTEL_FAM6_XEON_PHI_KNL,		idle_cpu_knl),
+	ICPU(INTEL_FAM6_XEON_PHI_KNM,		idle_cpu_knl),
 	ICPU(INTEL_FAM6_ATOM_GOLDMONT,		idle_cpu_bxt),
 	ICPU(INTEL_FAM6_ATOM_DENVERTON,		idle_cpu_dnv),
 	{}
-- 
cgit v1.2.3


From 2ed38cbe05e7ea27022e29d75e7b93c20ccc5eaa Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 30 Nov 2016 02:32:17 +0100
Subject: MAINTAINERS: Add bug tracking system location entries for cpuidle

The kernel Bugzilla is used for tracking bugs in the cpuidle core and
intel_idle, so document that.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Len Brown <len.brown@intel.com>
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1cd38a7e0064..08753a5a23be 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3367,6 +3367,7 @@ M:	Daniel Lezcano <daniel.lezcano@linaro.org>
 L:	linux-pm@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git
+B:	https://bugzilla.kernel.org
 F:	drivers/cpuidle/*
 F:	include/linux/cpuidle.h
 
@@ -6271,6 +6272,7 @@ INTEL IDLE DRIVER
 M:	Len Brown <lenb@kernel.org>
 L:	linux-pm@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git
+B:	https://bugzilla.kernel.org
 S:	Supported
 F:	drivers/idle/intel_idle.c
 
-- 
cgit v1.2.3


From 6af33995318fdfb4914fb1c5e67450fdb3d32084 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 30 Nov 2016 02:33:07 +0100
Subject: MAINTAINERS: Add Jacob Pan as a new intel_idle maintainer

The intel_idle driver is going to be maintained by Jacob Pan now, so
update MAINTAINERS accordingly.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Len Brown <len.brown@intel.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 08753a5a23be..0248f13c8adf 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6269,6 +6269,7 @@ S:	Maintained
 F:	drivers/platform/x86/intel-vbtn.c
 
 INTEL IDLE DRIVER
+M:	Jacob Pan <jacob.jun.pan@linux.intel.com>
 M:	Len Brown <lenb@kernel.org>
 L:	linux-pm@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git
-- 
cgit v1.2.3


From 993e5483a471869d0486dcc6c9691e2d41982aaf Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Wed, 30 Nov 2016 02:31:02 +0100
Subject: MAINTAINERS: Add bug tracking system location entry for cpufreq

The kernel Bugzilla is used for tracking cpufreq bugs, so document
that.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Len Brown <len.brown@intel.com>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index d0ca1198d8ab..21dc608fa29d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3336,6 +3336,7 @@ L:	linux-pm@vger.kernel.org
 S:	Maintained
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git
 T:	git git://git.linaro.org/people/vireshk/linux.git (For ARM Updates)
+B:	https://bugzilla.kernel.org
 F:	Documentation/cpu-freq/
 F:	drivers/cpufreq/
 F:	include/linux/cpufreq.h
-- 
cgit v1.2.3


From 58bf454272d9761312792053c706b72a41f5e835 Mon Sep 17 00:00:00 2001
From: Piotr Luc <piotr.luc@intel.com>
Date: Thu, 13 Oct 2016 17:31:00 +0200
Subject: cpufreq: intel_pstate: Add Knights Mill CPUID

Add Knights Mill (KNM) to the list of CPUIDs supported by intel_pstate.

Signed-off-by: Piotr Luc <piotr.luc@intel.com>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Acked-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c20da85d7713..a7f597a476f5 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1575,6 +1575,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = {
 	ICPU(INTEL_FAM6_SKYLAKE_DESKTOP,	core_params),
 	ICPU(INTEL_FAM6_BROADWELL_XEON_D,	core_params),
 	ICPU(INTEL_FAM6_XEON_PHI_KNL,		knl_params),
+	ICPU(INTEL_FAM6_XEON_PHI_KNM,		knl_params),
 	ICPU(INTEL_FAM6_ATOM_GOLDMONT,		bxt_params),
 	{}
 };
-- 
cgit v1.2.3


From 4dd63b49a7b06b6a49841f46d7510962de476c39 Mon Sep 17 00:00:00 2001
From: Chen Yu <yu.c.chen@intel.com>
Date: Mon, 4 Jan 2016 12:14:45 +0800
Subject: cpufreq: ondemand: Set MIN_FREQUENCY_UP_THRESHOLD to 1

Currently the minimal up_threshold is 11, and user may want to
use a smaller minimal up_threshold for performance tuning,
so MIN_FREQUENCY_UP_THRESHOLD could be set to 1 because:

1. Current systems wouldn't be affected as they have already
   a value >= 11.
2. New systems with a default kernel would keep still the default
   value that is >= 11.

Users now have the advantage that they can make their own decisions
and customize the 'trip point' to switch to the max frequency.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=65501
Signed-off-by: Chen Yu <yu.c.chen@intel.com>
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/cpufreq_ondemand.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 1e2bd9851fba..4a017e895296 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -25,7 +25,7 @@
 #define MAX_SAMPLING_DOWN_FACTOR		(100000)
 #define MICRO_FREQUENCY_UP_THRESHOLD		(95)
 #define MICRO_FREQUENCY_MIN_SAMPLE_RATE		(10000)
-#define MIN_FREQUENCY_UP_THRESHOLD		(11)
+#define MIN_FREQUENCY_UP_THRESHOLD		(1)
 #define MAX_FREQUENCY_UP_THRESHOLD		(100)
 
 static struct od_ops od_ops;
-- 
cgit v1.2.3


From 29d7bbada98e0969dd2ea159104aa55052a1c439 Mon Sep 17 00:00:00 2001
From: Anna-Maria Gleixner <anna-maria@linutronix.de>
Date: Sun, 27 Nov 2016 00:13:31 +0100
Subject: intel_idle: Remove superfluous SMP fuction call

Since commit 1cf4f629d9d2 ("cpu/hotplug: Move online calls to
hotplugged cpu") the CPU_ONLINE and CPU_DOWN_PREPARE notifiers are
always run on the hot plugged CPU, and as of commit 3b9d6da67e11
("cpu/hotplug: Fix rollback during error-out in __cpu_disable()") the
CPU_DOWN_FAILED notifier also runs on the hot plugged CPU. This patch
converts the SMP functional calls into direct calls.

smp_function_call_single() executes the function with interrupts
disabled. This calling convention is not preserved, because
tick_broadcast_enable() and tick_braodcast_disable() handle
interrupts themselves.

Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/idle/intel_idle.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index d6377c16b3f8..3828babfb77b 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -971,8 +971,7 @@ static int cpu_hotplug_notify(struct notifier_block *n,
 	case CPU_ONLINE:
 
 		if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE)
-			smp_call_function_single(hotcpu, __setup_broadcast_timer,
-						 (void *)true, 1);
+			__setup_broadcast_timer((void *)true);
 
 		/*
 		 * Some systems can hotplug a cpu at runtime after
-- 
cgit v1.2.3


From fb1013a01673acf7e94e38cda169828ac76b345a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 29 Nov 2016 10:51:43 +0100
Subject: intel_idle: Convert to hotplug state machine

Install the callbacks via the state machine and let the core invoke the
callbacks on the already online CPUs.

The two smp_call_function_single() invocations in intel_idle_cpu_init() have
been removed because intel_idle_cpu_init() is now invoked via the hotplug
callback which runs on the target CPU. The IRQ-off calling convention for
auto_demotion_disable() and c1e_promotion_disable() has not been preserved
because only those two modify the MSR during CPU intialization.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/idle/intel_idle.c | 103 ++++++++++++++++++----------------------------
 1 file changed, 39 insertions(+), 64 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 3828babfb77b..7d8ea3d5fda6 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -98,8 +98,6 @@ static int intel_idle(struct cpuidle_device *dev,
 			struct cpuidle_driver *drv, int index);
 static void intel_idle_freeze(struct cpuidle_device *dev,
 			      struct cpuidle_driver *drv, int index);
-static int intel_idle_cpu_init(int cpu);
-
 static struct cpuidle_state *cpuidle_state_table;
 
 /*
@@ -951,50 +949,15 @@ static void intel_idle_freeze(struct cpuidle_device *dev,
 	mwait_idle_with_hints(eax, ecx);
 }
 
-static void __setup_broadcast_timer(void *arg)
+static void __setup_broadcast_timer(bool on)
 {
-	unsigned long on = (unsigned long)arg;
-
 	if (on)
 		tick_broadcast_enable();
 	else
 		tick_broadcast_disable();
 }
 
-static int cpu_hotplug_notify(struct notifier_block *n,
-			      unsigned long action, void *hcpu)
-{
-	int hotcpu = (unsigned long)hcpu;
-	struct cpuidle_device *dev;
-
-	switch (action & ~CPU_TASKS_FROZEN) {
-	case CPU_ONLINE:
-
-		if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE)
-			__setup_broadcast_timer((void *)true);
-
-		/*
-		 * Some systems can hotplug a cpu at runtime after
-		 * the kernel has booted, we have to initialize the
-		 * driver in this case
-		 */
-		dev = per_cpu_ptr(intel_idle_cpuidle_devices, hotcpu);
-		if (dev->registered)
-			break;
-
-		if (intel_idle_cpu_init(hotcpu))
-			return NOTIFY_BAD;
-
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_hotplug_notifier = {
-	.notifier_call = cpu_hotplug_notify,
-};
-
-static void auto_demotion_disable(void *dummy)
+static void auto_demotion_disable(void)
 {
 	unsigned long long msr_bits;
 
@@ -1002,7 +965,7 @@ static void auto_demotion_disable(void *dummy)
 	msr_bits &= ~(icpu->auto_demotion_disable_flags);
 	wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
 }
-static void c1e_promotion_disable(void *dummy)
+static void c1e_promotion_disable(void)
 {
 	unsigned long long msr_bits;
 
@@ -1422,12 +1385,11 @@ static void __init intel_idle_cpuidle_driver_init(void)
  * allocate, initialize, register cpuidle_devices
  * @cpu: cpu/core to initialize
  */
-static int intel_idle_cpu_init(int cpu)
+static int intel_idle_cpu_init(unsigned int cpu)
 {
 	struct cpuidle_device *dev;
 
 	dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
-
 	dev->cpu = cpu;
 
 	if (cpuidle_register_device(dev)) {
@@ -1436,17 +1398,36 @@ static int intel_idle_cpu_init(int cpu)
 	}
 
 	if (icpu->auto_demotion_disable_flags)
-		smp_call_function_single(cpu, auto_demotion_disable, NULL, 1);
+		auto_demotion_disable();
 
 	if (icpu->disable_promotion_to_c1e)
-		smp_call_function_single(cpu, c1e_promotion_disable, NULL, 1);
+		c1e_promotion_disable();
+
+	return 0;
+}
+
+static int intel_idle_cpu_online(unsigned int cpu)
+{
+	struct cpuidle_device *dev;
+
+	if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE)
+		__setup_broadcast_timer(true);
+
+	/*
+	 * Some systems can hotplug a cpu at runtime after
+	 * the kernel has booted, we have to initialize the
+	 * driver in this case
+	 */
+	dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu);
+	if (!dev->registered)
+		return intel_idle_cpu_init(cpu);
 
 	return 0;
 }
 
 static int __init intel_idle_init(void)
 {
-	int retval, i;
+	int retval;
 
 	/* Do not load intel_idle at all for now if idle= is passed */
 	if (boot_option_idle_override != IDLE_NO_OVERRIDE)
@@ -1466,35 +1447,29 @@ static int __init intel_idle_init(void)
 		struct cpuidle_driver *drv = cpuidle_get_driver();
 		printk(KERN_DEBUG PREFIX "intel_idle yielding to %s",
 			drv ? drv->name : "none");
-		free_percpu(intel_idle_cpuidle_devices);
-		return retval;
+		goto init_driver_fail;
 	}
 
-	cpu_notifier_register_begin();
-
-	for_each_online_cpu(i) {
-		retval = intel_idle_cpu_init(i);
-		if (retval) {
-			intel_idle_cpuidle_devices_uninit();
-			cpu_notifier_register_done();
-			cpuidle_unregister_driver(&intel_idle_driver);
-			free_percpu(intel_idle_cpuidle_devices);
-			return retval;
-		}
-	}
-	__register_cpu_notifier(&cpu_hotplug_notifier);
-
 	if (boot_cpu_has(X86_FEATURE_ARAT))	/* Always Reliable APIC Timer */
 		lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
-	else
-		on_each_cpu(__setup_broadcast_timer, (void *)true, 1);
 
-	cpu_notifier_register_done();
+	retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
+				   intel_idle_cpu_online, NULL);
+	if (retval < 0)
+		goto hp_setup_fail;
 
 	pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n",
 		lapic_timer_reliable_states);
 
 	return 0;
+
+hp_setup_fail:
+	intel_idle_cpuidle_devices_uninit();
+	cpuidle_unregister_driver(&intel_idle_driver);
+init_driver_fail:
+	free_percpu(intel_idle_cpuidle_devices);
+	return retval;
+
 }
 device_initcall(intel_idle_init);
 
-- 
cgit v1.2.3


From b53f40db59b27b62bc294c30506b02a0cae47e0b Mon Sep 17 00:00:00 2001
From: Josh Poimboeuf <jpoimboe@redhat.com>
Date: Fri, 2 Dec 2016 11:42:21 -0600
Subject: x86/suspend: fix false positive KASAN warning on suspend/resume

Resuming from a suspend operation is showing a KASAN false positive
warning:

  BUG: KASAN: stack-out-of-bounds in unwind_get_return_address+0x11d/0x130 at addr ffff8803867d7878
  Read of size 8 by task pm-suspend/7774
  page:ffffea000e19f5c0 count:0 mapcount:0 mapping:          (null) index:0x0
  flags: 0x2ffff0000000000()
  page dumped because: kasan: bad access detected
  CPU: 0 PID: 7774 Comm: pm-suspend Tainted: G    B           4.9.0-rc7+ #8
  Hardware name: Gigabyte Technology Co., Ltd. Z170X-UD5/Z170X-UD5-CF, BIOS F5 03/07/2016
  Call Trace:
    dump_stack+0x63/0x82
    kasan_report_error+0x4b4/0x4e0
    ? acpi_hw_read_port+0xd0/0x1ea
    ? kfree_const+0x22/0x30
    ? acpi_hw_validate_io_request+0x1a6/0x1a6
    __asan_report_load8_noabort+0x61/0x70
    ? unwind_get_return_address+0x11d/0x130
    unwind_get_return_address+0x11d/0x130
    ? unwind_next_frame+0x97/0xf0
    __save_stack_trace+0x92/0x100
    save_stack_trace+0x1b/0x20
    save_stack+0x46/0xd0
    ? save_stack_trace+0x1b/0x20
    ? save_stack+0x46/0xd0
    ? kasan_kmalloc+0xad/0xe0
    ? kasan_slab_alloc+0x12/0x20
    ? acpi_hw_read+0x2b6/0x3aa
    ? acpi_hw_validate_register+0x20b/0x20b
    ? acpi_hw_write_port+0x72/0xc7
    ? acpi_hw_write+0x11f/0x15f
    ? acpi_hw_read_multiple+0x19f/0x19f
    ? memcpy+0x45/0x50
    ? acpi_hw_write_port+0x72/0xc7
    ? acpi_hw_write+0x11f/0x15f
    ? acpi_hw_read_multiple+0x19f/0x19f
    ? kasan_unpoison_shadow+0x36/0x50
    kasan_kmalloc+0xad/0xe0
    kasan_slab_alloc+0x12/0x20
    kmem_cache_alloc_trace+0xbc/0x1e0
    ? acpi_get_sleep_type_data+0x9a/0x578
    acpi_get_sleep_type_data+0x9a/0x578
    acpi_hw_legacy_wake_prep+0x88/0x22c
    ? acpi_hw_legacy_sleep+0x3c7/0x3c7
    ? acpi_write_bit_register+0x28d/0x2d3
    ? acpi_read_bit_register+0x19b/0x19b
    acpi_hw_sleep_dispatch+0xb5/0xba
    acpi_leave_sleep_state_prep+0x17/0x19
    acpi_suspend_enter+0x154/0x1e0
    ? trace_suspend_resume+0xe8/0xe8
    suspend_devices_and_enter+0xb09/0xdb0
    ? printk+0xa8/0xd8
    ? arch_suspend_enable_irqs+0x20/0x20
    ? try_to_freeze_tasks+0x295/0x600
    pm_suspend+0x6c9/0x780
    ? finish_wait+0x1f0/0x1f0
    ? suspend_devices_and_enter+0xdb0/0xdb0
    state_store+0xa2/0x120
    ? kobj_attr_show+0x60/0x60
    kobj_attr_store+0x36/0x70
    sysfs_kf_write+0x131/0x200
    kernfs_fop_write+0x295/0x3f0
    __vfs_write+0xef/0x760
    ? handle_mm_fault+0x1346/0x35e0
    ? do_iter_readv_writev+0x660/0x660
    ? __pmd_alloc+0x310/0x310
    ? do_lock_file_wait+0x1e0/0x1e0
    ? apparmor_file_permission+0x18/0x20
    ? security_file_permission+0x73/0x1c0
    ? rw_verify_area+0xbd/0x2b0
    vfs_write+0x149/0x4a0
    SyS_write+0xd9/0x1c0
    ? SyS_read+0x1c0/0x1c0
    entry_SYSCALL_64_fastpath+0x1e/0xad
  Memory state around the buggy address:
   ffff8803867d7700: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
   ffff8803867d7780: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
  >ffff8803867d7800: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 f4
                                                                  ^
   ffff8803867d7880: f3 f3 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00
   ffff8803867d7900: 00 00 00 f1 f1 f1 f1 04 f4 f4 f4 f3 f3 f3 f3 00

KASAN instrumentation poisons the stack when entering a function and
unpoisons it when exiting the function.  However, in the suspend path,
some functions never return, so their stack never gets unpoisoned,
resulting in stale KASAN shadow data which can cause later false
positive warnings like the one above.

Reported-by: Scott Bauer <scott.bauer@intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Acked-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/kernel/acpi/wakeup_64.S | 9 +++++++++
 mm/kasan/kasan.c                 | 9 ++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 169963f471bb..50b8ed0317a3 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel)
 	movq	pt_regs_r14(%rax), %r14
 	movq	pt_regs_r15(%rax), %r15
 
+#ifdef CONFIG_KASAN
+	/*
+	 * The suspend path may have poisoned some areas deeper in the stack,
+	 * which we now need to unpoison.
+	 */
+	movq	%rsp, %rdi
+	call	kasan_unpoison_task_stack_below
+#endif
+
 	xorl	%eax, %eax
 	addq	$8, %rsp
 	FRAME_END
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 70c009741aab..f7dd9595ef43 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -80,7 +80,14 @@ void kasan_unpoison_task_stack(struct task_struct *task)
 /* Unpoison the stack for the current task beyond a watermark sp value. */
 asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
 {
-	__kasan_unpoison_stack(current, watermark);
+	/*
+	 * Calculate the task stack base address.  Avoid using 'current'
+	 * because this function is called by early resume code which hasn't
+	 * yet set up the percpu register (%gs).
+	 */
+	void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
+
+	kasan_unpoison_shadow(base, watermark - base);
 }
 
 /*
-- 
cgit v1.2.3


From 8f6040cebd2382a5cfb201419d40e7a4a193a412 Mon Sep 17 00:00:00 2001
From: Pan Bian <bianpan2016@163.com>
Date: Sat, 3 Dec 2016 23:02:27 +0800
Subject: cpuidle: fix improper return value on error

In function cpuidle_add_state_sysfs(), variable ret takes the return
value. Its value should be negative on errors. Because ret is reset in
the loop, its value will be 0 during the second and after repeat of the
loop. If kzalloc() returns a NULL pointer then, it will return 0. It may
be better to explicitly assign "-ENOMEM" when the call to kzalloc()
fails.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=188901
Signed-off-by: Pan Bian <bianpan2016@163.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/sysfs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 832a2c3f01ff..c5adc8c9ac43 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -403,8 +403,10 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device)
 	/* state statistics */
 	for (i = 0; i < drv->state_count; i++) {
 		kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL);
-		if (!kobj)
+		if (!kobj) {
+			ret = -ENOMEM;
 			goto error_state;
+		}
 		kobj->state = &drv->states[i];
 		kobj->state_usage = &device->states_usage[i];
 		init_completion(&kobj->kobj_unregister);
-- 
cgit v1.2.3


From 0e7414b7aa8b294fddefbad020798f7c8ebe1622 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 1 Dec 2016 23:31:32 +0100
Subject: cpuidle: Add a kerneldoc comment to cpuidle_use_deepest_state()

Since cpuidle_use_deepest_state() is not static, add a proper
kerneldoc comment to it to document its purpose.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpuidle/cpuidle.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index afc005b917fe..62810ff3b00f 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -97,7 +97,13 @@ static int find_deepest_state(struct cpuidle_driver *drv,
 	return ret;
 }
 
-/* Set the current cpu to use the deepest idle state, override governors */
+/**
+ * cpuidle_use_deepest_state - Set/clear governor override flag.
+ * @enable: New value of the flag.
+ *
+ * Set/unset the current CPU to use the deepest idle state (override governors
+ * going forward if set).
+ */
 void cpuidle_use_deepest_state(bool enable)
 {
 	struct cpuidle_device *dev;
-- 
cgit v1.2.3


From 309fe8d65034d1dd04a37f0e04d360b8764a3384 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 1 Dec 2016 16:28:14 +0530
Subject: PM / OPP: Fix incorrect cpu-supply property in binding

The regulator bindings allow the "<name>-supply" property to define a
single parent supply and not a list of parents.

Fix the wrong example code present in OPP bindings.

While at it also change the compatible string as Rob pointed out earlier
that none of A7 implementation have multiple supplies for the CPU core.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/devicetree/bindings/opp/opp.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt
index ee91cbdd95ee..f0239f68d186 100644
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -386,10 +386,12 @@ Example 4: Handling multiple regulators
 / {
 	cpus {
 		cpu@0 {
-			compatible = "arm,cortex-a7";
+			compatible = "vendor,cpu-type";
 			...
 
-			cpu-supply = <&cpu_supply0>, <&cpu_supply1>, <&cpu_supply2>;
+			vcc0-supply = <&cpu_supply0>;
+			vcc1-supply = <&cpu_supply1>;
+			vcc2-supply = <&cpu_supply2>;
 			operating-points-v2 = <&cpu0_opp_table>;
 		};
 	};
-- 
cgit v1.2.3


From 579c49efdf5a04869b29891e46cde0ed4df5480a Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 1 Dec 2016 16:28:15 +0530
Subject: PM / OPP: Reword binding supporting multiple regulators per device

On certain platforms (like TI), DVFS for a single device (CPU) requires
configuring multiple power supplies.

The OPP bindings already contains binding and example to explain this
case, but it isn't sufficient.

- There is no way for the code parsing these bindings to know which
  voltage values belong to which power supply.

- It is not possible to know the order in which the supplies need to be
  configured while switching OPPs.

This patch clarifies on those details by mentioning that such
information is left for the implementation specific bindings to explain.
They may want to hardcode such details or implement their own properties
to get such information. All implementations using multiple regulators
for their devices must provide a binding document explaining their
implementation.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/devicetree/bindings/opp/opp.txt | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt
index f0239f68d186..9f5ca4457b5f 100644
--- a/Documentation/devicetree/bindings/opp/opp.txt
+++ b/Documentation/devicetree/bindings/opp/opp.txt
@@ -86,8 +86,14 @@ Optional properties:
   Single entry is for target voltage and three entries are for <target min max>
   voltages.
 
-  Entries for multiple regulators must be present in the same order as
-  regulators are specified in device's DT node.
+  Entries for multiple regulators shall be provided in the same field separated
+  by angular brackets <>. The OPP binding doesn't provide any provisions to
+  relate the values to their power supplies or the order in which the supplies
+  need to be configured and that is left for the implementation specific
+  binding.
+
+  Entries for all regulators shall be of the same size, i.e. either all use a
+  single value or triplets.
 
 - opp-microvolt-<name>: Named opp-microvolt property. This is exactly similar to
   the above opp-microvolt property, but allows multiple voltage ranges to be
@@ -104,10 +110,13 @@ Optional properties:
 
   Should only be set if opp-microvolt is set for the OPP.
 
-  Entries for multiple regulators must be present in the same order as
-  regulators are specified in device's DT node. If this property isn't required
-  for few regulators, then this should be marked as zero for them. If it isn't
-  required for any regulator, then this property need not be present.
+  Entries for multiple regulators shall be provided in the same field separated
+  by angular brackets <>. If current values aren't required for a regulator,
+  then it shall be filled with 0. If current values aren't required for any of
+  the regulators, then this field is not required. The OPP binding doesn't
+  provide any provisions to relate the values to their power supplies or the
+  order in which the supplies need to be configured and that is left for the
+  implementation specific binding.
 
 - opp-microamp-<name>: Named opp-microamp property. Similar to
   opp-microvolt-<name> property, but for microamp instead.
-- 
cgit v1.2.3


From dc39d06fcd7a4a82d72eae7b71e94e888b96d29e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 1 Dec 2016 16:28:16 +0530
Subject: PM / OPP: Don't use OPP structure outside of rcu protected section

The OPP structure must not be used out of the rcu protected section.
Cache the values to be used in separate variables instead.

Cc: 4.6+ <stable@vger.kernel.org> # 4.6+
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Tested-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index 2824d3a5e9f0..6441dfda489f 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -584,6 +584,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 	struct clk *clk;
 	unsigned long freq, old_freq;
 	unsigned long u_volt, u_volt_min, u_volt_max;
+	unsigned long old_u_volt, old_u_volt_min, old_u_volt_max;
 	int ret;
 
 	if (unlikely(!target_freq)) {
@@ -633,6 +634,14 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 		return ret;
 	}
 
+	if (IS_ERR(old_opp)) {
+		old_u_volt = 0;
+	} else {
+		old_u_volt = old_opp->u_volt;
+		old_u_volt_min = old_opp->u_volt_min;
+		old_u_volt_max = old_opp->u_volt_max;
+	}
+
 	u_volt = opp->u_volt;
 	u_volt_min = opp->u_volt_min;
 	u_volt_max = opp->u_volt_max;
@@ -677,9 +686,10 @@ restore_freq:
 			__func__, old_freq);
 restore_voltage:
 	/* This shouldn't harm even if the voltages weren't updated earlier */
-	if (!IS_ERR(old_opp))
-		_set_opp_voltage(dev, reg, old_opp->u_volt,
-				 old_opp->u_volt_min, old_opp->u_volt_max);
+	if (old_u_volt) {
+		_set_opp_voltage(dev, reg, old_u_volt, old_u_volt_min,
+				 old_u_volt_max);
+	}
 
 	return ret;
 }
-- 
cgit v1.2.3


From 0f0fe7e01327b3d524787a2e8b7e78f010db2bb8 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 1 Dec 2016 16:28:17 +0530
Subject: PM / OPP: Manage supply's voltage/current in a separate structure

This is a preparatory step for multiple regulator per device support.
Move the voltage/current variables to a new structure.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Dave Gerlach <d-gerlach@ti.com>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c    | 44 +++++++++++++++++++++-------------------
 drivers/base/power/opp/debugfs.c |  8 ++++----
 drivers/base/power/opp/of.c      | 18 ++++++++--------
 drivers/base/power/opp/opp.h     | 11 +++-------
 include/linux/pm_opp.h           | 16 +++++++++++++++
 5 files changed, 55 insertions(+), 42 deletions(-)

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index 6441dfda489f..188048d3d203 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -112,7 +112,7 @@ unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 	if (IS_ERR_OR_NULL(tmp_opp))
 		pr_err("%s: Invalid parameters\n", __func__);
 	else
-		v = tmp_opp->u_volt;
+		v = tmp_opp->supply.u_volt;
 
 	return v;
 }
@@ -246,10 +246,10 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
 		if (!opp->available)
 			continue;
 
-		if (opp->u_volt_min < min_uV)
-			min_uV = opp->u_volt_min;
-		if (opp->u_volt_max > max_uV)
-			max_uV = opp->u_volt_max;
+		if (opp->supply.u_volt_min < min_uV)
+			min_uV = opp->supply.u_volt_min;
+		if (opp->supply.u_volt_max > max_uV)
+			max_uV = opp->supply.u_volt_max;
 	}
 
 	rcu_read_unlock();
@@ -637,14 +637,14 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 	if (IS_ERR(old_opp)) {
 		old_u_volt = 0;
 	} else {
-		old_u_volt = old_opp->u_volt;
-		old_u_volt_min = old_opp->u_volt_min;
-		old_u_volt_max = old_opp->u_volt_max;
+		old_u_volt = old_opp->supply.u_volt;
+		old_u_volt_min = old_opp->supply.u_volt_min;
+		old_u_volt_max = old_opp->supply.u_volt_max;
 	}
 
-	u_volt = opp->u_volt;
-	u_volt_min = opp->u_volt_min;
-	u_volt_max = opp->u_volt_max;
+	u_volt = opp->supply.u_volt;
+	u_volt_min = opp->supply.u_volt_min;
+	u_volt_max = opp->supply.u_volt_max;
 
 	reg = opp_table->regulator;
 
@@ -957,10 +957,11 @@ static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
 	struct regulator *reg = opp_table->regulator;
 
 	if (!IS_ERR(reg) &&
-	    !regulator_is_supported_voltage(reg, opp->u_volt_min,
-					    opp->u_volt_max)) {
+	    !regulator_is_supported_voltage(reg, opp->supply.u_volt_min,
+					    opp->supply.u_volt_max)) {
 		pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
-			__func__, opp->u_volt_min, opp->u_volt_max);
+			__func__, opp->supply.u_volt_min,
+			opp->supply.u_volt_max);
 		return false;
 	}
 
@@ -993,11 +994,12 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
 
 		/* Duplicate OPPs */
 		dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
-			 __func__, opp->rate, opp->u_volt, opp->available,
-			 new_opp->rate, new_opp->u_volt, new_opp->available);
+			 __func__, opp->rate, opp->supply.u_volt,
+			 opp->available, new_opp->rate, new_opp->supply.u_volt,
+			 new_opp->available);
 
-		return opp->available && new_opp->u_volt == opp->u_volt ?
-			0 : -EEXIST;
+		return opp->available &&
+		       new_opp->supply.u_volt == opp->supply.u_volt ? 0 : -EEXIST;
 	}
 
 	new_opp->opp_table = opp_table;
@@ -1064,9 +1066,9 @@ int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt,
 	/* populate the opp table */
 	new_opp->rate = freq;
 	tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
-	new_opp->u_volt = u_volt;
-	new_opp->u_volt_min = u_volt - tol;
-	new_opp->u_volt_max = u_volt + tol;
+	new_opp->supply.u_volt = u_volt;
+	new_opp->supply.u_volt_min = u_volt - tol;
+	new_opp->supply.u_volt_max = u_volt + tol;
 	new_opp->available = true;
 	new_opp->dynamic = dynamic;
 
diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c
index ef1ae6b52042..c897676ca35f 100644
--- a/drivers/base/power/opp/debugfs.c
+++ b/drivers/base/power/opp/debugfs.c
@@ -63,16 +63,16 @@ int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
 	if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
 		return -ENOMEM;
 
-	if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, &opp->u_volt))
+	if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, &opp->supply.u_volt))
 		return -ENOMEM;
 
-	if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, &opp->u_volt_min))
+	if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, &opp->supply.u_volt_min))
 		return -ENOMEM;
 
-	if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, &opp->u_volt_max))
+	if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, &opp->supply.u_volt_max))
 		return -ENOMEM;
 
-	if (!debugfs_create_ulong("u_amp", S_IRUGO, d, &opp->u_amp))
+	if (!debugfs_create_ulong("u_amp", S_IRUGO, d, &opp->supply.u_amp))
 		return -ENOMEM;
 
 	if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
diff --git a/drivers/base/power/opp/of.c b/drivers/base/power/opp/of.c
index 5b3755e49731..bdf409d42126 100644
--- a/drivers/base/power/opp/of.c
+++ b/drivers/base/power/opp/of.c
@@ -148,14 +148,14 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
 		return -EINVAL;
 	}
 
-	opp->u_volt = microvolt[0];
+	opp->supply.u_volt = microvolt[0];
 
 	if (count == 1) {
-		opp->u_volt_min = opp->u_volt;
-		opp->u_volt_max = opp->u_volt;
+		opp->supply.u_volt_min = opp->supply.u_volt;
+		opp->supply.u_volt_max = opp->supply.u_volt;
 	} else {
-		opp->u_volt_min = microvolt[1];
-		opp->u_volt_max = microvolt[2];
+		opp->supply.u_volt_min = microvolt[1];
+		opp->supply.u_volt_max = microvolt[2];
 	}
 
 	/* Search for "opp-microamp-<name>" */
@@ -173,7 +173,7 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
 	}
 
 	if (prop && !of_property_read_u32(opp->np, name, &val))
-		opp->u_amp = val;
+		opp->supply.u_amp = val;
 
 	return 0;
 }
@@ -303,9 +303,9 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np)
 	mutex_unlock(&opp_table_lock);
 
 	pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n",
-		 __func__, new_opp->turbo, new_opp->rate, new_opp->u_volt,
-		 new_opp->u_volt_min, new_opp->u_volt_max,
-		 new_opp->clock_latency_ns);
+		 __func__, new_opp->turbo, new_opp->rate,
+		 new_opp->supply.u_volt, new_opp->supply.u_volt_min,
+		 new_opp->supply.u_volt_max, new_opp->clock_latency_ns);
 
 	/*
 	 * Notify the changes in the availability of the operable
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index 96cd30ac6c1d..8a02516542c2 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -61,10 +61,7 @@ extern struct list_head opp_tables;
  * @turbo:	true if turbo (boost) OPP
  * @suspend:	true if suspend OPP
  * @rate:	Frequency in hertz
- * @u_volt:	Target voltage in microvolts corresponding to this OPP
- * @u_volt_min:	Minimum voltage in microvolts corresponding to this OPP
- * @u_volt_max:	Maximum voltage in microvolts corresponding to this OPP
- * @u_amp:	Maximum current drawn by the device in microamperes
+ * @supply:	Power supply voltage/current values
  * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
  *		frequency from any other OPP's frequency.
  * @opp_table:	points back to the opp_table struct this opp belongs to
@@ -83,10 +80,8 @@ struct dev_pm_opp {
 	bool suspend;
 	unsigned long rate;
 
-	unsigned long u_volt;
-	unsigned long u_volt_min;
-	unsigned long u_volt_max;
-	unsigned long u_amp;
+	struct dev_pm_opp_supply supply;
+
 	unsigned long clock_latency_ns;
 
 	struct opp_table *opp_table;
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index f6bc76501912..824f7268f687 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -25,6 +25,22 @@ enum dev_pm_opp_event {
 	OPP_EVENT_ADD, OPP_EVENT_REMOVE, OPP_EVENT_ENABLE, OPP_EVENT_DISABLE,
 };
 
+/**
+ * struct dev_pm_opp_supply - Power supply voltage/current values
+ * @u_volt:	Target voltage in microvolts corresponding to this OPP
+ * @u_volt_min:	Minimum voltage in microvolts corresponding to this OPP
+ * @u_volt_max:	Maximum voltage in microvolts corresponding to this OPP
+ * @u_amp:	Maximum current drawn by the device in microamperes
+ *
+ * This structure stores the voltage/current values for a single power supply.
+ */
+struct dev_pm_opp_supply {
+	unsigned long u_volt;
+	unsigned long u_volt_min;
+	unsigned long u_volt_max;
+	unsigned long u_amp;
+};
+
 #if defined(CONFIG_PM_OPP)
 
 unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp);
-- 
cgit v1.2.3


From ce31781a7574ac1b11b1b66e0d54c8bab41f56eb Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 1 Dec 2016 16:28:18 +0530
Subject: PM / OPP: Pass struct dev_pm_opp_supply to _set_opp_voltage()

Pass the entire supply structure instead of all of its fields.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Dave Gerlach <d-gerlach@ti.com>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c | 44 +++++++++++++++++--------------------------
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index 188048d3d203..46ad8470c438 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -542,8 +542,7 @@ unlock:
 }
 
 static int _set_opp_voltage(struct device *dev, struct regulator *reg,
-			    unsigned long u_volt, unsigned long u_volt_min,
-			    unsigned long u_volt_max)
+			    struct dev_pm_opp_supply *supply)
 {
 	int ret;
 
@@ -554,14 +553,15 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg,
 		return 0;
 	}
 
-	dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__, u_volt_min,
-		u_volt, u_volt_max);
+	dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__,
+		supply->u_volt_min, supply->u_volt, supply->u_volt_max);
 
-	ret = regulator_set_voltage_triplet(reg, u_volt_min, u_volt,
-					    u_volt_max);
+	ret = regulator_set_voltage_triplet(reg, supply->u_volt_min,
+					    supply->u_volt, supply->u_volt_max);
 	if (ret)
 		dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n",
-			__func__, u_volt_min, u_volt, u_volt_max, ret);
+			__func__, supply->u_volt_min, supply->u_volt,
+			supply->u_volt_max, ret);
 
 	return ret;
 }
@@ -583,8 +583,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 	struct regulator *reg;
 	struct clk *clk;
 	unsigned long freq, old_freq;
-	unsigned long u_volt, u_volt_min, u_volt_max;
-	unsigned long old_u_volt, old_u_volt_min, old_u_volt_max;
+	struct dev_pm_opp_supply old_supply, new_supply;
 	int ret;
 
 	if (unlikely(!target_freq)) {
@@ -634,17 +633,12 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 		return ret;
 	}
 
-	if (IS_ERR(old_opp)) {
-		old_u_volt = 0;
-	} else {
-		old_u_volt = old_opp->supply.u_volt;
-		old_u_volt_min = old_opp->supply.u_volt_min;
-		old_u_volt_max = old_opp->supply.u_volt_max;
-	}
+	if (IS_ERR(old_opp))
+		old_supply.u_volt = 0;
+	else
+		memcpy(&old_supply, &old_opp->supply, sizeof(old_supply));
 
-	u_volt = opp->supply.u_volt;
-	u_volt_min = opp->supply.u_volt_min;
-	u_volt_max = opp->supply.u_volt_max;
+	memcpy(&new_supply, &opp->supply, sizeof(new_supply));
 
 	reg = opp_table->regulator;
 
@@ -652,8 +646,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 
 	/* Scaling up? Scale voltage before frequency */
 	if (freq > old_freq) {
-		ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min,
-				       u_volt_max);
+		ret = _set_opp_voltage(dev, reg, &new_supply);
 		if (ret)
 			goto restore_voltage;
 	}
@@ -672,8 +665,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 
 	/* Scaling down? Scale voltage after frequency */
 	if (freq < old_freq) {
-		ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min,
-				       u_volt_max);
+		ret = _set_opp_voltage(dev, reg, &new_supply);
 		if (ret)
 			goto restore_freq;
 	}
@@ -686,10 +678,8 @@ restore_freq:
 			__func__, old_freq);
 restore_voltage:
 	/* This shouldn't harm even if the voltages weren't updated earlier */
-	if (old_u_volt) {
-		_set_opp_voltage(dev, reg, old_u_volt, old_u_volt_min,
-				 old_u_volt_max);
-	}
+	if (old_supply.u_volt)
+		_set_opp_voltage(dev, reg, &old_supply);
 
 	return ret;
 }
-- 
cgit v1.2.3


From dfbe4678d709e25e0f36e6b6333e2a7a67aefb7e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 1 Dec 2016 16:28:19 +0530
Subject: PM / OPP: Add infrastructure to manage multiple regulators

This patch adds infrastructure to manage multiple regulators and updates
the only user (cpufreq-dt) of dev_pm_opp_set{put}_regulator().

This is preparatory work for adding full support for devices with
multiple regulators.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c    | 248 +++++++++++++++++++++++++++------------
 drivers/base/power/opp/debugfs.c |  52 ++++++--
 drivers/base/power/opp/of.c      | 103 +++++++++++-----
 drivers/base/power/opp/opp.h     |  10 +-
 drivers/cpufreq/cpufreq-dt.c     |   6 +-
 include/linux/pm_opp.h           |   8 +-
 6 files changed, 300 insertions(+), 127 deletions(-)

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index 46ad8470c438..b4da31c5a5eb 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -93,6 +93,8 @@ struct opp_table *_find_opp_table(struct device *dev)
  * Return: voltage in micro volt corresponding to the opp, else
  * return 0
  *
+ * This is useful only for devices with single power supply.
+ *
  * Locking: This function must be called under rcu_read_lock(). opp is a rcu
  * protected pointer. This means that opp which could have been fetched by
  * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are
@@ -112,7 +114,7 @@ unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp)
 	if (IS_ERR_OR_NULL(tmp_opp))
 		pr_err("%s: Invalid parameters\n", __func__);
 	else
-		v = tmp_opp->supply.u_volt;
+		v = tmp_opp->supplies[0].u_volt;
 
 	return v;
 }
@@ -210,6 +212,24 @@ unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency);
 
+static int _get_regulator_count(struct device *dev)
+{
+	struct opp_table *opp_table;
+	int count;
+
+	rcu_read_lock();
+
+	opp_table = _find_opp_table(dev);
+	if (!IS_ERR(opp_table))
+		count = opp_table->regulator_count;
+	else
+		count = 0;
+
+	rcu_read_unlock();
+
+	return count;
+}
+
 /**
  * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds
  * @dev: device for which we do this operation
@@ -222,34 +242,51 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
 {
 	struct opp_table *opp_table;
 	struct dev_pm_opp *opp;
-	struct regulator *reg;
+	struct regulator *reg, **regulators;
 	unsigned long latency_ns = 0;
-	unsigned long min_uV = ~0, max_uV = 0;
-	int ret;
+	int ret, i, count;
+	struct {
+		unsigned long min;
+		unsigned long max;
+	} *uV;
+
+	count = _get_regulator_count(dev);
+
+	/* Regulator may not be required for the device */
+	if (!count)
+		return 0;
+
+	regulators = kmalloc_array(count, sizeof(*regulators), GFP_KERNEL);
+	if (!regulators)
+		return 0;
+
+	uV = kmalloc_array(count, sizeof(*uV), GFP_KERNEL);
+	if (!uV)
+		goto free_regulators;
 
 	rcu_read_lock();
 
 	opp_table = _find_opp_table(dev);
 	if (IS_ERR(opp_table)) {
 		rcu_read_unlock();
-		return 0;
+		goto free_uV;
 	}
 
-	reg = opp_table->regulator;
-	if (IS_ERR(reg)) {
-		/* Regulator may not be required for device */
-		rcu_read_unlock();
-		return 0;
-	}
+	memcpy(regulators, opp_table->regulators, count * sizeof(*regulators));
 
-	list_for_each_entry_rcu(opp, &opp_table->opp_list, node) {
-		if (!opp->available)
-			continue;
+	for (i = 0; i < count; i++) {
+		uV[i].min = ~0;
+		uV[i].max = 0;
 
-		if (opp->supply.u_volt_min < min_uV)
-			min_uV = opp->supply.u_volt_min;
-		if (opp->supply.u_volt_max > max_uV)
-			max_uV = opp->supply.u_volt_max;
+		list_for_each_entry_rcu(opp, &opp_table->opp_list, node) {
+			if (!opp->available)
+				continue;
+
+			if (opp->supplies[i].u_volt_min < uV[i].min)
+				uV[i].min = opp->supplies[i].u_volt_min;
+			if (opp->supplies[i].u_volt_max > uV[i].max)
+				uV[i].max = opp->supplies[i].u_volt_max;
+		}
 	}
 
 	rcu_read_unlock();
@@ -258,9 +295,16 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev)
 	 * The caller needs to ensure that opp_table (and hence the regulator)
 	 * isn't freed, while we are executing this routine.
 	 */
-	ret = regulator_set_voltage_time(reg, min_uV, max_uV);
-	if (ret > 0)
-		latency_ns = ret * 1000;
+	for (i = 0; reg = regulators[i], i < count; i++) {
+		ret = regulator_set_voltage_time(reg, uV[i].min, uV[i].max);
+		if (ret > 0)
+			latency_ns += ret * 1000;
+	}
+
+free_uV:
+	kfree(uV);
+free_regulators:
+	kfree(regulators);
 
 	return latency_ns;
 }
@@ -580,7 +624,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 {
 	struct opp_table *opp_table;
 	struct dev_pm_opp *old_opp, *opp;
-	struct regulator *reg;
+	struct regulator *reg = ERR_PTR(-ENXIO);
 	struct clk *clk;
 	unsigned long freq, old_freq;
 	struct dev_pm_opp_supply old_supply, new_supply;
@@ -633,14 +677,23 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 		return ret;
 	}
 
+	if (opp_table->regulators) {
+		/* This function only supports single regulator per device */
+		if (WARN_ON(opp_table->regulator_count > 1)) {
+			dev_err(dev, "multiple regulators not supported\n");
+			rcu_read_unlock();
+			return -EINVAL;
+		}
+
+		reg = opp_table->regulators[0];
+	}
+
 	if (IS_ERR(old_opp))
 		old_supply.u_volt = 0;
 	else
-		memcpy(&old_supply, &old_opp->supply, sizeof(old_supply));
-
-	memcpy(&new_supply, &opp->supply, sizeof(new_supply));
+		memcpy(&old_supply, old_opp->supplies, sizeof(old_supply));
 
-	reg = opp_table->regulator;
+	memcpy(&new_supply, opp->supplies, sizeof(new_supply));
 
 	rcu_read_unlock();
 
@@ -764,9 +817,6 @@ static struct opp_table *_add_opp_table(struct device *dev)
 
 	_of_init_opp_table(opp_table, dev);
 
-	/* Set regulator to a non-NULL error value */
-	opp_table->regulator = ERR_PTR(-ENXIO);
-
 	/* Find clk for the device */
 	opp_table->clk = clk_get(dev, NULL);
 	if (IS_ERR(opp_table->clk)) {
@@ -815,7 +865,7 @@ static void _remove_opp_table(struct opp_table *opp_table)
 	if (opp_table->prop_name)
 		return;
 
-	if (!IS_ERR(opp_table->regulator))
+	if (opp_table->regulators)
 		return;
 
 	/* Release clk */
@@ -924,35 +974,50 @@ struct dev_pm_opp *_allocate_opp(struct device *dev,
 				 struct opp_table **opp_table)
 {
 	struct dev_pm_opp *opp;
+	int count, supply_size;
+	struct opp_table *table;
 
-	/* allocate new OPP node */
-	opp = kzalloc(sizeof(*opp), GFP_KERNEL);
-	if (!opp)
+	table = _add_opp_table(dev);
+	if (!table)
 		return NULL;
 
-	INIT_LIST_HEAD(&opp->node);
+	/* Allocate space for at least one supply */
+	count = table->regulator_count ? table->regulator_count : 1;
+	supply_size = sizeof(*opp->supplies) * count;
 
-	*opp_table = _add_opp_table(dev);
-	if (!*opp_table) {
-		kfree(opp);
+	/* allocate new OPP node and supplies structures */
+	opp = kzalloc(sizeof(*opp) + supply_size, GFP_KERNEL);
+	if (!opp) {
+		kfree(table);
 		return NULL;
 	}
 
+	/* Put the supplies at the end of the OPP structure as an empty array */
+	opp->supplies = (struct dev_pm_opp_supply *)(opp + 1);
+	INIT_LIST_HEAD(&opp->node);
+
+	*opp_table = table;
+
 	return opp;
 }
 
 static bool _opp_supported_by_regulators(struct dev_pm_opp *opp,
 					 struct opp_table *opp_table)
 {
-	struct regulator *reg = opp_table->regulator;
-
-	if (!IS_ERR(reg) &&
-	    !regulator_is_supported_voltage(reg, opp->supply.u_volt_min,
-					    opp->supply.u_volt_max)) {
-		pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
-			__func__, opp->supply.u_volt_min,
-			opp->supply.u_volt_max);
-		return false;
+	struct regulator *reg;
+	int i;
+
+	for (i = 0; i < opp_table->regulator_count; i++) {
+		reg = opp_table->regulators[i];
+
+		if (!regulator_is_supported_voltage(reg,
+					opp->supplies[i].u_volt_min,
+					opp->supplies[i].u_volt_max)) {
+			pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n",
+				__func__, opp->supplies[i].u_volt_min,
+				opp->supplies[i].u_volt_max);
+			return false;
+		}
 	}
 
 	return true;
@@ -984,12 +1049,13 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp,
 
 		/* Duplicate OPPs */
 		dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n",
-			 __func__, opp->rate, opp->supply.u_volt,
-			 opp->available, new_opp->rate, new_opp->supply.u_volt,
-			 new_opp->available);
+			 __func__, opp->rate, opp->supplies[0].u_volt,
+			 opp->available, new_opp->rate,
+			 new_opp->supplies[0].u_volt, new_opp->available);
 
+		/* Should we compare voltages for all regulators here ? */
 		return opp->available &&
-		       new_opp->supply.u_volt == opp->supply.u_volt ? 0 : -EEXIST;
+		       new_opp->supplies[0].u_volt == opp->supplies[0].u_volt ? 0 : -EEXIST;
 	}
 
 	new_opp->opp_table = opp_table;
@@ -1056,9 +1122,9 @@ int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt,
 	/* populate the opp table */
 	new_opp->rate = freq;
 	tol = u_volt * opp_table->voltage_tolerance_v1 / 100;
-	new_opp->supply.u_volt = u_volt;
-	new_opp->supply.u_volt_min = u_volt - tol;
-	new_opp->supply.u_volt_max = u_volt + tol;
+	new_opp->supplies[0].u_volt = u_volt;
+	new_opp->supplies[0].u_volt_min = u_volt - tol;
+	new_opp->supplies[0].u_volt_max = u_volt + tol;
 	new_opp->available = true;
 	new_opp->dynamic = dynamic;
 
@@ -1303,12 +1369,14 @@ unlock:
 EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
 
 /**
- * dev_pm_opp_set_regulator() - Set regulator name for the device
+ * dev_pm_opp_set_regulators() - Set regulator names for the device
  * @dev: Device for which regulator name is being set.
- * @name: Name of the regulator.
+ * @names: Array of pointers to the names of the regulator.
+ * @count: Number of regulators.
  *
  * In order to support OPP switching, OPP layer needs to know the name of the
- * device's regulator, as the core would be required to switch voltages as well.
+ * device's regulators, as the core would be required to switch voltages as
+ * well.
  *
  * This must be called before any OPPs are initialized for the device.
  *
@@ -1318,11 +1386,13 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
  * that this function is *NOT* called under RCU protection or in contexts where
  * mutex cannot be locked.
  */
-struct opp_table *dev_pm_opp_set_regulator(struct device *dev, const char *name)
+struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
+					    const char * const names[],
+					    unsigned int count)
 {
 	struct opp_table *opp_table;
 	struct regulator *reg;
-	int ret;
+	int ret, i;
 
 	mutex_lock(&opp_table_lock);
 
@@ -1338,26 +1408,44 @@ struct opp_table *dev_pm_opp_set_regulator(struct device *dev, const char *name)
 		goto err;
 	}
 
-	/* Already have a regulator set */
-	if (WARN_ON(!IS_ERR(opp_table->regulator))) {
+	/* Already have regulators set */
+	if (WARN_ON(opp_table->regulators)) {
 		ret = -EBUSY;
 		goto err;
 	}
-	/* Allocate the regulator */
-	reg = regulator_get_optional(dev, name);
-	if (IS_ERR(reg)) {
-		ret = PTR_ERR(reg);
-		if (ret != -EPROBE_DEFER)
-			dev_err(dev, "%s: no regulator (%s) found: %d\n",
-				__func__, name, ret);
+
+	opp_table->regulators = kmalloc_array(count,
+					      sizeof(*opp_table->regulators),
+					      GFP_KERNEL);
+	if (!opp_table->regulators) {
+		ret = -ENOMEM;
 		goto err;
 	}
 
-	opp_table->regulator = reg;
+	for (i = 0; i < count; i++) {
+		reg = regulator_get_optional(dev, names[i]);
+		if (IS_ERR(reg)) {
+			ret = PTR_ERR(reg);
+			if (ret != -EPROBE_DEFER)
+				dev_err(dev, "%s: no regulator (%s) found: %d\n",
+					__func__, names[i], ret);
+			goto free_regulators;
+		}
+
+		opp_table->regulators[i] = reg;
+	}
+
+	opp_table->regulator_count = count;
 
 	mutex_unlock(&opp_table_lock);
 	return opp_table;
 
+free_regulators:
+	while (i != 0)
+		regulator_put(opp_table->regulators[--i]);
+
+	kfree(opp_table->regulators);
+	opp_table->regulators = NULL;
 err:
 	_remove_opp_table(opp_table);
 unlock:
@@ -1365,11 +1453,11 @@ unlock:
 
 	return ERR_PTR(ret);
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator);
+EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulators);
 
 /**
- * dev_pm_opp_put_regulator() - Releases resources blocked for regulator
- * @opp_table: OPP table returned from dev_pm_opp_set_regulator().
+ * dev_pm_opp_put_regulators() - Releases resources blocked for regulator
+ * @opp_table: OPP table returned from dev_pm_opp_set_regulators().
  *
  * Locking: The internal opp_table and opp structures are RCU protected.
  * Hence this function internally uses RCU updater strategy with mutex locks
@@ -1377,20 +1465,26 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator);
  * that this function is *NOT* called under RCU protection or in contexts where
  * mutex cannot be locked.
  */
-void dev_pm_opp_put_regulator(struct opp_table *opp_table)
+void dev_pm_opp_put_regulators(struct opp_table *opp_table)
 {
+	int i;
+
 	mutex_lock(&opp_table_lock);
 
-	if (IS_ERR(opp_table->regulator)) {
-		pr_err("%s: Doesn't have regulator set\n", __func__);
+	if (!opp_table->regulators) {
+		pr_err("%s: Doesn't have regulators set\n", __func__);
 		goto unlock;
 	}
 
 	/* Make sure there are no concurrent readers while updating opp_table */
 	WARN_ON(!list_empty(&opp_table->opp_list));
 
-	regulator_put(opp_table->regulator);
-	opp_table->regulator = ERR_PTR(-ENXIO);
+	for (i = opp_table->regulator_count - 1; i >= 0; i--)
+		regulator_put(opp_table->regulators[i]);
+
+	kfree(opp_table->regulators);
+	opp_table->regulators = NULL;
+	opp_table->regulator_count = 0;
 
 	/* Try freeing opp_table if this was the last blocking resource */
 	_remove_opp_table(opp_table);
@@ -1398,7 +1492,7 @@ void dev_pm_opp_put_regulator(struct opp_table *opp_table)
 unlock:
 	mutex_unlock(&opp_table_lock);
 }
-EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulator);
+EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
 
 /**
  * dev_pm_opp_add()  - Add an OPP table from a table definitions
diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c
index c897676ca35f..95f433db4ac7 100644
--- a/drivers/base/power/opp/debugfs.c
+++ b/drivers/base/power/opp/debugfs.c
@@ -15,6 +15,7 @@
 #include <linux/err.h>
 #include <linux/init.h>
 #include <linux/limits.h>
+#include <linux/slab.h>
 
 #include "opp.h"
 
@@ -34,6 +35,46 @@ void opp_debug_remove_one(struct dev_pm_opp *opp)
 	debugfs_remove_recursive(opp->dentry);
 }
 
+static bool opp_debug_create_supplies(struct dev_pm_opp *opp,
+				      struct opp_table *opp_table,
+				      struct dentry *pdentry)
+{
+	struct dentry *d;
+	int i = 0;
+	char *name;
+
+	/* Always create at least supply-0 directory */
+	do {
+		name = kasprintf(GFP_KERNEL, "supply-%d", i);
+
+		/* Create per-opp directory */
+		d = debugfs_create_dir(name, pdentry);
+
+		kfree(name);
+
+		if (!d)
+			return false;
+
+		if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d,
+					  &opp->supplies[i].u_volt))
+			return false;
+
+		if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d,
+					  &opp->supplies[i].u_volt_min))
+			return false;
+
+		if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d,
+					  &opp->supplies[i].u_volt_max))
+			return false;
+
+		if (!debugfs_create_ulong("u_amp", S_IRUGO, d,
+					  &opp->supplies[i].u_amp))
+			return false;
+	} while (++i < opp_table->regulator_count);
+
+	return true;
+}
+
 int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
 {
 	struct dentry *pdentry = opp_table->dentry;
@@ -63,16 +104,7 @@ int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table)
 	if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate))
 		return -ENOMEM;
 
-	if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, &opp->supply.u_volt))
-		return -ENOMEM;
-
-	if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, &opp->supply.u_volt_min))
-		return -ENOMEM;
-
-	if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, &opp->supply.u_volt_max))
-		return -ENOMEM;
-
-	if (!debugfs_create_ulong("u_amp", S_IRUGO, d, &opp->supply.u_amp))
+	if (!opp_debug_create_supplies(opp, opp_table, d))
 		return -ENOMEM;
 
 	if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d,
diff --git a/drivers/base/power/opp/of.c b/drivers/base/power/opp/of.c
index bdf409d42126..3f7d2591b173 100644
--- a/drivers/base/power/opp/of.c
+++ b/drivers/base/power/opp/of.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/device.h>
 #include <linux/of.h>
+#include <linux/slab.h>
 #include <linux/export.h>
 
 #include "opp.h"
@@ -101,16 +102,16 @@ static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table,
 	return true;
 }
 
-/* TODO: Support multiple regulators */
 static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
 			      struct opp_table *opp_table)
 {
-	u32 microvolt[3] = {0};
-	u32 val;
-	int count, ret;
+	u32 *microvolt, *microamp = NULL;
+	int supplies, vcount, icount, ret, i, j;
 	struct property *prop = NULL;
 	char name[NAME_MAX];
 
+	supplies = opp_table->regulator_count ? opp_table->regulator_count : 1;
+
 	/* Search for "opp-microvolt-<name>" */
 	if (opp_table->prop_name) {
 		snprintf(name, sizeof(name), "opp-microvolt-%s",
@@ -128,34 +129,29 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
 			return 0;
 	}
 
-	count = of_property_count_u32_elems(opp->np, name);
-	if (count < 0) {
+	vcount = of_property_count_u32_elems(opp->np, name);
+	if (vcount < 0) {
 		dev_err(dev, "%s: Invalid %s property (%d)\n",
-			__func__, name, count);
-		return count;
+			__func__, name, vcount);
+		return vcount;
 	}
 
-	/* There can be one or three elements here */
-	if (count != 1 && count != 3) {
-		dev_err(dev, "%s: Invalid number of elements in %s property (%d)\n",
-			__func__, name, count);
+	/* There can be one or three elements per supply */
+	if (vcount != supplies && vcount != supplies * 3) {
+		dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
+			__func__, name, vcount, supplies);
 		return -EINVAL;
 	}
 
-	ret = of_property_read_u32_array(opp->np, name, microvolt, count);
+	microvolt = kmalloc_array(vcount, sizeof(*microvolt), GFP_KERNEL);
+	if (!microvolt)
+		return -ENOMEM;
+
+	ret = of_property_read_u32_array(opp->np, name, microvolt, vcount);
 	if (ret) {
 		dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret);
-		return -EINVAL;
-	}
-
-	opp->supply.u_volt = microvolt[0];
-
-	if (count == 1) {
-		opp->supply.u_volt_min = opp->supply.u_volt;
-		opp->supply.u_volt_max = opp->supply.u_volt;
-	} else {
-		opp->supply.u_volt_min = microvolt[1];
-		opp->supply.u_volt_max = microvolt[2];
+		ret = -EINVAL;
+		goto free_microvolt;
 	}
 
 	/* Search for "opp-microamp-<name>" */
@@ -172,10 +168,59 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev,
 		prop = of_find_property(opp->np, name, NULL);
 	}
 
-	if (prop && !of_property_read_u32(opp->np, name, &val))
-		opp->supply.u_amp = val;
+	if (prop) {
+		icount = of_property_count_u32_elems(opp->np, name);
+		if (icount < 0) {
+			dev_err(dev, "%s: Invalid %s property (%d)\n", __func__,
+				name, icount);
+			ret = icount;
+			goto free_microvolt;
+		}
 
-	return 0;
+		if (icount != supplies) {
+			dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n",
+				__func__, name, icount, supplies);
+			ret = -EINVAL;
+			goto free_microvolt;
+		}
+
+		microamp = kmalloc_array(icount, sizeof(*microamp), GFP_KERNEL);
+		if (!microamp) {
+			ret = -EINVAL;
+			goto free_microvolt;
+		}
+
+		ret = of_property_read_u32_array(opp->np, name, microamp,
+						 icount);
+		if (ret) {
+			dev_err(dev, "%s: error parsing %s: %d\n", __func__,
+				name, ret);
+			ret = -EINVAL;
+			goto free_microamp;
+		}
+	}
+
+	for (i = 0, j = 0; i < supplies; i++) {
+		opp->supplies[i].u_volt = microvolt[j++];
+
+		if (vcount == supplies) {
+			opp->supplies[i].u_volt_min = opp->supplies[i].u_volt;
+			opp->supplies[i].u_volt_max = opp->supplies[i].u_volt;
+		} else {
+			opp->supplies[i].u_volt_min = microvolt[j++];
+			opp->supplies[i].u_volt_max = microvolt[j++];
+		}
+
+		if (microamp)
+			opp->supplies[i].u_amp = microamp[i];
+	}
+
+free_microamp:
+	kfree(microamp);
+free_microvolt:
+	kfree(microvolt);
+
+	return ret;
 }
 
 /**
@@ -304,8 +349,8 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np)
 
 	pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n",
 		 __func__, new_opp->turbo, new_opp->rate,
-		 new_opp->supply.u_volt, new_opp->supply.u_volt_min,
-		 new_opp->supply.u_volt_max, new_opp->clock_latency_ns);
+		 new_opp->supplies[0].u_volt, new_opp->supplies[0].u_volt_min,
+		 new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns);
 
 	/*
 	 * Notify the changes in the availability of the operable
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index 8a02516542c2..5b0f7e53bede 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -61,7 +61,7 @@ extern struct list_head opp_tables;
  * @turbo:	true if turbo (boost) OPP
  * @suspend:	true if suspend OPP
  * @rate:	Frequency in hertz
- * @supply:	Power supply voltage/current values
+ * @supplies:	Power supplies voltage/current values
  * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's
  *		frequency from any other OPP's frequency.
  * @opp_table:	points back to the opp_table struct this opp belongs to
@@ -80,7 +80,7 @@ struct dev_pm_opp {
 	bool suspend;
 	unsigned long rate;
 
-	struct dev_pm_opp_supply supply;
+	struct dev_pm_opp_supply *supplies;
 
 	unsigned long clock_latency_ns;
 
@@ -139,7 +139,8 @@ enum opp_table_access {
  * @supported_hw_count: Number of elements in supported_hw array.
  * @prop_name: A name to postfix to many DT properties, while parsing them.
  * @clk: Device's clock handle
- * @regulator: Supply regulator
+ * @regulators: Supply regulators
+ * @regulator_count: Number of power supply regulators
  * @dentry:	debugfs dentry pointer of the real device directory (not links).
  * @dentry_name: Name of the real dentry.
  *
@@ -174,7 +175,8 @@ struct opp_table {
 	unsigned int supported_hw_count;
 	const char *prop_name;
 	struct clk *clk;
-	struct regulator *regulator;
+	struct regulator **regulators;
+	unsigned int regulator_count;
 
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dentry;
diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c
index 4d3ec92cbabf..269013311e79 100644
--- a/drivers/cpufreq/cpufreq-dt.c
+++ b/drivers/cpufreq/cpufreq-dt.c
@@ -188,7 +188,7 @@ static int cpufreq_init(struct cpufreq_policy *policy)
 	 */
 	name = find_supply_name(cpu_dev);
 	if (name) {
-		opp_table = dev_pm_opp_set_regulator(cpu_dev, name);
+		opp_table = dev_pm_opp_set_regulators(cpu_dev, &name, 1);
 		if (IS_ERR(opp_table)) {
 			ret = PTR_ERR(opp_table);
 			dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n",
@@ -289,7 +289,7 @@ out_free_priv:
 out_free_opp:
 	dev_pm_opp_of_cpumask_remove_table(policy->cpus);
 	if (name)
-		dev_pm_opp_put_regulator(opp_table);
+		dev_pm_opp_put_regulators(opp_table);
 out_put_clk:
 	clk_put(cpu_clk);
 
@@ -304,7 +304,7 @@ static int cpufreq_exit(struct cpufreq_policy *policy)
 	dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table);
 	dev_pm_opp_of_cpumask_remove_table(policy->related_cpus);
 	if (priv->reg_name)
-		dev_pm_opp_put_regulator(priv->opp_table);
+		dev_pm_opp_put_regulators(priv->opp_table);
 
 	clk_put(policy->clk);
 	kfree(priv);
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 824f7268f687..9a825ae78653 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -79,8 +79,8 @@ int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions,
 void dev_pm_opp_put_supported_hw(struct device *dev);
 int dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct device *dev);
-struct opp_table *dev_pm_opp_set_regulator(struct device *dev, const char *name);
-void dev_pm_opp_put_regulator(struct opp_table *opp_table);
+struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
+void dev_pm_opp_put_regulators(struct opp_table *opp_table);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
 int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask);
 int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask);
@@ -187,12 +187,12 @@ static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 
 static inline void dev_pm_opp_put_prop_name(struct device *dev) {}
 
-static inline struct opp_table *dev_pm_opp_set_regulator(struct device *dev, const char *name)
+static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count)
 {
 	return ERR_PTR(-ENOTSUPP);
 }
 
-static inline void dev_pm_opp_put_regulator(struct opp_table *opp_table) {}
+static inline void dev_pm_opp_put_regulators(struct opp_table *opp_table) {}
 
 static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 {
-- 
cgit v1.2.3


From 947355850fcb3bb6549294316667d0f53bc03082 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 1 Dec 2016 16:28:20 +0530
Subject: PM / OPP: Separate out _generic_set_opp()

Later patches would add support for custom set_opp() callbacks. This
patch separates out the code for _generic_set_opp() handler in order to
prepare for that.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Dave Gerlach <d-gerlach@ti.com>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c | 181 +++++++++++++++++++++++++++++-------------
 drivers/base/power/opp/opp.h  |   3 +
 include/linux/pm_opp.h        |  35 ++++++++
 3 files changed, 166 insertions(+), 53 deletions(-)

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index b4da31c5a5eb..e33198ce41b4 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -610,6 +610,69 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg,
 	return ret;
 }
 
+static inline int
+_generic_set_opp_clk_only(struct device *dev, struct clk *clk,
+			  unsigned long old_freq, unsigned long freq)
+{
+	int ret;
+
+	ret = clk_set_rate(clk, freq);
+	if (ret) {
+		dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
+			ret);
+	}
+
+	return ret;
+}
+
+static int _generic_set_opp(struct dev_pm_set_opp_data *data)
+{
+	struct dev_pm_opp_supply *old_supply = data->old_opp.supplies;
+	struct dev_pm_opp_supply *new_supply = data->new_opp.supplies;
+	unsigned long old_freq = data->old_opp.rate, freq = data->new_opp.rate;
+	struct regulator *reg = data->regulators[0];
+	struct device *dev= data->dev;
+	int ret;
+
+	/* This function only supports single regulator per device */
+	if (WARN_ON(data->regulator_count > 1)) {
+		dev_err(dev, "multiple regulators are not supported\n");
+		return -EINVAL;
+	}
+
+	/* Scaling up? Scale voltage before frequency */
+	if (freq > old_freq) {
+		ret = _set_opp_voltage(dev, reg, new_supply);
+		if (ret)
+			goto restore_voltage;
+	}
+
+	/* Change frequency */
+	ret = _generic_set_opp_clk_only(dev, data->clk, old_freq, freq);
+	if (ret)
+		goto restore_voltage;
+
+	/* Scaling down? Scale voltage after frequency */
+	if (freq < old_freq) {
+		ret = _set_opp_voltage(dev, reg, new_supply);
+		if (ret)
+			goto restore_freq;
+	}
+
+	return 0;
+
+restore_freq:
+	if (_generic_set_opp_clk_only(dev, data->clk, freq, old_freq))
+		dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
+			__func__, old_freq);
+restore_voltage:
+	/* This shouldn't harm even if the voltages weren't updated earlier */
+	if (old_supply->u_volt)
+		_set_opp_voltage(dev, reg, old_supply);
+
+	return ret;
+}
+
 /**
  * dev_pm_opp_set_rate() - Configure new OPP based on frequency
  * @dev:	 device for which we do this operation
@@ -623,12 +686,12 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg,
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 {
 	struct opp_table *opp_table;
+	unsigned long freq, old_freq;
 	struct dev_pm_opp *old_opp, *opp;
-	struct regulator *reg = ERR_PTR(-ENXIO);
+	struct regulator **regulators;
+	struct dev_pm_set_opp_data *data;
 	struct clk *clk;
-	unsigned long freq, old_freq;
-	struct dev_pm_opp_supply old_supply, new_supply;
-	int ret;
+	int ret, size;
 
 	if (unlikely(!target_freq)) {
 		dev_err(dev, "%s: Invalid target frequency %lu\n", __func__,
@@ -677,64 +740,36 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 		return ret;
 	}
 
-	if (opp_table->regulators) {
-		/* This function only supports single regulator per device */
-		if (WARN_ON(opp_table->regulator_count > 1)) {
-			dev_err(dev, "multiple regulators not supported\n");
-			rcu_read_unlock();
-			return -EINVAL;
-		}
+	dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", __func__,
+		old_freq, freq);
 
-		reg = opp_table->regulators[0];
+	regulators = opp_table->regulators;
+
+	/* Only frequency scaling */
+	if (!regulators) {
+		rcu_read_unlock();
+		return _generic_set_opp_clk_only(dev, clk, old_freq, freq);
 	}
 
+	data = opp_table->set_opp_data;
+	data->regulators = regulators;
+	data->regulator_count = opp_table->regulator_count;
+	data->clk = clk;
+	data->dev = dev;
+
+	data->old_opp.rate = old_freq;
+	size = sizeof(*opp->supplies) * opp_table->regulator_count;
 	if (IS_ERR(old_opp))
-		old_supply.u_volt = 0;
+		memset(data->old_opp.supplies, 0, size);
 	else
-		memcpy(&old_supply, old_opp->supplies, sizeof(old_supply));
+		memcpy(data->old_opp.supplies, old_opp->supplies, size);
 
-	memcpy(&new_supply, opp->supplies, sizeof(new_supply));
+	data->new_opp.rate = freq;
+	memcpy(data->new_opp.supplies, opp->supplies, size);
 
 	rcu_read_unlock();
 
-	/* Scaling up? Scale voltage before frequency */
-	if (freq > old_freq) {
-		ret = _set_opp_voltage(dev, reg, &new_supply);
-		if (ret)
-			goto restore_voltage;
-	}
-
-	/* Change frequency */
-
-	dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n",
-		__func__, old_freq, freq);
-
-	ret = clk_set_rate(clk, freq);
-	if (ret) {
-		dev_err(dev, "%s: failed to set clock rate: %d\n", __func__,
-			ret);
-		goto restore_voltage;
-	}
-
-	/* Scaling down? Scale voltage after frequency */
-	if (freq < old_freq) {
-		ret = _set_opp_voltage(dev, reg, &new_supply);
-		if (ret)
-			goto restore_freq;
-	}
-
-	return 0;
-
-restore_freq:
-	if (clk_set_rate(clk, old_freq))
-		dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n",
-			__func__, old_freq);
-restore_voltage:
-	/* This shouldn't harm even if the voltages weren't updated earlier */
-	if (old_supply.u_volt)
-		_set_opp_voltage(dev, reg, &old_supply);
-
-	return ret;
+	return _generic_set_opp(data);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate);
 
@@ -1368,6 +1403,38 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name);
 
+static int _allocate_set_opp_data(struct opp_table *opp_table)
+{
+	struct dev_pm_set_opp_data *data;
+	int len, count = opp_table->regulator_count;
+
+	if (WARN_ON(!count))
+		return -EINVAL;
+
+	/* space for set_opp_data */
+	len = sizeof(*data);
+
+	/* space for old_opp.supplies and new_opp.supplies */
+	len += 2 * sizeof(struct dev_pm_opp_supply) * count;
+
+	data = kzalloc(len, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->old_opp.supplies = (void *)(data + 1);
+	data->new_opp.supplies = data->old_opp.supplies + count;
+
+	opp_table->set_opp_data = data;
+
+	return 0;
+}
+
+static void _free_set_opp_data(struct opp_table *opp_table)
+{
+	kfree(opp_table->set_opp_data);
+	opp_table->set_opp_data = NULL;
+}
+
 /**
  * dev_pm_opp_set_regulators() - Set regulator names for the device
  * @dev: Device for which regulator name is being set.
@@ -1437,6 +1504,11 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
 
 	opp_table->regulator_count = count;
 
+	/* Allocate block only once to pass to set_opp() routines */
+	ret = _allocate_set_opp_data(opp_table);
+	if (ret)
+		goto free_regulators;
+
 	mutex_unlock(&opp_table_lock);
 	return opp_table;
 
@@ -1446,6 +1518,7 @@ free_regulators:
 
 	kfree(opp_table->regulators);
 	opp_table->regulators = NULL;
+	opp_table->regulator_count = 0;
 err:
 	_remove_opp_table(opp_table);
 unlock:
@@ -1482,6 +1555,8 @@ void dev_pm_opp_put_regulators(struct opp_table *opp_table)
 	for (i = opp_table->regulator_count - 1; i >= 0; i--)
 		regulator_put(opp_table->regulators[i]);
 
+	_free_set_opp_data(opp_table);
+
 	kfree(opp_table->regulators);
 	opp_table->regulators = NULL;
 	opp_table->regulator_count = 0;
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index 5b0f7e53bede..a05e43912c6b 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -141,6 +141,7 @@ enum opp_table_access {
  * @clk: Device's clock handle
  * @regulators: Supply regulators
  * @regulator_count: Number of power supply regulators
+ * @set_opp_data: Data to be passed to set_opp callback
  * @dentry:	debugfs dentry pointer of the real device directory (not links).
  * @dentry_name: Name of the real dentry.
  *
@@ -178,6 +179,8 @@ struct opp_table {
 	struct regulator **regulators;
 	unsigned int regulator_count;
 
+	struct dev_pm_set_opp_data *set_opp_data;
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *dentry;
 	char dentry_name[NAME_MAX];
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 9a825ae78653..779b40a9287d 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -17,6 +17,8 @@
 #include <linux/err.h>
 #include <linux/notifier.h>
 
+struct clk;
+struct regulator;
 struct dev_pm_opp;
 struct device;
 struct opp_table;
@@ -41,6 +43,39 @@ struct dev_pm_opp_supply {
 	unsigned long u_amp;
 };
 
+/**
+ * struct dev_pm_opp_info - OPP freq/voltage/current values
+ * @rate:	Target clk rate in hz
+ * @supplies:	Array of voltage/current values for all power supplies
+ *
+ * This structure stores the freq/voltage/current values for a single OPP.
+ */
+struct dev_pm_opp_info {
+	unsigned long rate;
+	struct dev_pm_opp_supply *supplies;
+};
+
+/**
+ * struct dev_pm_set_opp_data - Set OPP data
+ * @old_opp:	Old OPP info
+ * @new_opp:	New OPP info
+ * @regulators:	Array of regulator pointers
+ * @regulator_count: Number of regulators
+ * @clk:	Pointer to clk
+ * @dev:	Pointer to the struct device
+ *
+ * This structure contains all information required for setting an OPP.
+ */
+struct dev_pm_set_opp_data {
+	struct dev_pm_opp_info old_opp;
+	struct dev_pm_opp_info new_opp;
+
+	struct regulator **regulators;
+	unsigned int regulator_count;
+	struct clk *clk;
+	struct device *dev;
+};
+
 #if defined(CONFIG_PM_OPP)
 
 unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp);
-- 
cgit v1.2.3


From 4dab160eb1586f67e8ba7c55ffdd2373f7a5553e Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 1 Dec 2016 16:28:21 +0530
Subject: PM / OPP: Allow platform specific custom set_opp() callbacks

The generic set_opp() handler isn't sufficient for platforms with
complex DVFS.  For example, some TI platforms have multiple regulators
for a CPU device. The order in which various supplies need to be
programmed is only known to the platform code and its best to leave it
to it.

This patch implements APIs to register platform specific set_opp()
callback.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Tested-by: Dave Gerlach <d-gerlach@ti.com>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c | 114 +++++++++++++++++++++++++++++++++++++++++-
 drivers/base/power/opp/opp.h  |   2 +
 include/linux/pm_opp.h        |  10 ++++
 3 files changed, 125 insertions(+), 1 deletion(-)

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index e33198ce41b4..eceebef36f21 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -687,6 +687,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 {
 	struct opp_table *opp_table;
 	unsigned long freq, old_freq;
+	int (*set_opp)(struct dev_pm_set_opp_data *data);
 	struct dev_pm_opp *old_opp, *opp;
 	struct regulator **regulators;
 	struct dev_pm_set_opp_data *data;
@@ -751,6 +752,11 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 		return _generic_set_opp_clk_only(dev, clk, old_freq, freq);
 	}
 
+	if (opp_table->set_opp)
+		set_opp = opp_table->set_opp;
+	else
+		set_opp = _generic_set_opp;
+
 	data = opp_table->set_opp_data;
 	data->regulators = regulators;
 	data->regulator_count = opp_table->regulator_count;
@@ -769,7 +775,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq)
 
 	rcu_read_unlock();
 
-	return _generic_set_opp(data);
+	return set_opp(data);
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate);
 
@@ -903,6 +909,9 @@ static void _remove_opp_table(struct opp_table *opp_table)
 	if (opp_table->regulators)
 		return;
 
+	if (opp_table->set_opp)
+		return;
+
 	/* Release clk */
 	if (!IS_ERR(opp_table->clk))
 		clk_put(opp_table->clk);
@@ -1569,6 +1578,109 @@ unlock:
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators);
 
+/**
+ * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper
+ * @dev: Device for which the helper is getting registered.
+ * @set_opp: Custom set OPP helper.
+ *
+ * This is useful to support complex platforms (like platforms with multiple
+ * regulators per device), instead of the generic OPP set rate helper.
+ *
+ * This must be called before any OPPs are initialized for the device.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+int dev_pm_opp_register_set_opp_helper(struct device *dev,
+			int (*set_opp)(struct dev_pm_set_opp_data *data))
+{
+	struct opp_table *opp_table;
+	int ret;
+
+	if (!set_opp)
+		return -EINVAL;
+
+	mutex_lock(&opp_table_lock);
+
+	opp_table = _add_opp_table(dev);
+	if (!opp_table) {
+		ret = -ENOMEM;
+		goto unlock;
+	}
+
+	/* This should be called before OPPs are initialized */
+	if (WARN_ON(!list_empty(&opp_table->opp_list))) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	/* Already have custom set_opp helper */
+	if (WARN_ON(opp_table->set_opp)) {
+		ret = -EBUSY;
+		goto err;
+	}
+
+	opp_table->set_opp = set_opp;
+
+	mutex_unlock(&opp_table_lock);
+	return 0;
+
+err:
+	_remove_opp_table(opp_table);
+unlock:
+	mutex_unlock(&opp_table_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper);
+
+/**
+ * dev_pm_opp_register_put_opp_helper() - Releases resources blocked for
+ *					   set_opp helper
+ * @dev: Device for which custom set_opp helper has to be cleared.
+ *
+ * Locking: The internal opp_table and opp structures are RCU protected.
+ * Hence this function internally uses RCU updater strategy with mutex locks
+ * to keep the integrity of the internal data structures. Callers should ensure
+ * that this function is *NOT* called under RCU protection or in contexts where
+ * mutex cannot be locked.
+ */
+void dev_pm_opp_register_put_opp_helper(struct device *dev)
+{
+	struct opp_table *opp_table;
+
+	mutex_lock(&opp_table_lock);
+
+	/* Check for existing table for 'dev' first */
+	opp_table = _find_opp_table(dev);
+	if (IS_ERR(opp_table)) {
+		dev_err(dev, "Failed to find opp_table: %ld\n",
+			PTR_ERR(opp_table));
+		goto unlock;
+	}
+
+	if (!opp_table->set_opp) {
+		dev_err(dev, "%s: Doesn't have custom set_opp helper set\n",
+			__func__);
+		goto unlock;
+	}
+
+	/* Make sure there are no concurrent readers while updating opp_table */
+	WARN_ON(!list_empty(&opp_table->opp_list));
+
+	opp_table->set_opp = NULL;
+
+	/* Try freeing opp_table if this was the last blocking resource */
+	_remove_opp_table(opp_table);
+
+unlock:
+	mutex_unlock(&opp_table_lock);
+}
+EXPORT_SYMBOL_GPL(dev_pm_opp_register_put_opp_helper);
+
 /**
  * dev_pm_opp_add()  - Add an OPP table from a table definitions
  * @dev:	device for which we do this operation
diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h
index a05e43912c6b..af9f2b849a66 100644
--- a/drivers/base/power/opp/opp.h
+++ b/drivers/base/power/opp/opp.h
@@ -141,6 +141,7 @@ enum opp_table_access {
  * @clk: Device's clock handle
  * @regulators: Supply regulators
  * @regulator_count: Number of power supply regulators
+ * @set_opp: Platform specific set_opp callback
  * @set_opp_data: Data to be passed to set_opp callback
  * @dentry:	debugfs dentry pointer of the real device directory (not links).
  * @dentry_name: Name of the real dentry.
@@ -179,6 +180,7 @@ struct opp_table {
 	struct regulator **regulators;
 	unsigned int regulator_count;
 
+	int (*set_opp)(struct dev_pm_set_opp_data *data);
 	struct dev_pm_set_opp_data *set_opp_data;
 
 #ifdef CONFIG_DEBUG_FS
diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h
index 779b40a9287d..0edd88f93904 100644
--- a/include/linux/pm_opp.h
+++ b/include/linux/pm_opp.h
@@ -116,6 +116,8 @@ int dev_pm_opp_set_prop_name(struct device *dev, const char *name);
 void dev_pm_opp_put_prop_name(struct device *dev);
 struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count);
 void dev_pm_opp_put_regulators(struct opp_table *opp_table);
+int dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data));
+void dev_pm_opp_register_put_opp_helper(struct device *dev);
 int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq);
 int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask);
 int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask);
@@ -215,6 +217,14 @@ static inline int dev_pm_opp_set_supported_hw(struct device *dev,
 
 static inline void dev_pm_opp_put_supported_hw(struct device *dev) {}
 
+static inline int dev_pm_opp_register_set_opp_helper(struct device *dev,
+			int (*set_opp)(struct dev_pm_set_opp_data *data))
+{
+	return -ENOTSUPP;
+}
+
+static inline void dev_pm_opp_register_put_opp_helper(struct device *dev) {}
+
 static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name)
 {
 	return -ENOTSUPP;
-- 
cgit v1.2.3


From e231f8d7ed9a01280d18cd897ae0bbb4118bc954 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 1 Dec 2016 16:28:22 +0530
Subject: PM / OPP: Don't WARN on multiple calls to dev_pm_opp_set_regulators()

If a platform specific OPP driver has called this routine first and set
the regulators, then the second call from cpufreq-dt driver will hit the
WARN_ON(). Remove the WARN_ON(), but continue to return error in such
cases.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Stephen Boyd <sboyd@codeaurora.org>
Tested-by: Dave Gerlach <d-gerlach@ti.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/opp/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c
index eceebef36f21..35ff06283738 100644
--- a/drivers/base/power/opp/core.c
+++ b/drivers/base/power/opp/core.c
@@ -1485,7 +1485,7 @@ struct opp_table *dev_pm_opp_set_regulators(struct device *dev,
 	}
 
 	/* Already have regulators set */
-	if (WARN_ON(opp_table->regulators)) {
+	if (opp_table->regulators) {
 		ret = -EBUSY;
 		goto err;
 	}
-- 
cgit v1.2.3


From 598da548ef78927c6e8a6baeed8072fa9db74ff1 Mon Sep 17 00:00:00 2001
From: Lina Iyer <lina.iyer@linaro.org>
Date: Thu, 3 Nov 2016 14:54:35 -0700
Subject: PM / Domains: Fix compatible for domain idle state

Re-using idle state definition provided by arm,idle-state for domain
idle states creates a lot of confusion and limits further evolution of
the domain idle definition. To keep things clear and simple, define a
idle states for domain using a new compatible "domain-idle-state".

Fix existing PM domains code to look for the newly defined compatible.

Signed-off-by: Lina Iyer <lina.iyer@linaro.org>
Reviewed-by: Ulf Hansson <ulf.hansson@linaro.org>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 .../bindings/power/domain-idle-state.txt           | 33 ++++++++++++++++++++++
 .../devicetree/bindings/power/power_domain.txt     |  8 +++---
 drivers/base/power/domain.c                        |  2 +-
 3 files changed, 38 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/power/domain-idle-state.txt

diff --git a/Documentation/devicetree/bindings/power/domain-idle-state.txt b/Documentation/devicetree/bindings/power/domain-idle-state.txt
new file mode 100644
index 000000000000..eefc7ed22ca2
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/domain-idle-state.txt
@@ -0,0 +1,33 @@
+PM Domain Idle State Node:
+
+A domain idle state node represents the state parameters that will be used to
+select the state when there are no active components in the domain.
+
+The state node has the following parameters -
+
+- compatible:
+	Usage: Required
+	Value type: <string>
+	Definition: Must be "domain-idle-state".
+
+- entry-latency-us
+	Usage: Required
+	Value type: <prop-encoded-array>
+	Definition: u32 value representing worst case latency in
+		    microseconds required to enter the idle state.
+		    The exit-latency-us duration may be guaranteed
+		    only after entry-latency-us has passed.
+
+- exit-latency-us
+	Usage: Required
+	Value type: <prop-encoded-array>
+	Definition: u32 value representing worst case latency
+		    in microseconds required to exit the idle state.
+
+- min-residency-us
+	Usage: Required
+	Value type: <prop-encoded-array>
+	Definition: u32 value representing minimum residency duration
+		    in microseconds after which the idle state will yield
+		    power benefits after overcoming the overhead in entering
+i		    the idle state.
diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt
index e1650364b296..723e1ad937da 100644
--- a/Documentation/devicetree/bindings/power/power_domain.txt
+++ b/Documentation/devicetree/bindings/power/power_domain.txt
@@ -31,7 +31,7 @@ Optional properties:
 
 - domain-idle-states : A phandle of an idle-state that shall be soaked into a
                 generic domain power state. The idle state definitions are
-                compatible with arm,idle-state specified in [1].
+                compatible with domain-idle-state specified in [1].
   The domain-idle-state property reflects the idle state of this PM domain and
   not the idle states of the devices or sub-domains in the PM domain. Devices
   and sub-domains have their own idle-states independent of the parent
@@ -85,7 +85,7 @@ Example 3:
 	};
 
 	DOMAIN_RET: state@0 {
-		compatible = "arm,idle-state";
+		compatible = "domain-idle-state";
 		reg = <0x0>;
 		entry-latency-us = <1000>;
 		exit-latency-us = <2000>;
@@ -93,7 +93,7 @@ Example 3:
 	};
 
 	DOMAIN_PWR_DN: state@1 {
-		compatible = "arm,idle-state";
+		compatible = "domain-idle-state";
 		reg = <0x1>;
 		entry-latency-us = <5000>;
 		exit-latency-us = <8000>;
@@ -118,4 +118,4 @@ The node above defines a typical PM domain consumer device, which is located
 inside a PM domain with index 0 of a power controller represented by a node
 with the label "power".
 
-[1]. Documentation/devicetree/bindings/arm/idle-states.txt
+[1]. Documentation/devicetree/bindings/power/domain-idle-state.txt
diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c
index 46e7f6052c9b..5711708532db 100644
--- a/drivers/base/power/domain.c
+++ b/drivers/base/power/domain.c
@@ -2049,7 +2049,7 @@ out:
 EXPORT_SYMBOL_GPL(genpd_dev_pm_attach);
 
 static const struct of_device_id idle_state_match[] = {
-	{ .compatible = "arm,idle-state", },
+	{ .compatible = "domain-idle-state", },
 	{ }
 };
 
-- 
cgit v1.2.3


From bed570307ed78f21b77cb04a1df781dee4a8f05a Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 5 Dec 2016 16:38:16 -0800
Subject: PM / wakeirq: Fix dedicated wakeirq for drivers not using autosuspend

I noticed some wakeirq flakeyness with consumer drivers not using
autosuspend. For drivers not using autosuspend, the wakeirq may never
get unmasked in rpm_suspend() because of irq desc->depth.

We are configuring dedicated wakeirqs to start with IRQ_NOAUTOEN as we
naturally don't want them running until rpm_suspend() is called.

However, when a consumer driver initially calls pm_runtime_get(), we
now wrongly start with disable_irq_nosync() call on the dedicated
wakeirq that is disabled to start with.

This causes desc->depth to toggle between 1 and 2 instead of the usual
0 and 1. This can prevent enable_irq() from unmasking the wakeirq as
that only happens at desc->depth 1.

This does not necessarily show up with drivers using autosuspend as
there is time for disable_irq_nosync() before rpm_suspend() gets called
after the autosuspend timeout.

Let's fix the issue by adding wirq->status that lazily gets set on
the first rpm_suspend(). We also need PM runtime core private functions
for dev_pm_enable_wake_irq_check() and dev_pm_disable_wake_irq_check()
so we can enable the dedicated wakeirq on the first rpm_suspend().

While at it, let's also fix the comments for dev_pm_enable_wake_irq()
and dev_pm_disable_wake_irq(). Those can still be used by the consumer
drivers as needed because the IRQ core manages the interrupt usecount
for us.

Fixes: 4990d4fe327b (PM / Wakeirq: Add automated device wake IRQ handling)
Signed-off-by: Tony Lindgren <tony@atomide.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/power.h   | 19 ++++++++++-
 drivers/base/power/runtime.c |  8 ++---
 drivers/base/power/wakeirq.c | 76 ++++++++++++++++++++++++++++++++++++++------
 3 files changed, 88 insertions(+), 15 deletions(-)

diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h
index 50e30e7b059d..a84332aefc2d 100644
--- a/drivers/base/power/power.h
+++ b/drivers/base/power/power.h
@@ -21,14 +21,22 @@ extern void pm_runtime_init(struct device *dev);
 extern void pm_runtime_reinit(struct device *dev);
 extern void pm_runtime_remove(struct device *dev);
 
+#define WAKE_IRQ_DEDICATED_ALLOCATED	BIT(0)
+#define WAKE_IRQ_DEDICATED_MANAGED	BIT(1)
+#define WAKE_IRQ_DEDICATED_MASK		(WAKE_IRQ_DEDICATED_ALLOCATED | \
+					 WAKE_IRQ_DEDICATED_MANAGED)
+
 struct wake_irq {
 	struct device *dev;
+	unsigned int status;
 	int irq;
-	bool dedicated_irq:1;
 };
 
 extern void dev_pm_arm_wake_irq(struct wake_irq *wirq);
 extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq);
+extern void dev_pm_enable_wake_irq_check(struct device *dev,
+					 bool can_change_status);
+extern void dev_pm_disable_wake_irq_check(struct device *dev);
 
 #ifdef CONFIG_PM_SLEEP
 
@@ -104,6 +112,15 @@ static inline void dev_pm_disarm_wake_irq(struct wake_irq *wirq)
 {
 }
 
+static inline void dev_pm_enable_wake_irq_check(struct device *dev,
+						bool can_change_status)
+{
+}
+
+static inline void dev_pm_disable_wake_irq_check(struct device *dev)
+{
+}
+
 #endif
 
 #ifdef CONFIG_PM_SLEEP
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index f0d863089345..26856d050037 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -516,7 +516,7 @@ static int rpm_suspend(struct device *dev, int rpmflags)
 
 	callback = RPM_GET_CALLBACK(dev, runtime_suspend);
 
-	dev_pm_enable_wake_irq(dev);
+	dev_pm_enable_wake_irq_check(dev, true);
 	retval = rpm_callback(callback, dev);
 	if (retval)
 		goto fail;
@@ -555,7 +555,7 @@ static int rpm_suspend(struct device *dev, int rpmflags)
 	return retval;
 
  fail:
-	dev_pm_disable_wake_irq(dev);
+	dev_pm_disable_wake_irq_check(dev);
 	__update_runtime_status(dev, RPM_ACTIVE);
 	dev->power.deferred_resume = false;
 	wake_up_all(&dev->power.wait_queue);
@@ -738,12 +738,12 @@ static int rpm_resume(struct device *dev, int rpmflags)
 
 	callback = RPM_GET_CALLBACK(dev, runtime_resume);
 
-	dev_pm_disable_wake_irq(dev);
+	dev_pm_disable_wake_irq_check(dev);
 	retval = rpm_callback(callback, dev);
 	if (retval) {
 		__update_runtime_status(dev, RPM_SUSPENDED);
 		pm_runtime_cancel_pending(dev);
-		dev_pm_enable_wake_irq(dev);
+		dev_pm_enable_wake_irq_check(dev, false);
 	} else {
  no_callback:
 		__update_runtime_status(dev, RPM_ACTIVE);
diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c
index 0d77cd6fd8d1..404d94c6c8bc 100644
--- a/drivers/base/power/wakeirq.c
+++ b/drivers/base/power/wakeirq.c
@@ -110,8 +110,10 @@ void dev_pm_clear_wake_irq(struct device *dev)
 	dev->power.wakeirq = NULL;
 	spin_unlock_irqrestore(&dev->power.lock, flags);
 
-	if (wirq->dedicated_irq)
+	if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED) {
 		free_irq(wirq->irq, wirq);
+		wirq->status &= ~WAKE_IRQ_DEDICATED_MASK;
+	}
 	kfree(wirq);
 }
 EXPORT_SYMBOL_GPL(dev_pm_clear_wake_irq);
@@ -179,7 +181,6 @@ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq)
 
 	wirq->dev = dev;
 	wirq->irq = irq;
-	wirq->dedicated_irq = true;
 	irq_set_status_flags(irq, IRQ_NOAUTOEN);
 
 	/*
@@ -195,6 +196,8 @@ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq)
 	if (err)
 		goto err_free_irq;
 
+	wirq->status = WAKE_IRQ_DEDICATED_ALLOCATED;
+
 	return err;
 
 err_free_irq:
@@ -210,9 +213,9 @@ EXPORT_SYMBOL_GPL(dev_pm_set_dedicated_wake_irq);
  * dev_pm_enable_wake_irq - Enable device wake-up interrupt
  * @dev: Device
  *
- * Called from the bus code or the device driver for
- * runtime_suspend() to enable the wake-up interrupt while
- * the device is running.
+ * Optionally called from the bus code or the device driver for
+ * runtime_resume() to override the PM runtime core managed wake-up
+ * interrupt handling to enable the wake-up interrupt.
  *
  * Note that for runtime_suspend()) the wake-up interrupts
  * should be unconditionally enabled unlike for suspend()
@@ -222,7 +225,7 @@ void dev_pm_enable_wake_irq(struct device *dev)
 {
 	struct wake_irq *wirq = dev->power.wakeirq;
 
-	if (wirq && wirq->dedicated_irq)
+	if (wirq && (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED))
 		enable_irq(wirq->irq);
 }
 EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq);
@@ -231,19 +234,72 @@ EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq);
  * dev_pm_disable_wake_irq - Disable device wake-up interrupt
  * @dev: Device
  *
- * Called from the bus code or the device driver for
- * runtime_resume() to disable the wake-up interrupt while
- * the device is running.
+ * Optionally called from the bus code or the device driver for
+ * runtime_suspend() to override the PM runtime core managed wake-up
+ * interrupt handling to disable the wake-up interrupt.
  */
 void dev_pm_disable_wake_irq(struct device *dev)
 {
 	struct wake_irq *wirq = dev->power.wakeirq;
 
-	if (wirq && wirq->dedicated_irq)
+	if (wirq && (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED))
 		disable_irq_nosync(wirq->irq);
 }
 EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq);
 
+/**
+ * dev_pm_enable_wake_irq_check - Checks and enables wake-up interrupt
+ * @dev: Device
+ * @can_change_status: Can change wake-up interrupt status
+ *
+ * Enables wakeirq conditionally. We need to enable wake-up interrupt
+ * lazily on the first rpm_suspend(). This is needed as the consumer device
+ * starts in RPM_SUSPENDED state, and the the first pm_runtime_get() would
+ * otherwise try to disable already disabled wakeirq. The wake-up interrupt
+ * starts disabled with IRQ_NOAUTOEN set.
+ *
+ * Should be only called from rpm_suspend() and rpm_resume() path.
+ * Caller must hold &dev->power.lock to change wirq->status
+ */
+void dev_pm_enable_wake_irq_check(struct device *dev,
+				  bool can_change_status)
+{
+	struct wake_irq *wirq = dev->power.wakeirq;
+
+	if (!wirq || !((wirq->status & WAKE_IRQ_DEDICATED_MASK)))
+		return;
+
+	if (likely(wirq->status & WAKE_IRQ_DEDICATED_MANAGED)) {
+		goto enable;
+	} else if (can_change_status) {
+		wirq->status |= WAKE_IRQ_DEDICATED_MANAGED;
+		goto enable;
+	}
+
+	return;
+
+enable:
+	enable_irq(wirq->irq);
+}
+
+/**
+ * dev_pm_disable_wake_irq_check - Checks and disables wake-up interrupt
+ * @dev: Device
+ *
+ * Disables wake-up interrupt conditionally based on status.
+ * Should be only called from rpm_suspend() and rpm_resume() path.
+ */
+void dev_pm_disable_wake_irq_check(struct device *dev)
+{
+	struct wake_irq *wirq = dev->power.wakeirq;
+
+	if (!wirq || !((wirq->status & WAKE_IRQ_DEDICATED_MASK)))
+		return;
+
+	if (wirq->status & WAKE_IRQ_DEDICATED_MANAGED)
+		disable_irq_nosync(wirq->irq);
+}
+
 /**
  * dev_pm_arm_wake_irq - Arm device wake-up
  * @wirq: Device wake-up interrupt
-- 
cgit v1.2.3


From 05a926227742b0bcbef366bbd710c4f6631c7d9f Mon Sep 17 00:00:00 2001
From: Sahitya Tummala <stummala@codeaurora.org>
Date: Wed, 7 Dec 2016 20:10:32 +0530
Subject: PM / core: Fix bug in the error handling of async suspend

If async_suspend is enabled for parent and child devices, then
PM framework has to ensure that parent's async suspend gets called
only after child's async suspend is done. In case if child's async
suspend fails with error, then parent's async suspend must not be
invoked. The current code uses async_error to ensure this but there
is a problem with it in __device_suspend(). This function notifies
the completion of child's async suspend before updating its error
via async_error variable. As a result, parent's async suspend gets
invoked even though it's child suspend has failed. Fix this bug by
updating the async_error before notifying the child's completion.

Signed-off-by: Sahitya Tummala <stummala@codeaurora.org>
[ rjw: Rearranged wthitespace ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index e44944f4be77..d5a44aacba7f 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1460,10 +1460,10 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
 	dpm_watchdog_clear(&wd);
 
  Complete:
-	complete_all(&dev->power.completion);
 	if (error)
 		async_error = error;
 
+	complete_all(&dev->power.completion);
 	TRACE_SUSPEND(error);
 	return error;
 }
-- 
cgit v1.2.3


From 9320f95c0b8f1d074f570385e6855d4554f693e4 Mon Sep 17 00:00:00 2001
From: xing wei <xing.wei@intel.com>
Date: Wed, 7 Dec 2016 19:31:16 +0800
Subject: PM / sleep: Print active wakeup sources when blocking on wakeup_count
 reads

If there are any wakeup events being processed, read operation
on /sys/power/wakeup_count will be blocked, so print the names
of all active wakeup sources to help to find out who is preventing
system suspend from triggering.

While at it change pr_info() in pm_print_active_wakeup_sources()
to pr_debug() to avoid excessive log noise.

Signed-off-by: xing wei <xing.wei@intel.com>
[ rjw: Subject & changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/base/power/wakeup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 62e4de2aa8d1..bf9ba26981a5 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -811,7 +811,7 @@ void pm_print_active_wakeup_sources(void)
 	rcu_read_lock();
 	list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
 		if (ws->active) {
-			pr_info("active wakeup source: %s\n", ws->name);
+			pr_debug("active wakeup source: %s\n", ws->name);
 			active = 1;
 		} else if (!active &&
 			   (!last_activity_ws ||
@@ -822,7 +822,7 @@ void pm_print_active_wakeup_sources(void)
 	}
 
 	if (!active && last_activity_ws)
-		pr_info("last active wakeup source: %s\n",
+		pr_debug("last active wakeup source: %s\n",
 			last_activity_ws->name);
 	rcu_read_unlock();
 }
@@ -905,7 +905,7 @@ bool pm_get_wakeup_count(unsigned int *count, bool block)
 			split_counters(&cnt, &inpr);
 			if (inpr == 0 || signal_pending(current))
 				break;
-
+			pm_print_active_wakeup_sources();
 			schedule();
 		}
 		finish_wait(&wakeup_count_wait_queue, &wait);
-- 
cgit v1.2.3


From b59fe540539623767cf18045a78fb672b40270d6 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 6 Dec 2016 13:32:15 -0800
Subject: cpufreq: intel_pstate: Add locking around HWP requests

To avoid race conditions from multiple threads, increase the scope
of intel_pstate_limits_lock to include HWP requests also.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
[ rjw: Subject ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index a7f597a476f5..c7f596f7b8cc 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -853,11 +853,11 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b,
 
 	limits->no_turbo = clamp_t(int, input, 0, 1);
 
-	mutex_unlock(&intel_pstate_limits_lock);
-
 	if (hwp_active)
 		intel_pstate_hwp_set_online_cpus();
 
+	mutex_unlock(&intel_pstate_limits_lock);
+
 	return count;
 }
 
@@ -882,10 +882,11 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b,
 				   limits->max_perf_pct);
 	limits->max_perf = div_ext_fp(limits->max_perf_pct, 100);
 
-	mutex_unlock(&intel_pstate_limits_lock);
-
 	if (hwp_active)
 		intel_pstate_hwp_set_online_cpus();
+
+	mutex_unlock(&intel_pstate_limits_lock);
+
 	return count;
 }
 
@@ -910,10 +911,11 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b,
 				   limits->min_perf_pct);
 	limits->min_perf = div_ext_fp(limits->min_perf_pct, 100);
 
-	mutex_unlock(&intel_pstate_limits_lock);
-
 	if (hwp_active)
 		intel_pstate_hwp_set_online_cpus();
+
+	mutex_unlock(&intel_pstate_limits_lock);
+
 	return count;
 }
 
@@ -1664,7 +1666,6 @@ static void intel_pstate_clear_update_util_hook(unsigned int cpu)
 
 static void intel_pstate_set_performance_limits(struct perf_limits *limits)
 {
-	mutex_lock(&intel_pstate_limits_lock);
 	limits->no_turbo = 0;
 	limits->turbo_disabled = 0;
 	limits->max_perf_pct = 100;
@@ -1675,15 +1676,12 @@ static void intel_pstate_set_performance_limits(struct perf_limits *limits)
 	limits->max_sysfs_pct = 100;
 	limits->min_policy_pct = 0;
 	limits->min_sysfs_pct = 0;
-	mutex_unlock(&intel_pstate_limits_lock);
 }
 
 static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
 					    struct perf_limits *limits)
 {
 
-	mutex_lock(&intel_pstate_limits_lock);
-
 	limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100,
 					      policy->cpuinfo.max_freq);
 	limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0, 100);
@@ -1714,8 +1712,6 @@ static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy,
 	limits->max_perf = round_up(limits->max_perf, EXT_FRAC_BITS);
 	limits->min_perf = round_up(limits->min_perf, EXT_FRAC_BITS);
 
-	mutex_unlock(&intel_pstate_limits_lock);
-
 	pr_debug("cpu:%d max_perf_pct:%d min_perf_pct:%d\n", policy->cpu,
 		 limits->max_perf_pct, limits->min_perf_pct);
 }
@@ -1744,6 +1740,8 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
 	if (per_cpu_limits)
 		perf_limits = cpu->perf_limits;
 
+	mutex_lock(&intel_pstate_limits_lock);
+
 	if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) {
 		if (!perf_limits) {
 			limits = &performance_limits;
@@ -1778,6 +1776,8 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy)
 
 	intel_pstate_hwp_set_policy(policy);
 
+	mutex_unlock(&intel_pstate_limits_lock);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 984edbdccc6ff01b953492f72296685ce3ea2497 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 6 Dec 2016 13:32:16 -0800
Subject: cpufreq: intel_pstate: Support for energy performance hints with HWP

It is possible to provide hints to the HWP algorithms in the processor
to be more performance centric to more energy centric. These hints are
provided by using HWP energy performance preference (EPP) or energy
performance bias (EPB) settings.

The scope of these settings is per logical processor, which means that
each of the logical processors in the package can be programmed with a
different value.

This change provides cpufreq sysfs interface to provide hint. For each
policy, two additional attributes will be available to check and provide
hint. These attributes will only be present when the intel_pstate driver
is using HWP mode.

These attributes are:
 - energy_performance_available_preferences
 - energy_performance_preference

To get list of supported hints:
$ cat energy_performance_available_preferences
default performance balance_performance balance_power power

The current preference can be read or changed via cpufreq sysfs
attribute "energy_performance_preference". Reading from this attribute
will display current effective setting changed via any method. User can
write any of the valid preference string to this attribute. User can
always restore to power-on default by writing "default".

Implementation
Since these hints can be provided by direct MSR write or using some tools
like x86_energy_perf_policy, the driver internally doesn't maintain any
state. The user operation will result in direct read/write of MSR: 0x774
(HWP_REQUEST_MSR). Also driver use read modify write to update other
fields in this MSR.

Summary of changes:
 - struct cpudata field epp_saved is renamed to epp_powersave, as this
   stores the value to restore once policy is switched from performance
   to powersave to restore original powersave EPP value.
 - A new struct cpudata field epp_saved is used to store the raw MSR
   EPP/EPB value when a CPU goes offline or on suspend and restore on
   online/resume. This ensures that EPP value is restored to correct
   value irrespective of the means used to set.
 - EPP/EPB value ranges are fixed for each preference, which can be
   set for the cpufreq sysfs, so user request is mapped to/from this
   range.
 - New attributes are only added when HWP is present.
 - Since EPP value of 0 is valid the fields are initialized to
   -EINVAL when not valid. The field epp_default is read only once
   after powerup to avoid reading on subsequent CPU online operation
 - New suspend callback to store epp on suspend operation
 - Don't invalidate old epp_saved field on resume and online as now
   we can restore last epp value on suspend and this field can still
   have old EPP value sampled during switch to performance from
   powersave.
 - While here optimized setting of cpu_data->epp_powersave = epp in
   intel_pstate_hwp_set() as this was done in both true and false
   paths.
 - epp/epb set function returns error to caller on failure to pass
   on to user space for display.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/cpufreq/intel_pstate.c | 242 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 224 insertions(+), 18 deletions(-)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index c7f596f7b8cc..7cd0177ddeaf 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -249,9 +249,14 @@ struct perf_limits {
  *			when per cpu controls are enforced
  * @acpi_perf_data:	Stores ACPI perf information read from _PSS
  * @valid_pss_table:	Set to true for valid ACPI _PSS entries found
- * @epp_saved:		Last saved HWP energy performance preference
- *			(EPP) or energy performance bias (EPB)
+ * @epp_powersave:	Last saved HWP energy performance preference
+ *			(EPP) or energy performance bias (EPB),
+ *			when policy switched to performance
  * @epp_policy:		Last saved policy used to set EPP/EPB
+ * @epp_default:	Power on default HWP energy performance
+ *			preference/bias
+ * @epp_saved:		Saved EPP/EPB during system suspend or CPU offline
+ *			operation
  *
  * This structure stores per CPU instance data for all CPUs.
  */
@@ -279,8 +284,10 @@ struct cpudata {
 	bool valid_pss_table;
 #endif
 	unsigned int iowait_boost;
-	s16 epp_saved;
+	s16 epp_powersave;
 	s16 epp_policy;
+	s16 epp_default;
+	s16 epp_saved;
 };
 
 static struct cpudata **all_cpu_data;
@@ -598,29 +605,204 @@ static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data)
 {
 	s16 epp;
 
-	if (static_cpu_has(X86_FEATURE_HWP_EPP))
+	if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
+		/*
+		 * When hwp_req_data is 0, means that caller didn't read
+		 * MSR_HWP_REQUEST, so need to read and get EPP.
+		 */
+		if (!hwp_req_data) {
+			epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST,
+					    &hwp_req_data);
+			if (epp)
+				return epp;
+		}
 		epp = (hwp_req_data >> 24) & 0xff;
-	else
+	} else {
 		/* When there is no EPP present, HWP uses EPB settings */
 		epp = intel_pstate_get_epb(cpu_data);
+	}
 
 	return epp;
 }
 
-static void intel_pstate_set_epb(int cpu, s16 pref)
+static int intel_pstate_set_epb(int cpu, s16 pref)
 {
 	u64 epb;
+	int ret;
 
 	if (!static_cpu_has(X86_FEATURE_EPB))
-		return;
+		return -ENXIO;
 
-	if (rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb))
-		return;
+	ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb);
+	if (ret)
+		return ret;
 
 	epb = (epb & ~0x0f) | pref;
 	wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb);
+
+	return 0;
 }
 
+/*
+ * EPP/EPB display strings corresponding to EPP index in the
+ * energy_perf_strings[]
+ *	index		String
+ *-------------------------------------
+ *	0		default
+ *	1		performance
+ *	2		balance_performance
+ *	3		balance_power
+ *	4		power
+ */
+static const char * const energy_perf_strings[] = {
+	"default",
+	"performance",
+	"balance_performance",
+	"balance_power",
+	"power",
+	NULL
+};
+
+static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data)
+{
+	s16 epp;
+	int index = -EINVAL;
+
+	epp = intel_pstate_get_epp(cpu_data, 0);
+	if (epp < 0)
+		return epp;
+
+	if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
+		/*
+		 * Range:
+		 *	0x00-0x3F	:	Performance
+		 *	0x40-0x7F	:	Balance performance
+		 *	0x80-0xBF	:	Balance power
+		 *	0xC0-0xFF	:	Power
+		 * The EPP is a 8 bit value, but our ranges restrict the
+		 * value which can be set. Here only using top two bits
+		 * effectively.
+		 */
+		index = (epp >> 6) + 1;
+	} else if (static_cpu_has(X86_FEATURE_EPB)) {
+		/*
+		 * Range:
+		 *	0x00-0x03	:	Performance
+		 *	0x04-0x07	:	Balance performance
+		 *	0x08-0x0B	:	Balance power
+		 *	0x0C-0x0F	:	Power
+		 * The EPB is a 4 bit value, but our ranges restrict the
+		 * value which can be set. Here only using top two bits
+		 * effectively.
+		 */
+		index = (epp >> 2) + 1;
+	}
+
+	return index;
+}
+
+static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data,
+					      int pref_index)
+{
+	int epp = -EINVAL;
+	int ret;
+
+	if (!pref_index)
+		epp = cpu_data->epp_default;
+
+	mutex_lock(&intel_pstate_limits_lock);
+
+	if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
+		u64 value;
+
+		ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, &value);
+		if (ret)
+			goto return_pref;
+
+		value &= ~GENMASK_ULL(31, 24);
+
+		/*
+		 * If epp is not default, convert from index into
+		 * energy_perf_strings to epp value, by shifting 6
+		 * bits left to use only top two bits in epp.
+		 * The resultant epp need to shifted by 24 bits to
+		 * epp position in MSR_HWP_REQUEST.
+		 */
+		if (epp == -EINVAL)
+			epp = (pref_index - 1) << 6;
+
+		value |= (u64)epp << 24;
+		ret = wrmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, value);
+	} else {
+		if (epp == -EINVAL)
+			epp = (pref_index - 1) << 2;
+		ret = intel_pstate_set_epb(cpu_data->cpu, epp);
+	}
+return_pref:
+	mutex_unlock(&intel_pstate_limits_lock);
+
+	return ret;
+}
+
+static ssize_t show_energy_performance_available_preferences(
+				struct cpufreq_policy *policy, char *buf)
+{
+	int i = 0;
+	int ret = 0;
+
+	while (energy_perf_strings[i] != NULL)
+		ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]);
+
+	ret += sprintf(&buf[ret], "\n");
+
+	return ret;
+}
+
+cpufreq_freq_attr_ro(energy_performance_available_preferences);
+
+static ssize_t store_energy_performance_preference(
+		struct cpufreq_policy *policy, const char *buf, size_t count)
+{
+	struct cpudata *cpu_data = all_cpu_data[policy->cpu];
+	char str_preference[21];
+	int ret, i = 0;
+
+	ret = sscanf(buf, "%20s", str_preference);
+	if (ret != 1)
+		return -EINVAL;
+
+	while (energy_perf_strings[i] != NULL) {
+		if (!strcmp(str_preference, energy_perf_strings[i])) {
+			intel_pstate_set_energy_pref_index(cpu_data, i);
+			return count;
+		}
+		++i;
+	}
+
+	return -EINVAL;
+}
+
+static ssize_t show_energy_performance_preference(
+				struct cpufreq_policy *policy, char *buf)
+{
+	struct cpudata *cpu_data = all_cpu_data[policy->cpu];
+	int preference;
+
+	preference = intel_pstate_get_energy_pref_index(cpu_data);
+	if (preference < 0)
+		return preference;
+
+	return  sprintf(buf, "%s\n", energy_perf_strings[preference]);
+}
+
+cpufreq_freq_attr_rw(energy_performance_preference);
+
+static struct freq_attr *hwp_cpufreq_attrs[] = {
+	&energy_performance_preference,
+	&energy_performance_available_preferences,
+	NULL,
+};
+
 static void intel_pstate_hwp_set(const struct cpumask *cpumask)
 {
 	int min, hw_min, max, hw_max, cpu, range, adj_range;
@@ -665,20 +847,24 @@ static void intel_pstate_hwp_set(const struct cpumask *cpumask)
 
 		cpu_data->epp_policy = cpu_data->policy;
 
+		if (cpu_data->epp_saved >= 0) {
+			epp = cpu_data->epp_saved;
+			cpu_data->epp_saved = -EINVAL;
+			goto update_epp;
+		}
+
 		if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) {
 			epp = intel_pstate_get_epp(cpu_data, value);
+			cpu_data->epp_powersave = epp;
 			/* If EPP read was failed, then don't try to write */
-			if (epp < 0) {
-				cpu_data->epp_saved = epp;
+			if (epp < 0)
 				goto skip_epp;
-			}
 
-			cpu_data->epp_saved = epp;
 
 			epp = 0;
 		} else {
 			/* skip setting EPP, when saved value is invalid */
-			if (cpu_data->epp_saved < 0)
+			if (cpu_data->epp_powersave < 0)
 				goto skip_epp;
 
 			/*
@@ -692,8 +878,9 @@ static void intel_pstate_hwp_set(const struct cpumask *cpumask)
 			if (epp)
 				goto skip_epp;
 
-			epp = cpu_data->epp_saved;
+			epp = cpu_data->epp_powersave;
 		}
+update_epp:
 		if (static_cpu_has(X86_FEATURE_HWP_EPP)) {
 			value &= ~GENMASK_ULL(31, 24);
 			value |= (u64)epp << 24;
@@ -713,13 +900,24 @@ static int intel_pstate_hwp_set_policy(struct cpufreq_policy *policy)
 	return 0;
 }
 
+static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy)
+{
+	struct cpudata *cpu_data = all_cpu_data[policy->cpu];
+
+	if (!hwp_active)
+		return 0;
+
+	cpu_data->epp_saved = intel_pstate_get_epp(cpu_data, 0);
+
+	return 0;
+}
+
 static int intel_pstate_resume(struct cpufreq_policy *policy)
 {
 	if (!hwp_active)
 		return 0;
 
 	all_cpu_data[policy->cpu]->epp_policy = 0;
-	all_cpu_data[policy->cpu]->epp_saved = -EINVAL;
 
 	return intel_pstate_hwp_set_policy(policy);
 }
@@ -977,7 +1175,8 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata)
 
 	wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1);
 	cpudata->epp_policy = 0;
-	cpudata->epp_saved = -EINVAL;
+	if (cpudata->epp_default == -EINVAL)
+		cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
 }
 
 static int atom_get_min_pstate(void)
@@ -1610,6 +1809,9 @@ static int intel_pstate_init_cpu(unsigned int cpunum)
 		if (per_cpu_limits)
 			cpu->perf_limits = (struct perf_limits *)(cpu + 1);
 
+		cpu->epp_default = -EINVAL;
+		cpu->epp_powersave = -EINVAL;
+		cpu->epp_saved = -EINVAL;
 	}
 
 	cpu = all_cpu_data[cpunum];
@@ -1802,7 +2004,9 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy)
 	pr_debug("CPU %d exiting\n", policy->cpu);
 
 	intel_pstate_clear_update_util_hook(policy->cpu);
-	if (!hwp_active)
+	if (hwp_active)
+		intel_pstate_hwp_save_state(policy);
+	else
 		intel_cpufreq_stop_cpu(policy);
 }
 
@@ -1872,6 +2076,7 @@ static struct cpufreq_driver intel_pstate = {
 	.flags		= CPUFREQ_CONST_LOOPS,
 	.verify		= intel_pstate_verify_policy,
 	.setpolicy	= intel_pstate_set_policy,
+	.suspend	= intel_pstate_hwp_save_state,
 	.resume		= intel_pstate_resume,
 	.get		= intel_pstate_get,
 	.init		= intel_pstate_cpu_init,
@@ -2189,6 +2394,7 @@ static int __init intel_pstate_init(void)
 	if (x86_match_cpu(hwp_support_ids) && !no_hwp) {
 		copy_cpu_funcs(&core_params.funcs);
 		hwp_active++;
+		intel_pstate.attr = hwp_cpufreq_attrs;
 		goto hwp_cpu_matched;
 	}
 
-- 
cgit v1.2.3


From bf006e149a5717a4614d44a62090ae98775af7a0 Mon Sep 17 00:00:00 2001
From: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Date: Tue, 6 Dec 2016 13:32:17 -0800
Subject: Documentation: intel_pstate: Document HWP energy/performance hints

Updated documentation for the support of energy performance hint in
the HWP mode.

Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
[ rjw: Subject & changelog ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 Documentation/cpu-freq/intel-pstate.txt | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/Documentation/cpu-freq/intel-pstate.txt b/Documentation/cpu-freq/intel-pstate.txt
index 198b2ef88aec..1953994ef5e6 100644
--- a/Documentation/cpu-freq/intel-pstate.txt
+++ b/Documentation/cpu-freq/intel-pstate.txt
@@ -139,6 +139,30 @@ architecture of the processor permits:
 - User can read write system wide turbo status
 	/sys/devices/system/cpu/no_turbo
 
+Support of energy performance hints
+It is possible to provide hints to the HWP algorithms in the processor
+to be more performance centric to more energy centric. When the driver
+is using HWP, two additional cpufreq sysfs attributes are presented for
+each logical CPU.
+These attributes are:
+	- energy_performance_available_preferences
+	- energy_performance_preference
+
+To get list of supported hints:
+$ cat energy_performance_available_preferences
+    default performance balance_performance balance_power power
+
+The current preference can be read or changed via cpufreq sysfs
+attribute "energy_performance_preference". Reading from this attribute
+will display current effective setting. User can write any of the valid
+preference string to this attribute. User can always restore to power-on
+default by writing "default".
+
+Since threads can migrate to different CPUs, this is possible that the
+new CPU may have different energy performance preference than the previous
+one. To avoid such issues, either threads can be pinned to specific CPUs
+or set the same energy performance preference value to all CPUs.
+
 Tuning Intel P-State driver
 
 When the performance can be tuned using PID (Proportional Integral
-- 
cgit v1.2.3


From c8ce82b9b9c40d66709cce588f6281dd47cc3922 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 1 Dec 2016 15:55:40 +0530
Subject: devfreq: exynos: Don't use OPP structures outside of RCU locks

The OPP structures are abused to the best here, without understanding
how the OPP core and RCU locks work.

In short, the OPP pointer saved 'struct exynos_bus' can become invalid
under your nose, as the OPP core may free it.

Fix various abuses around OPP structures and calls.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Acked-by: Chanwoo Choi <cw00.choi@samsung.com>
Tested-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/devfreq/exynos-bus.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c
index 29866f7e6d7e..a8ed7792ece2 100644
--- a/drivers/devfreq/exynos-bus.c
+++ b/drivers/devfreq/exynos-bus.c
@@ -35,7 +35,7 @@ struct exynos_bus {
 	unsigned int edev_count;
 	struct mutex lock;
 
-	struct dev_pm_opp *curr_opp;
+	unsigned long curr_freq;
 
 	struct regulator *regulator;
 	struct clk *clk;
@@ -99,7 +99,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags)
 {
 	struct exynos_bus *bus = dev_get_drvdata(dev);
 	struct dev_pm_opp *new_opp;
-	unsigned long old_freq, new_freq, old_volt, new_volt, tol;
+	unsigned long old_freq, new_freq, new_volt, tol;
 	int ret = 0;
 
 	/* Get new opp-bus instance according to new bus clock */
@@ -113,8 +113,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags)
 
 	new_freq = dev_pm_opp_get_freq(new_opp);
 	new_volt = dev_pm_opp_get_voltage(new_opp);
-	old_freq = dev_pm_opp_get_freq(bus->curr_opp);
-	old_volt = dev_pm_opp_get_voltage(bus->curr_opp);
+	old_freq = bus->curr_freq;
 	rcu_read_unlock();
 
 	if (old_freq == new_freq)
@@ -146,7 +145,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags)
 			goto out;
 		}
 	}
-	bus->curr_opp = new_opp;
+	bus->curr_freq = new_freq;
 
 	dev_dbg(dev, "Set the frequency of bus (%lukHz -> %lukHz)\n",
 			old_freq/1000, new_freq/1000);
@@ -163,9 +162,7 @@ static int exynos_bus_get_dev_status(struct device *dev,
 	struct devfreq_event_data edata;
 	int ret;
 
-	rcu_read_lock();
-	stat->current_frequency = dev_pm_opp_get_freq(bus->curr_opp);
-	rcu_read_unlock();
+	stat->current_frequency = bus->curr_freq;
 
 	ret = exynos_bus_get_event(bus, &edata);
 	if (ret < 0) {
@@ -226,7 +223,7 @@ static int exynos_bus_passive_target(struct device *dev, unsigned long *freq,
 	}
 
 	new_freq = dev_pm_opp_get_freq(new_opp);
-	old_freq = dev_pm_opp_get_freq(bus->curr_opp);
+	old_freq = bus->curr_freq;
 	rcu_read_unlock();
 
 	if (old_freq == new_freq)
@@ -242,7 +239,7 @@ static int exynos_bus_passive_target(struct device *dev, unsigned long *freq,
 	}
 
 	*freq = new_freq;
-	bus->curr_opp = new_opp;
+	bus->curr_freq = new_freq;
 
 	dev_dbg(dev, "Set the frequency of bus (%lukHz -> %lukHz)\n",
 			old_freq/1000, new_freq/1000);
@@ -335,6 +332,7 @@ static int exynos_bus_parse_of(struct device_node *np,
 			      struct exynos_bus *bus)
 {
 	struct device *dev = bus->dev;
+	struct dev_pm_opp *opp;
 	unsigned long rate;
 	int ret;
 
@@ -352,22 +350,23 @@ static int exynos_bus_parse_of(struct device_node *np,
 	}
 
 	/* Get the freq and voltage from OPP table to scale the bus freq */
-	rcu_read_lock();
 	ret = dev_pm_opp_of_add_table(dev);
 	if (ret < 0) {
 		dev_err(dev, "failed to get OPP table\n");
-		rcu_read_unlock();
 		goto err_clk;
 	}
 
 	rate = clk_get_rate(bus->clk);
-	bus->curr_opp = devfreq_recommended_opp(dev, &rate, 0);
-	if (IS_ERR(bus->curr_opp)) {
+
+	rcu_read_lock();
+	opp = devfreq_recommended_opp(dev, &rate, 0);
+	if (IS_ERR(opp)) {
 		dev_err(dev, "failed to find dev_pm_opp\n");
 		rcu_read_unlock();
-		ret = PTR_ERR(bus->curr_opp);
+		ret = PTR_ERR(opp);
 		goto err_opp;
 	}
+	bus->curr_freq = dev_pm_opp_get_freq(opp);
 	rcu_read_unlock();
 
 	return 0;
-- 
cgit v1.2.3


From d8323de3d4062e1b5a5aa7e9c0f935138405a582 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu, 1 Dec 2016 16:12:14 +0530
Subject: devfreq: rk3399_dmc: Remove dangling rcu_read_unlock()

This call never had the rcu_read_lock() counterpart. Remove the unlock
part as well.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/devfreq/rk3399_dmc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c
index 5063ac1a5939..971ee6358456 100644
--- a/drivers/devfreq/rk3399_dmc.c
+++ b/drivers/devfreq/rk3399_dmc.c
@@ -414,7 +414,6 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev)
 	 */
 	if (dev_pm_opp_of_add_table(dev)) {
 		dev_err(dev, "Invalid operating-points in device tree.\n");
-		rcu_read_unlock();
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From e37d35082e75982ef714f9e25bfa43a061d0c5e6 Mon Sep 17 00:00:00 2001
From: Viresh Kumar <viresh.kumar@linaro.org>
Date: Mon, 5 Dec 2016 08:53:52 +0530
Subject: devfreq: rk3399_dmc: Don't use OPP structures outside of RCU locks

The OPP structures are abused to the best here, without understanding
how the OPP core and RCU locks work.

In short, the OPP pointer saved in 'rk3399_dmcfreq' can become invalid
under your nose, as the OPP core may free it.

Fix various abuses around OPP structures and calls.

Reviewed-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 drivers/devfreq/rk3399_dmc.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c
index 971ee6358456..27d2f349b53c 100644
--- a/drivers/devfreq/rk3399_dmc.c
+++ b/drivers/devfreq/rk3399_dmc.c
@@ -80,7 +80,6 @@ struct rk3399_dmcfreq {
 	struct regulator *vdd_center;
 	unsigned long rate, target_rate;
 	unsigned long volt, target_volt;
-	struct dev_pm_opp *curr_opp;
 };
 
 static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
@@ -102,9 +101,6 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
 	target_rate = dev_pm_opp_get_freq(opp);
 	target_volt = dev_pm_opp_get_voltage(opp);
 
-	dmcfreq->rate = dev_pm_opp_get_freq(dmcfreq->curr_opp);
-	dmcfreq->volt = dev_pm_opp_get_voltage(dmcfreq->curr_opp);
-
 	rcu_read_unlock();
 
 	if (dmcfreq->rate == target_rate)
@@ -165,7 +161,9 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq,
 	if (err)
 		dev_err(dev, "Cannot to set vol %lu uV\n", target_volt);
 
-	dmcfreq->curr_opp = opp;
+	dmcfreq->rate = target_rate;
+	dmcfreq->volt = target_volt;
+
 out:
 	mutex_unlock(&dmcfreq->lock);
 	return err;
@@ -430,8 +428,9 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev)
 		rcu_read_unlock();
 		return PTR_ERR(opp);
 	}
+	data->rate = dev_pm_opp_get_freq(opp);
+	data->volt = dev_pm_opp_get_voltage(opp);
 	rcu_read_unlock();
-	data->curr_opp = opp;
 
 	rk3399_devfreq_dmc_profile.initial_freq = data->rate;
 
-- 
cgit v1.2.3