diff options
79 files changed, 4399 insertions, 1549 deletions
diff --git a/Documentation/ABI/testing/sysfs-power b/Documentation/ABI/testing/sysfs-power index 50b368d490b5..f523e5a3ac33 100644 --- a/Documentation/ABI/testing/sysfs-power +++ b/Documentation/ABI/testing/sysfs-power @@ -7,30 +7,35 @@ Description: subsystem. What: /sys/power/state -Date: May 2014 +Date: November 2016 Contact: Rafael J. Wysocki <rjw@rjwysocki.net> Description: The /sys/power/state file controls system sleep states. Reading from this file returns the available sleep state - labels, which may be "mem", "standby", "freeze" and "disk" - (hibernation). The meanings of the first three labels depend on - the relative_sleep_states command line argument as follows: - 1) relative_sleep_states = 1 - "mem", "standby", "freeze" represent non-hibernation sleep - states from the deepest ("mem", always present) to the - shallowest ("freeze"). "standby" and "freeze" may or may - not be present depending on the capabilities of the - platform. "freeze" can only be present if "standby" is - present. - 2) relative_sleep_states = 0 (default) - "mem" - "suspend-to-RAM", present if supported. - "standby" - "power-on suspend", present if supported. - "freeze" - "suspend-to-idle", always present. - - Writing to this file one of these strings causes the system to - transition into the corresponding state, if available. See - Documentation/power/states.txt for a description of what - "suspend-to-RAM", "power-on suspend" and "suspend-to-idle" mean. + labels, which may be "mem" (suspend), "standby" (power-on + suspend), "freeze" (suspend-to-idle) and "disk" (hibernation). + + Writing one of the above strings to this file causes the system + to transition into the corresponding state, if available. + + See Documentation/power/states.txt for more information. + +What: /sys/power/mem_sleep +Date: November 2016 +Contact: Rafael J. Wysocki <rjw@rjwysocki.net> +Description: + The /sys/power/mem_sleep file controls the operating mode of + system suspend. Reading from it returns the available modes + as "s2idle" (always present), "shallow" and "deep" (present if + supported). The mode that will be used on subsequent attempts + to suspend the system (by writing "mem" to the /sys/power/state + file described above) is enclosed in square brackets. + + Writing one of the above strings to this file causes the mode + represented by it to be used on subsequent attempts to suspend + the system. + + See Documentation/power/states.txt for more information. What: /sys/power/disk Date: September 2006 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 62d68b2056de..be2d6d0a03a4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1560,6 +1560,12 @@ disable Do not enable intel_pstate as the default scaling driver for the supported processors + passive + Use intel_pstate as a scaling driver, but configure it + to work with generic cpufreq governors (instead of + enabling its internal governor). This mode cannot be + used along with the hardware-managed P-states (HWP) + feature. force Enable intel_pstate on systems that prohibit it by default in favor of acpi-cpufreq. Forcing the intel_pstate driver @@ -1580,6 +1586,9 @@ Description Table, specifies preferred power management profile as "Enterprise Server" or "Performance Server", then this feature is turned on by default. + per_cpu_perf_limits + Allow per-logical-CPU P-State performance control limits using + cpufreq sysfs interface intremap= [X86-64, Intel-IOMMU] on enable Interrupt Remapping (default) @@ -2122,6 +2131,12 @@ memory contents and reserves bad memory regions that are detected. + mem_sleep_default= [SUSPEND] Default system suspend mode: + s2idle - Suspend-To-Idle + shallow - Power-On Suspend or equivalent (if supported) + deep - Suspend-To-RAM or equivalent (if supported) + See Documentation/power/states.txt. + meye.*= [HW] Set MotionEye Camera parameters See Documentation/video4linux/meye.txt. @@ -3475,13 +3490,6 @@ [KNL, SMP] Set scheduler's default relax_domain_level. See Documentation/cgroup-v1/cpusets.txt. - relative_sleep_states= - [SUSPEND] Use sleep state labeling where the deepest - state available other than hibernation is always "mem". - Format: { "0" | "1" } - 0 -- Traditional sleep state labels. - 1 -- Relative sleep state labels. - reserve= [KNL,BUGS] Force the kernel to ignore some iomem area reservetop= [X86-32] diff --git a/Documentation/cpu-freq/cpufreq-stats.txt b/Documentation/cpu-freq/cpufreq-stats.txt index 8d9773f23550..3c355f6ad834 100644 --- a/Documentation/cpu-freq/cpufreq-stats.txt +++ b/Documentation/cpu-freq/cpufreq-stats.txt @@ -44,11 +44,17 @@ the stats driver insertion. total 0 drwxr-xr-x 2 root root 0 May 14 16:06 . drwxr-xr-x 3 root root 0 May 14 15:58 .. +--w------- 1 root root 4096 May 14 16:06 reset -r--r--r-- 1 root root 4096 May 14 16:06 time_in_state -r--r--r-- 1 root root 4096 May 14 16:06 total_trans -r--r--r-- 1 root root 4096 May 14 16:06 trans_table -------------------------------------------------------------------------------- +- reset +Write-only attribute that can be used to reset the stat counters. This can be +useful for evaluating system behaviour under different governors without the +need for a reboot. + - time_in_state This gives the amount of time spent in each of the frequencies supported by this CPU. The cat output will have "<frequency> <time>" pair in each line, which diff --git a/Documentation/cpu-freq/intel-pstate.txt b/Documentation/cpu-freq/intel-pstate.txt index e6bd1e6512a5..1953994ef5e6 100644 --- a/Documentation/cpu-freq/intel-pstate.txt +++ b/Documentation/cpu-freq/intel-pstate.txt @@ -48,7 +48,7 @@ In addition to the frequency-controlling interfaces provided by the cpufreq core, the driver provides its own sysfs files to control the P-State selection. These files have been added to /sys/devices/system/cpu/intel_pstate/. Any changes made to these files are applicable to all CPUs (even in a -multi-package system). +multi-package system, Refer to later section on placing "Per-CPU limits"). max_perf_pct: Limits the maximum P-State that will be requested by the driver. It states it as a percentage of the available performance. The @@ -120,13 +120,57 @@ frequency is fictional for Intel Core processors. Even if the scaling driver selects a single P-State, the actual frequency the processor will run at is selected by the processor itself. +Per-CPU limits + +The kernel command line option "intel_pstate=per_cpu_perf_limits" forces +the intel_pstate driver to use per-CPU performance limits. When it is set, +the sysfs control interface described above is subject to limitations. +- The following controls are not available for both read and write + /sys/devices/system/cpu/intel_pstate/max_perf_pct + /sys/devices/system/cpu/intel_pstate/min_perf_pct +- The following controls can be used to set performance limits, as far as the +architecture of the processor permits: + /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq + /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq + /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor +- User can still observe turbo percent and number of P-States from + /sys/devices/system/cpu/intel_pstate/turbo_pct + /sys/devices/system/cpu/intel_pstate/num_pstates +- User can read write system wide turbo status + /sys/devices/system/cpu/no_turbo + +Support of energy performance hints +It is possible to provide hints to the HWP algorithms in the processor +to be more performance centric to more energy centric. When the driver +is using HWP, two additional cpufreq sysfs attributes are presented for +each logical CPU. +These attributes are: + - energy_performance_available_preferences + - energy_performance_preference + +To get list of supported hints: +$ cat energy_performance_available_preferences + default performance balance_performance balance_power power + +The current preference can be read or changed via cpufreq sysfs +attribute "energy_performance_preference". Reading from this attribute +will display current effective setting. User can write any of the valid +preference string to this attribute. User can always restore to power-on +default by writing "default". + +Since threads can migrate to different CPUs, this is possible that the +new CPU may have different energy performance preference than the previous +one. To avoid such issues, either threads can be pinned to specific CPUs +or set the same energy performance preference value to all CPUs. + Tuning Intel P-State driver -When HWP mode is not used, debugfs files have also been added to allow the -tuning of the internal governor algorithm. These files are located at -/sys/kernel/debug/pstate_snb/. The algorithm uses a PID (Proportional -Integral Derivative) controller. The PID tunable parameters are: +When the performance can be tuned using PID (Proportional Integral +Derivative) controller, debugfs files are provided for adjusting performance. +They are presented under: +/sys/kernel/debug/pstate_snb/ +The PID tunable parameters are: deadband d_gain_pct i_gain_pct diff --git a/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt b/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt new file mode 100644 index 000000000000..af2385795d78 --- /dev/null +++ b/Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt @@ -0,0 +1,78 @@ +Broadcom AVS mail box and interrupt register bindings +===================================================== + +A total of three DT nodes are required. One node (brcm,avs-cpu-data-mem) +references the mailbox register used to communicate with the AVS CPU[1]. The +second node (brcm,avs-cpu-l2-intr) is required to trigger an interrupt on +the AVS CPU. The interrupt tells the AVS CPU that it needs to process a +command sent to it by a driver. Interrupting the AVS CPU is mandatory for +commands to be processed. + +The interface also requires a reference to the AVS host interrupt controller, +so a driver can react to interrupts generated by the AVS CPU whenever a command +has been processed. See [2] for more information on the brcm,l2-intc node. + +[1] The AVS CPU is an independent co-processor that runs proprietary +firmware. On some SoCs, this firmware supports DFS and DVFS in addition to +Adaptive Voltage Scaling. + +[2] Documentation/devicetree/bindings/interrupt-controller/brcm,l2-intc.txt + + +Node brcm,avs-cpu-data-mem +-------------------------- + +Required properties: +- compatible: must include: brcm,avs-cpu-data-mem and + should include: one of brcm,bcm7271-avs-cpu-data-mem or + brcm,bcm7268-avs-cpu-data-mem +- reg: Specifies base physical address and size of the registers. +- interrupts: The interrupt that the AVS CPU will use to interrupt the host + when a command completed. +- interrupt-parent: The interrupt controller the above interrupt is routed + through. +- interrupt-names: The name of the interrupt used to interrupt the host. + +Optional properties: +- None + +Node brcm,avs-cpu-l2-intr +------------------------- + +Required properties: +- compatible: must include: brcm,avs-cpu-l2-intr and + should include: one of brcm,bcm7271-avs-cpu-l2-intr or + brcm,bcm7268-avs-cpu-l2-intr +- reg: Specifies base physical address and size of the registers. + +Optional properties: +- None + + +Example +======= + + avs_host_l2_intc: interrupt-controller@f04d1200 { + #interrupt-cells = <1>; + compatible = "brcm,l2-intc"; + interrupt-parent = <&intc>; + reg = <0xf04d1200 0x48>; + interrupt-controller; + interrupts = <0x0 0x19 0x0>; + interrupt-names = "avs"; + }; + + avs-cpu-data-mem@f04c4000 { + compatible = "brcm,bcm7271-avs-cpu-data-mem", + "brcm,avs-cpu-data-mem"; + reg = <0xf04c4000 0x60>; + interrupts = <0x1a>; + interrupt-parent = <&avs_host_l2_intc>; + interrupt-names = "sw_intr"; + }; + + avs-cpu-l2-intr@f04d1100 { + compatible = "brcm,bcm7271-avs-cpu-l2-intr", + "brcm,avs-cpu-l2-intr"; + reg = <0xf04d1100 0x10>; + }; diff --git a/Documentation/devicetree/bindings/opp/opp.txt b/Documentation/devicetree/bindings/opp/opp.txt index ee91cbdd95ee..9f5ca4457b5f 100644 --- a/Documentation/devicetree/bindings/opp/opp.txt +++ b/Documentation/devicetree/bindings/opp/opp.txt @@ -86,8 +86,14 @@ Optional properties: Single entry is for target voltage and three entries are for <target min max> voltages. - Entries for multiple regulators must be present in the same order as - regulators are specified in device's DT node. + Entries for multiple regulators shall be provided in the same field separated + by angular brackets <>. The OPP binding doesn't provide any provisions to + relate the values to their power supplies or the order in which the supplies + need to be configured and that is left for the implementation specific + binding. + + Entries for all regulators shall be of the same size, i.e. either all use a + single value or triplets. - opp-microvolt-<name>: Named opp-microvolt property. This is exactly similar to the above opp-microvolt property, but allows multiple voltage ranges to be @@ -104,10 +110,13 @@ Optional properties: Should only be set if opp-microvolt is set for the OPP. - Entries for multiple regulators must be present in the same order as - regulators are specified in device's DT node. If this property isn't required - for few regulators, then this should be marked as zero for them. If it isn't - required for any regulator, then this property need not be present. + Entries for multiple regulators shall be provided in the same field separated + by angular brackets <>. If current values aren't required for a regulator, + then it shall be filled with 0. If current values aren't required for any of + the regulators, then this field is not required. The OPP binding doesn't + provide any provisions to relate the values to their power supplies or the + order in which the supplies need to be configured and that is left for the + implementation specific binding. - opp-microamp-<name>: Named opp-microamp property. Similar to opp-microvolt-<name> property, but for microamp instead. @@ -386,10 +395,12 @@ Example 4: Handling multiple regulators / { cpus { cpu@0 { - compatible = "arm,cortex-a7"; + compatible = "vendor,cpu-type"; ... - cpu-supply = <&cpu_supply0>, <&cpu_supply1>, <&cpu_supply2>; + vcc0-supply = <&cpu_supply0>; + vcc1-supply = <&cpu_supply1>; + vcc2-supply = <&cpu_supply2>; operating-points-v2 = <&cpu0_opp_table>; }; }; diff --git a/Documentation/devicetree/bindings/power/domain-idle-state.txt b/Documentation/devicetree/bindings/power/domain-idle-state.txt new file mode 100644 index 000000000000..eefc7ed22ca2 --- /dev/null +++ b/Documentation/devicetree/bindings/power/domain-idle-state.txt @@ -0,0 +1,33 @@ +PM Domain Idle State Node: + +A domain idle state node represents the state parameters that will be used to +select the state when there are no active components in the domain. + +The state node has the following parameters - + +- compatible: + Usage: Required + Value type: <string> + Definition: Must be "domain-idle-state". + +- entry-latency-us + Usage: Required + Value type: <prop-encoded-array> + Definition: u32 value representing worst case latency in + microseconds required to enter the idle state. + The exit-latency-us duration may be guaranteed + only after entry-latency-us has passed. + +- exit-latency-us + Usage: Required + Value type: <prop-encoded-array> + Definition: u32 value representing worst case latency + in microseconds required to exit the idle state. + +- min-residency-us + Usage: Required + Value type: <prop-encoded-array> + Definition: u32 value representing minimum residency duration + in microseconds after which the idle state will yield + power benefits after overcoming the overhead in entering +i the idle state. diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt index 025b5e7df61c..723e1ad937da 100644 --- a/Documentation/devicetree/bindings/power/power_domain.txt +++ b/Documentation/devicetree/bindings/power/power_domain.txt @@ -29,6 +29,15 @@ Optional properties: specified by this binding. More details about power domain specifier are available in the next section. +- domain-idle-states : A phandle of an idle-state that shall be soaked into a + generic domain power state. The idle state definitions are + compatible with domain-idle-state specified in [1]. + The domain-idle-state property reflects the idle state of this PM domain and + not the idle states of the devices or sub-domains in the PM domain. Devices + and sub-domains have their own idle-states independent of the parent + domain's idle states. In the absence of this property, the domain would be + considered as capable of being powered-on or powered-off. + Example: power: power-controller@12340000 { @@ -59,6 +68,38 @@ The nodes above define two power controllers: 'parent' and 'child'. Domains created by the 'child' power controller are subdomains of '0' power domain provided by the 'parent' power controller. +Example 3: + parent: power-controller@12340000 { + compatible = "foo,power-controller"; + reg = <0x12340000 0x1000>; + #power-domain-cells = <0>; + domain-idle-states = <&DOMAIN_RET>, <&DOMAIN_PWR_DN>; + }; + + child: power-controller@12341000 { + compatible = "foo,power-controller"; + reg = <0x12341000 0x1000>; + power-domains = <&parent 0>; + #power-domain-cells = <0>; + domain-idle-states = <&DOMAIN_PWR_DN>; + }; + + DOMAIN_RET: state@0 { + compatible = "domain-idle-state"; + reg = <0x0>; + entry-latency-us = <1000>; + exit-latency-us = <2000>; + min-residency-us = <10000>; + }; + + DOMAIN_PWR_DN: state@1 { + compatible = "domain-idle-state"; + reg = <0x1>; + entry-latency-us = <5000>; + exit-latency-us = <8000>; + min-residency-us = <7000>; + }; + ==PM domain consumers== Required properties: @@ -76,3 +117,5 @@ Example: The node above defines a typical PM domain consumer device, which is located inside a PM domain with index 0 of a power controller represented by a node with the label "power". + +[1]. Documentation/devicetree/bindings/power/domain-idle-state.txt diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index 8ba6625fdd63..73ddea39a9ce 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt @@ -607,7 +607,9 @@ individually. Instead, a set of devices sharing a power resource can be put into a low-power state together at the same time by turning off the shared power resource. Of course, they also need to be put into the full-power state together, by turning the shared power resource on. A set of devices with this -property is often referred to as a power domain. +property is often referred to as a power domain. A power domain may also be +nested inside another power domain. The nested domain is referred to as the +sub-domain of the parent domain. Support for power domains is provided through the pm_domain field of struct device. This field is a pointer to an object of type struct dev_pm_domain, @@ -629,6 +631,16 @@ support for power domains into subsystem-level callbacks, for example by modifying the platform bus type. Other platforms need not implement it or take it into account in any way. +Devices may be defined as IRQ-safe which indicates to the PM core that their +runtime PM callbacks may be invoked with disabled interrupts (see +Documentation/power/runtime_pm.txt for more information). If an IRQ-safe +device belongs to a PM domain, the runtime PM of the domain will be +disallowed, unless the domain itself is defined as IRQ-safe. However, it +makes sense to define a PM domain as IRQ-safe only if all the devices in it +are IRQ-safe. Moreover, if an IRQ-safe domain has a parent domain, the runtime +PM of the parent is only allowed if the parent itself is IRQ-safe too with the +additional restriction that all child domains of an IRQ-safe parent must also +be IRQ-safe. Device Low Power (suspend) States --------------------------------- diff --git a/Documentation/power/states.txt b/Documentation/power/states.txt index 50f3ef9177c1..8a39ce45d8a0 100644 --- a/Documentation/power/states.txt +++ b/Documentation/power/states.txt @@ -8,25 +8,43 @@ for each state. The states are represented by strings that can be read or written to the /sys/power/state file. Those strings may be "mem", "standby", "freeze" and -"disk", where the last one always represents hibernation (Suspend-To-Disk) and -the meaning of the remaining ones depends on the relative_sleep_states command -line argument. - -For relative_sleep_states=1, the strings "mem", "standby" and "freeze" label the -available non-hibernation sleep states from the deepest to the shallowest, -respectively. In that case, "mem" is always present in /sys/power/state, -because there is at least one non-hibernation sleep state in every system. If -the given system supports two non-hibernation sleep states, "standby" is present -in /sys/power/state in addition to "mem". If the system supports three -non-hibernation sleep states, "freeze" will be present in /sys/power/state in -addition to "mem" and "standby". - -For relative_sleep_states=0, which is the default, the following descriptions -apply. - -state: Suspend-To-Idle +"disk", where the last three always represent Power-On Suspend (if supported), +Suspend-To-Idle and hibernation (Suspend-To-Disk), respectively. + +The meaning of the "mem" string is controlled by the /sys/power/mem_sleep file. +It contains strings representing the available modes of system suspend that may +be triggered by writing "mem" to /sys/power/state. These modes are "s2idle" +(Suspend-To-Idle), "shallow" (Power-On Suspend) and "deep" (Suspend-To-RAM). +The "s2idle" mode is always available, while the other ones are only available +if supported by the platform (if not supported, the strings representing them +are not present in /sys/power/mem_sleep). The string representing the suspend +mode to be used subsequently is enclosed in square brackets. Writing one of +the other strings present in /sys/power/mem_sleep to it causes the suspend mode +to be used subsequently to change to the one represented by that string. + +Consequently, there are two ways to cause the system to go into the +Suspend-To-Idle sleep state. The first one is to write "freeze" directly to +/sys/power/state. The second one is to write "s2idle" to /sys/power/mem_sleep +and then to wrtie "mem" to /sys/power/state. Similarly, there are two ways +to cause the system to go into the Power-On Suspend sleep state (the strings to +write to the control files in that case are "standby" or "shallow" and "mem", +respectively) if that state is supported by the platform. In turn, there is +only one way to cause the system to go into the Suspend-To-RAM state (write +"deep" into /sys/power/mem_sleep and "mem" into /sys/power/state). + +The default suspend mode (ie. the one to be used without writing anything into +/sys/power/mem_sleep) is either "deep" (if Suspend-To-RAM is supported) or +"s2idle", but it can be overridden by the value of the "mem_sleep_default" +parameter in the kernel command line. On some ACPI-based systems, depending on +the information in the FADT, the default may be "s2idle" even if Suspend-To-RAM +is supported. + +The properties of all of the sleep states are described below. + + +State: Suspend-To-Idle ACPI state: S0 -Label: "freeze" +Label: "s2idle" ("freeze") This state is a generic, pure software, light-weight, system sleep state. It allows more energy to be saved relative to runtime idle by freezing user @@ -35,13 +53,13 @@ lower-power than available at run time), such that the processors can spend more time in their idle states. This state can be used for platforms without Power-On Suspend/Suspend-to-RAM -support, or it can be used in addition to Suspend-to-RAM (memory sleep) -to provide reduced resume latency. It is always supported. +support, or it can be used in addition to Suspend-to-RAM to provide reduced +resume latency. It is always supported. State: Standby / Power-On Suspend ACPI State: S1 -Label: "standby" +Label: "shallow" ("standby") This state, if supported, offers moderate, though real, power savings, while providing a relatively low-latency transition back to a working system. No @@ -58,7 +76,7 @@ state. State: Suspend-to-RAM ACPI State: S3 -Label: "mem" +Label: "deep" This state, if supported, offers significant power savings as everything in the system is put into a low-power state, except for memory, which should be placed diff --git a/MAINTAINERS b/MAINTAINERS index 3d7d66cdb44c..34ef63763566 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2764,6 +2764,14 @@ L: bcm-kernel-feedback-list@broadcom.com S: Maintained F: drivers/mtd/nand/brcmnand/ +BROADCOM STB AVS CPUFREQ DRIVER +M: Markus Mayer <mmayer@broadcom.com> +M: bcm-kernel-feedback-list@broadcom.com +L: linux-pm@vger.kernel.org +S: Maintained +F: Documentation/devicetree/bindings/cpufreq/brcm,stb-avs-cpu-freq.txt +F: drivers/cpufreq/brcmstb* + BROADCOM SPECIFIC AMBA DRIVER (BCMA) M: Rafał Miłecki <zajec5@gmail.com> L: linux-wireless@vger.kernel.org @@ -3356,6 +3364,7 @@ L: linux-pm@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git T: git git://git.linaro.org/people/vireshk/linux.git (For ARM Updates) +B: https://bugzilla.kernel.org F: Documentation/cpu-freq/ F: drivers/cpufreq/ F: include/linux/cpufreq.h @@ -3395,6 +3404,7 @@ M: Daniel Lezcano <daniel.lezcano@linaro.org> L: linux-pm@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git +B: https://bugzilla.kernel.org F: drivers/cpuidle/* F: include/linux/cpuidle.h @@ -6362,9 +6372,11 @@ S: Maintained F: drivers/platform/x86/intel-vbtn.c INTEL IDLE DRIVER +M: Jacob Pan <jacob.jun.pan@linux.intel.com> M: Len Brown <lenb@kernel.org> L: linux-pm@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux.git +B: https://bugzilla.kernel.org S: Supported F: drivers/idle/intel_idle.c diff --git a/arch/arm/mach-imx/gpc.c b/arch/arm/mach-imx/gpc.c index b54db47f6f32..1dc2a34b9dbd 100644 --- a/arch/arm/mach-imx/gpc.c +++ b/arch/arm/mach-imx/gpc.c @@ -380,13 +380,6 @@ static struct pu_domain imx6q_pu_domain = { .name = "PU", .power_off = imx6q_pm_pu_power_off, .power_on = imx6q_pm_pu_power_on, - .states = { - [0] = { - .power_off_latency_ns = 25000, - .power_on_latency_ns = 2000000, - }, - }, - .state_count = 1, }, }; @@ -430,6 +423,16 @@ static int imx_gpc_genpd_init(struct device *dev, struct regulator *pu_reg) if (!IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS)) return 0; + imx6q_pu_domain.base.states = devm_kzalloc(dev, + sizeof(*imx6q_pu_domain.base.states), + GFP_KERNEL); + if (!imx6q_pu_domain.base.states) + return -ENOMEM; + + imx6q_pu_domain.base.states[0].power_off_latency_ns = 25000; + imx6q_pu_domain.base.states[0].power_on_latency_ns = 2000000; + imx6q_pu_domain.base.state_count = 1; + for (i = 0; i < ARRAY_SIZE(imx_gpc_domains); i++) pm_genpd_init(imx_gpc_domains[i], NULL, false); diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 169963f471bb..50b8ed0317a3 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 +#ifdef CONFIG_KASAN + /* + * The suspend path may have poisoned some areas deeper in the stack, + * which we now need to unpoison. + */ + movq %rsp, %rdi + call kasan_unpoison_task_stack_below +#endif + xorl %eax, %eax addq $8, %rsp FRAME_END diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 9634557a5444..ded2e8272382 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -11,6 +11,10 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/suspend.h> +#include <linux/scatterlist.h> +#include <linux/kdebug.h> + +#include <crypto/hash.h> #include <asm/init.h> #include <asm/proto.h> @@ -177,14 +181,86 @@ int pfn_is_nosave(unsigned long pfn) return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); } +#define MD5_DIGEST_SIZE 16 + struct restore_data_record { unsigned long jump_address; unsigned long jump_address_phys; unsigned long cr3; unsigned long magic; + u8 e820_digest[MD5_DIGEST_SIZE]; }; -#define RESTORE_MAGIC 0x123456789ABCDEF0UL +#define RESTORE_MAGIC 0x23456789ABCDEF01UL + +#if IS_BUILTIN(CONFIG_CRYPTO_MD5) +/** + * get_e820_md5 - calculate md5 according to given e820 map + * + * @map: the e820 map to be calculated + * @buf: the md5 result to be stored to + */ +static int get_e820_md5(struct e820map *map, void *buf) +{ + struct scatterlist sg; + struct crypto_ahash *tfm; + int size; + int ret = 0; + + tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return -ENOMEM; + + { + AHASH_REQUEST_ON_STACK(req, tfm); + size = offsetof(struct e820map, map) + + sizeof(struct e820entry) * map->nr_map; + ahash_request_set_tfm(req, tfm); + sg_init_one(&sg, (u8 *)map, size); + ahash_request_set_callback(req, 0, NULL, NULL); + ahash_request_set_crypt(req, &sg, buf, size); + + if (crypto_ahash_digest(req)) + ret = -EINVAL; + ahash_request_zero(req); + } + crypto_free_ahash(tfm); + + return ret; +} + +static void hibernation_e820_save(void *buf) +{ + get_e820_md5(e820_saved, buf); +} + +static bool hibernation_e820_mismatch(void *buf) +{ + int ret; + u8 result[MD5_DIGEST_SIZE]; + + memset(result, 0, MD5_DIGEST_SIZE); + /* If there is no digest in suspend kernel, let it go. */ + if (!memcmp(result, buf, MD5_DIGEST_SIZE)) + return false; + + ret = get_e820_md5(e820_saved, result); + if (ret) + return true; + + return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; +} +#else +static void hibernation_e820_save(void *buf) +{ +} + +static bool hibernation_e820_mismatch(void *buf) +{ + /* If md5 is not builtin for restore kernel, let it go. */ + return false; +} +#endif /** * arch_hibernation_header_save - populate the architecture specific part @@ -201,6 +277,9 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) rdr->jump_address_phys = __pa_symbol(&restore_registers); rdr->cr3 = restore_cr3; rdr->magic = RESTORE_MAGIC; + + hibernation_e820_save(rdr->e820_digest); + return 0; } @@ -216,5 +295,16 @@ int arch_hibernation_header_restore(void *addr) restore_jump_address = rdr->jump_address; jump_address_phys = rdr->jump_address_phys; restore_cr3 = rdr->cr3; - return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; + + if (rdr->magic != RESTORE_MAGIC) { + pr_crit("Unrecognized hibernate image header format!\n"); + return -EINVAL; + } + + if (hibernation_e820_mismatch(rdr->e820_digest)) { + pr_crit("Hibernate inconsistent memory map detected!\n"); + return -ENODEV; + } + + return 0; } diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index bb01dea39fdc..f0b4a981b8d3 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -157,7 +157,7 @@ static void acpi_processor_ppc_ost(acpi_handle handle, int status) status, NULL); } -int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag) +void acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag) { int ret; @@ -168,7 +168,7 @@ int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag) */ if (event_flag) acpi_processor_ppc_ost(pr->handle, 1); - return 0; + return; } ret = acpi_processor_get_platform_limit(pr); @@ -182,10 +182,8 @@ int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag) else acpi_processor_ppc_ost(pr->handle, 0); } - if (ret < 0) - return (ret); - else - return cpufreq_update_policy(pr->id); + if (ret >= 0) + cpufreq_update_policy(pr->id); } int acpi_processor_get_bios_limit(int cpu, unsigned int *limit) @@ -465,11 +463,33 @@ int acpi_processor_get_performance_info(struct acpi_processor *pr) return result; } EXPORT_SYMBOL_GPL(acpi_processor_get_performance_info); -int acpi_processor_notify_smm(struct module *calling_module) + +int acpi_processor_pstate_control(void) { acpi_status status; - static int is_done = 0; + if (!acpi_gbl_FADT.smi_command || !acpi_gbl_FADT.pstate_control) + return 0; + + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Writing pstate_control [0x%x] to smi_command [0x%x]\n", + acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command)); + + status = acpi_os_write_port(acpi_gbl_FADT.smi_command, + (u32)acpi_gbl_FADT.pstate_control, 8); + if (ACPI_SUCCESS(status)) + return 1; + + ACPI_EXCEPTION((AE_INFO, status, + "Failed to write pstate_control [0x%x] to smi_command [0x%x]", + acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command)); + return -EIO; +} + +int acpi_processor_notify_smm(struct module *calling_module) +{ + static int is_done = 0; + int result; if (!(acpi_processor_ppc_status & PPC_REGISTERED)) return -EBUSY; @@ -492,26 +512,15 @@ int acpi_processor_notify_smm(struct module *calling_module) is_done = -EIO; - /* Can't write pstate_control to smi_command if either value is zero */ - if ((!acpi_gbl_FADT.smi_command) || (!acpi_gbl_FADT.pstate_control)) { + result = acpi_processor_pstate_control(); + if (!result) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No SMI port or pstate_control\n")); module_put(calling_module); return 0; } - - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Writing pstate_control [0x%x] to smi_command [0x%x]\n", - acpi_gbl_FADT.pstate_control, acpi_gbl_FADT.smi_command)); - - status = acpi_os_write_port(acpi_gbl_FADT.smi_command, - (u32) acpi_gbl_FADT.pstate_control, 8); - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, - "Failed to write pstate_control [0x%x] to " - "smi_command [0x%x]", acpi_gbl_FADT.pstate_control, - acpi_gbl_FADT.smi_command)); + if (result < 0) { module_put(calling_module); - return status; + return result; } /* Success. If there's no _PPC, we need to fear nothing, so diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 54abb26b7366..9b6cebe227a0 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -674,6 +674,14 @@ static void acpi_sleep_suspend_setup(void) if (acpi_sleep_state_supported(i)) sleep_states[i] = 1; + /* + * Use suspend-to-idle by default if ACPI_FADT_LOW_POWER_S0 is set and + * the default suspend mode was not selected from the command line. + */ + if (acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0 && + mem_sleep_default > PM_SUSPEND_MEM) + mem_sleep_default = PM_SUSPEND_FREEZE; + suspend_set_ops(old_suspend_ordering ? &acpi_suspend_ops_old : &acpi_suspend_ops); freeze_set_ops(&acpi_freeze_ops); diff --git a/drivers/base/power/domain.c b/drivers/base/power/domain.c index e023066e4215..5711708532db 100644 --- a/drivers/base/power/domain.c +++ b/drivers/base/power/domain.c @@ -39,6 +39,105 @@ static LIST_HEAD(gpd_list); static DEFINE_MUTEX(gpd_list_lock); +struct genpd_lock_ops { + void (*lock)(struct generic_pm_domain *genpd); + void (*lock_nested)(struct generic_pm_domain *genpd, int depth); + int (*lock_interruptible)(struct generic_pm_domain *genpd); + void (*unlock)(struct generic_pm_domain *genpd); +}; + +static void genpd_lock_mtx(struct generic_pm_domain *genpd) +{ + mutex_lock(&genpd->mlock); +} + +static void genpd_lock_nested_mtx(struct generic_pm_domain *genpd, + int depth) +{ + mutex_lock_nested(&genpd->mlock, depth); +} + +static int genpd_lock_interruptible_mtx(struct generic_pm_domain *genpd) +{ + return mutex_lock_interruptible(&genpd->mlock); +} + +static void genpd_unlock_mtx(struct generic_pm_domain *genpd) +{ + return mutex_unlock(&genpd->mlock); +} + +static const struct genpd_lock_ops genpd_mtx_ops = { + .lock = genpd_lock_mtx, + .lock_nested = genpd_lock_nested_mtx, + .lock_interruptible = genpd_lock_interruptible_mtx, + .unlock = genpd_unlock_mtx, +}; + +static void genpd_lock_spin(struct generic_pm_domain *genpd) + __acquires(&genpd->slock) +{ + unsigned long flags; + + spin_lock_irqsave(&genpd->slock, flags); + genpd->lock_flags = flags; +} + +static void genpd_lock_nested_spin(struct generic_pm_domain *genpd, + int depth) + __acquires(&genpd->slock) +{ + unsigned long flags; + + spin_lock_irqsave_nested(&genpd->slock, flags, depth); + genpd->lock_flags = flags; +} + +static int genpd_lock_interruptible_spin(struct generic_pm_domain *genpd) + __acquires(&genpd->slock) +{ + unsigned long flags; + + spin_lock_irqsave(&genpd->slock, flags); + genpd->lock_flags = flags; + return 0; +} + +static void genpd_unlock_spin(struct generic_pm_domain *genpd) + __releases(&genpd->slock) +{ + spin_unlock_irqrestore(&genpd->slock, genpd->lock_flags); +} + +static const struct genpd_lock_ops genpd_spin_ops = { + .lock = genpd_lock_spin, + .lock_nested = genpd_lock_nested_spin, + .lock_interruptible = genpd_lock_interruptible_spin, + .unlock = genpd_unlock_spin, +}; + +#define genpd_lock(p) p->lock_ops->lock(p) +#define genpd_lock_nested(p, d) p->lock_ops->lock_nested(p, d) +#define genpd_lock_interruptible(p) p->lock_ops->lock_interruptible(p) +#define genpd_unlock(p) p->lock_ops->unlock(p) + +#define genpd_is_irq_safe(genpd) (genpd->flags & GENPD_FLAG_IRQ_SAFE) + +static inline bool irq_safe_dev_in_no_sleep_domain(struct device *dev, + struct generic_pm_domain *genpd) +{ + bool ret; + + ret = pm_runtime_is_irq_safe(dev) && !genpd_is_irq_safe(genpd); + + /* Warn once for each IRQ safe dev in no sleep domain */ + if (ret) + dev_warn_once(dev, "PM domain %s will not be powered off\n", + genpd->name); + + return ret; +} + /* * Get the generic PM domain for a particular struct device. * This validates the struct device pointer, the PM domain pointer, @@ -200,9 +299,9 @@ static int genpd_poweron(struct generic_pm_domain *genpd, unsigned int depth) genpd_sd_counter_inc(master); - mutex_lock_nested(&master->lock, depth + 1); + genpd_lock_nested(master, depth + 1); ret = genpd_poweron(master, depth + 1); - mutex_unlock(&master->lock); + genpd_unlock(master); if (ret) { genpd_sd_counter_dec(master); @@ -255,9 +354,9 @@ static int genpd_dev_pm_qos_notifier(struct notifier_block *nb, spin_unlock_irq(&dev->power.lock); if (!IS_ERR(genpd)) { - mutex_lock(&genpd->lock); + genpd_lock(genpd); genpd->max_off_time_changed = true; - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); } dev = dev->parent; @@ -303,7 +402,12 @@ static int genpd_poweroff(struct generic_pm_domain *genpd, bool is_async) if (stat > PM_QOS_FLAGS_NONE) return -EBUSY; - if (!pm_runtime_suspended(pdd->dev) || pdd->dev->power.irq_safe) + /* + * Do not allow PM domain to be powered off, when an IRQ safe + * device is part of a non-IRQ safe domain. + */ + if (!pm_runtime_suspended(pdd->dev) || + irq_safe_dev_in_no_sleep_domain(pdd->dev, genpd)) not_suspended++; } @@ -354,9 +458,9 @@ static void genpd_power_off_work_fn(struct work_struct *work) genpd = container_of(work, struct generic_pm_domain, power_off_work); - mutex_lock(&genpd->lock); + genpd_lock(genpd); genpd_poweroff(genpd, true); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); } /** @@ -466,15 +570,15 @@ static int genpd_runtime_suspend(struct device *dev) } /* - * If power.irq_safe is set, this routine will be run with interrupts - * off, so it can't use mutexes. + * If power.irq_safe is set, this routine may be run with + * IRQs disabled, so suspend only if the PM domain also is irq_safe. */ - if (dev->power.irq_safe) + if (irq_safe_dev_in_no_sleep_domain(dev, genpd)) return 0; - mutex_lock(&genpd->lock); + genpd_lock(genpd); genpd_poweroff(genpd, false); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); return 0; } @@ -503,15 +607,18 @@ static int genpd_runtime_resume(struct device *dev) if (IS_ERR(genpd)) return -EINVAL; - /* If power.irq_safe, the PM domain is never powered off. */ - if (dev->power.irq_safe) { + /* + * As we don't power off a non IRQ safe domain, which holds + * an IRQ safe device, we don't need to restore power to it. + */ + if (irq_safe_dev_in_no_sleep_domain(dev, genpd)) { timed = false; goto out; } - mutex_lock(&genpd->lock); + genpd_lock(genpd); ret = genpd_poweron(genpd, 0); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); if (ret) return ret; @@ -546,10 +653,11 @@ static int genpd_runtime_resume(struct device *dev) err_stop: genpd_stop_dev(genpd, dev); err_poweroff: - if (!dev->power.irq_safe) { - mutex_lock(&genpd->lock); + if (!pm_runtime_is_irq_safe(dev) || + (pm_runtime_is_irq_safe(dev) && genpd_is_irq_safe(genpd))) { + genpd_lock(genpd); genpd_poweroff(genpd, 0); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); } return ret; @@ -732,20 +840,20 @@ static int pm_genpd_prepare(struct device *dev) if (resume_needed(dev, genpd)) pm_runtime_resume(dev); - mutex_lock(&genpd->lock); + genpd_lock(genpd); if (genpd->prepared_count++ == 0) genpd->suspended_count = 0; - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); ret = pm_generic_prepare(dev); if (ret) { - mutex_lock(&genpd->lock); + genpd_lock(genpd); genpd->prepared_count--; - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); } return ret; @@ -936,13 +1044,13 @@ static void pm_genpd_complete(struct device *dev) pm_generic_complete(dev); - mutex_lock(&genpd->lock); + genpd_lock(genpd); genpd->prepared_count--; if (!genpd->prepared_count) genpd_queue_power_off_work(genpd); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); } /** @@ -1071,7 +1179,7 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, if (IS_ERR(gpd_data)) return PTR_ERR(gpd_data); - mutex_lock(&genpd->lock); + genpd_lock(genpd); if (genpd->prepared_count > 0) { ret = -EAGAIN; @@ -1088,7 +1196,7 @@ static int genpd_add_device(struct generic_pm_domain *genpd, struct device *dev, list_add_tail(&gpd_data->base.list_node, &genpd->dev_list); out: - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); if (ret) genpd_free_dev_data(dev, gpd_data); @@ -1130,7 +1238,7 @@ static int genpd_remove_device(struct generic_pm_domain *genpd, gpd_data = to_gpd_data(pdd); dev_pm_qos_remove_notifier(dev, &gpd_data->nb); - mutex_lock(&genpd->lock); + genpd_lock(genpd); if (genpd->prepared_count > 0) { ret = -EAGAIN; @@ -1145,14 +1253,14 @@ static int genpd_remove_device(struct generic_pm_domain *genpd, list_del_init(&pdd->list_node); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); genpd_free_dev_data(dev, gpd_data); return 0; out: - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); dev_pm_qos_add_notifier(dev, &gpd_data->nb); return ret; @@ -1183,12 +1291,23 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd, || genpd == subdomain) return -EINVAL; + /* + * If the domain can be powered on/off in an IRQ safe + * context, ensure that the subdomain can also be + * powered on/off in that context. + */ + if (!genpd_is_irq_safe(genpd) && genpd_is_irq_safe(subdomain)) { + WARN(1, "Parent %s of subdomain %s must be IRQ safe\n", + genpd->name, subdomain->name); + return -EINVAL; + } + link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) return -ENOMEM; - mutex_lock(&subdomain->lock); - mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING); + genpd_lock(subdomain); + genpd_lock_nested(genpd, SINGLE_DEPTH_NESTING); if (genpd->status == GPD_STATE_POWER_OFF && subdomain->status != GPD_STATE_POWER_OFF) { @@ -1211,8 +1330,8 @@ static int genpd_add_subdomain(struct generic_pm_domain *genpd, genpd_sd_counter_inc(genpd); out: - mutex_unlock(&genpd->lock); - mutex_unlock(&subdomain->lock); + genpd_unlock(genpd); + genpd_unlock(subdomain); if (ret) kfree(link); return ret; @@ -1250,8 +1369,8 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, if (IS_ERR_OR_NULL(genpd) || IS_ERR_OR_NULL(subdomain)) return -EINVAL; - mutex_lock(&subdomain->lock); - mutex_lock_nested(&genpd->lock, SINGLE_DEPTH_NESTING); + genpd_lock(subdomain); + genpd_lock_nested(genpd, SINGLE_DEPTH_NESTING); if (!list_empty(&subdomain->master_links) || subdomain->device_count) { pr_warn("%s: unable to remove subdomain %s\n", genpd->name, @@ -1275,13 +1394,39 @@ int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd, } out: - mutex_unlock(&genpd->lock); - mutex_unlock(&subdomain->lock); + genpd_unlock(genpd); + genpd_unlock(subdomain); return ret; } EXPORT_SYMBOL_GPL(pm_genpd_remove_subdomain); +static int genpd_set_default_power_state(struct generic_pm_domain *genpd) +{ + struct genpd_power_state *state; + + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return -ENOMEM; + + genpd->states = state; + genpd->state_count = 1; + genpd->free = state; + + return 0; +} + +static void genpd_lock_init(struct generic_pm_domain *genpd) +{ + if (genpd->flags & GENPD_FLAG_IRQ_SAFE) { + spin_lock_init(&genpd->slock); + genpd->lock_ops = &genpd_spin_ops; + } else { + mutex_init(&genpd->mlock); + genpd->lock_ops = &genpd_mtx_ops; + } +} + /** * pm_genpd_init - Initialize a generic I/O PM domain object. * @genpd: PM domain object to initialize. @@ -1293,13 +1438,15 @@ EXPORT_SYMBOL_GPL(pm_genpd_remove_subdomain); int pm_genpd_init(struct generic_pm_domain *genpd, struct dev_power_governor *gov, bool is_off) { + int ret; + if (IS_ERR_OR_NULL(genpd)) return -EINVAL; INIT_LIST_HEAD(&genpd->master_links); INIT_LIST_HEAD(&genpd->slave_links); INIT_LIST_HEAD(&genpd->dev_list); - mutex_init(&genpd->lock); + genpd_lock_init(genpd); genpd->gov = gov; INIT_WORK(&genpd->power_off_work, genpd_power_off_work_fn); atomic_set(&genpd->sd_count, 0); @@ -1325,19 +1472,12 @@ int pm_genpd_init(struct generic_pm_domain *genpd, genpd->dev_ops.start = pm_clk_resume; } - if (genpd->state_idx >= GENPD_MAX_NUM_STATES) { - pr_warn("Initial state index out of bounds.\n"); - genpd->state_idx = GENPD_MAX_NUM_STATES - 1; - } - - if (genpd->state_count > GENPD_MAX_NUM_STATES) { - pr_warn("Limiting states to %d\n", GENPD_MAX_NUM_STATES); - genpd->state_count = GENPD_MAX_NUM_STATES; - } - /* Use only one "off" state if there were no states declared */ - if (genpd->state_count == 0) - genpd->state_count = 1; + if (genpd->state_count == 0) { + ret = genpd_set_default_power_state(genpd); + if (ret) + return ret; + } mutex_lock(&gpd_list_lock); list_add(&genpd->gpd_list_node, &gpd_list); @@ -1354,16 +1494,16 @@ static int genpd_remove(struct generic_pm_domain *genpd) if (IS_ERR_OR_NULL(genpd)) return -EINVAL; - mutex_lock(&genpd->lock); + genpd_lock(genpd); if (genpd->has_provider) { - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); pr_err("Provider present, unable to remove %s\n", genpd->name); return -EBUSY; } if (!list_empty(&genpd->master_links) || genpd->device_count) { - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); pr_err("%s: unable to remove %s\n", __func__, genpd->name); return -EBUSY; } @@ -1375,8 +1515,9 @@ static int genpd_remove(struct generic_pm_domain *genpd) } list_del(&genpd->gpd_list_node); - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); cancel_work_sync(&genpd->power_off_work); + kfree(genpd->free); pr_debug("%s: removed %s\n", __func__, genpd->name); return 0; @@ -1890,21 +2031,117 @@ int genpd_dev_pm_attach(struct device *dev) mutex_unlock(&gpd_list_lock); if (ret < 0) { - dev_err(dev, "failed to add to PM domain %s: %d", - pd->name, ret); + if (ret != -EPROBE_DEFER) + dev_err(dev, "failed to add to PM domain %s: %d", + pd->name, ret); goto out; } dev->pm_domain->detach = genpd_dev_pm_detach; dev->pm_domain->sync = genpd_dev_pm_sync; - mutex_lock(&pd->lock); + genpd_lock(pd); ret = genpd_poweron(pd, 0); - mutex_unlock(&pd->lock); + genpd_unlock(pd); out: return ret ? -EPROBE_DEFER : 0; } EXPORT_SYMBOL_GPL(genpd_dev_pm_attach); + +static const struct of_device_id idle_state_match[] = { + { .compatible = "domain-idle-state", }, + { } +}; + +static int genpd_parse_state(struct genpd_power_state *genpd_state, + struct device_node *state_node) +{ + int err; + u32 residency; + u32 entry_latency, exit_latency; + const struct of_device_id *match_id; + + match_id = of_match_node(idle_state_match, state_node); + if (!match_id) + return -EINVAL; + + err = of_property_read_u32(state_node, "entry-latency-us", + &entry_latency); + if (err) { + pr_debug(" * %s missing entry-latency-us property\n", + state_node->full_name); + return -EINVAL; + } + + err = of_property_read_u32(state_node, "exit-latency-us", + &exit_latency); + if (err) { + pr_debug(" * %s missing exit-latency-us property\n", + state_node->full_name); + return -EINVAL; + } + + err = of_property_read_u32(state_node, "min-residency-us", &residency); + if (!err) + genpd_state->residency_ns = 1000 * residency; + + genpd_state->power_on_latency_ns = 1000 * exit_latency; + genpd_state->power_off_latency_ns = 1000 * entry_latency; + genpd_state->fwnode = &state_node->fwnode; + + return 0; +} + +/** + * of_genpd_parse_idle_states: Return array of idle states for the genpd. + * + * @dn: The genpd device node + * @states: The pointer to which the state array will be saved. + * @n: The count of elements in the array returned from this function. + * + * Returns the device states parsed from the OF node. The memory for the states + * is allocated by this function and is the responsibility of the caller to + * free the memory after use. + */ +int of_genpd_parse_idle_states(struct device_node *dn, + struct genpd_power_state **states, int *n) +{ + struct genpd_power_state *st; + struct device_node *np; + int i = 0; + int err, ret; + int count; + struct of_phandle_iterator it; + + count = of_count_phandle_with_args(dn, "domain-idle-states", NULL); + if (count <= 0) + return -EINVAL; + + st = kcalloc(count, sizeof(*st), GFP_KERNEL); + if (!st) + return -ENOMEM; + + /* Loop over the phandles until all the requested entry is found */ + of_for_each_phandle(&it, err, dn, "domain-idle-states", NULL, 0) { + np = it.node; + ret = genpd_parse_state(&st[i++], np); + if (ret) { + pr_err + ("Parsing idle state node %s failed with err %d\n", + np->full_name, ret); + of_node_put(np); + kfree(st); + return ret; + } + } + + *n = count; + *states = st; + + return 0; +} +EXPORT_SYMBOL_GPL(of_genpd_parse_idle_states); + #endif /* CONFIG_PM_GENERIC_DOMAINS_OF */ @@ -1958,7 +2195,7 @@ static int pm_genpd_summary_one(struct seq_file *s, char state[16]; int ret; - ret = mutex_lock_interruptible(&genpd->lock); + ret = genpd_lock_interruptible(genpd); if (ret) return -ERESTARTSYS; @@ -1984,7 +2221,9 @@ static int pm_genpd_summary_one(struct seq_file *s, } list_for_each_entry(pm_data, &genpd->dev_list, list_node) { - kobj_path = kobject_get_path(&pm_data->dev->kobj, GFP_KERNEL); + kobj_path = kobject_get_path(&pm_data->dev->kobj, + genpd_is_irq_safe(genpd) ? + GFP_ATOMIC : GFP_KERNEL); if (kobj_path == NULL) continue; @@ -1995,7 +2234,7 @@ static int pm_genpd_summary_one(struct seq_file *s, seq_puts(s, "\n"); exit: - mutex_unlock(&genpd->lock); + genpd_unlock(genpd); return 0; } diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 2932a5bd892f..eb474c882ebe 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1460,10 +1460,10 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) dpm_watchdog_clear(&wd); Complete: - complete_all(&dev->power.completion); if (error) async_error = error; + complete_all(&dev->power.completion); TRACE_SUSPEND(error); return error; } diff --git a/drivers/base/power/opp/core.c b/drivers/base/power/opp/core.c index 4c7c6da7a989..35ff06283738 100644 --- a/drivers/base/power/opp/core.c +++ b/drivers/base/power/opp/core.c @@ -93,6 +93,8 @@ struct opp_table *_find_opp_table(struct device *dev) * Return: voltage in micro volt corresponding to the opp, else * return 0 * + * This is useful only for devices with single power supply. + * * Locking: This function must be called under rcu_read_lock(). opp is a rcu * protected pointer. This means that opp which could have been fetched by * opp_find_freq_{exact,ceil,floor} functions is valid as long as we are @@ -112,7 +114,7 @@ unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp) if (IS_ERR_OR_NULL(tmp_opp)) pr_err("%s: Invalid parameters\n", __func__); else - v = tmp_opp->u_volt; + v = tmp_opp->supplies[0].u_volt; return v; } @@ -210,6 +212,24 @@ unsigned long dev_pm_opp_get_max_clock_latency(struct device *dev) } EXPORT_SYMBOL_GPL(dev_pm_opp_get_max_clock_latency); +static int _get_regulator_count(struct device *dev) +{ + struct opp_table *opp_table; + int count; + + rcu_read_lock(); + + opp_table = _find_opp_table(dev); + if (!IS_ERR(opp_table)) + count = opp_table->regulator_count; + else + count = 0; + + rcu_read_unlock(); + + return count; +} + /** * dev_pm_opp_get_max_volt_latency() - Get max voltage latency in nanoseconds * @dev: device for which we do this operation @@ -222,34 +242,51 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) { struct opp_table *opp_table; struct dev_pm_opp *opp; - struct regulator *reg; + struct regulator *reg, **regulators; unsigned long latency_ns = 0; - unsigned long min_uV = ~0, max_uV = 0; - int ret; + int ret, i, count; + struct { + unsigned long min; + unsigned long max; + } *uV; + + count = _get_regulator_count(dev); + + /* Regulator may not be required for the device */ + if (!count) + return 0; + + regulators = kmalloc_array(count, sizeof(*regulators), GFP_KERNEL); + if (!regulators) + return 0; + + uV = kmalloc_array(count, sizeof(*uV), GFP_KERNEL); + if (!uV) + goto free_regulators; rcu_read_lock(); opp_table = _find_opp_table(dev); if (IS_ERR(opp_table)) { rcu_read_unlock(); - return 0; + goto free_uV; } - reg = opp_table->regulator; - if (IS_ERR(reg)) { - /* Regulator may not be required for device */ - rcu_read_unlock(); - return 0; - } + memcpy(regulators, opp_table->regulators, count * sizeof(*regulators)); - list_for_each_entry_rcu(opp, &opp_table->opp_list, node) { - if (!opp->available) - continue; + for (i = 0; i < count; i++) { + uV[i].min = ~0; + uV[i].max = 0; + + list_for_each_entry_rcu(opp, &opp_table->opp_list, node) { + if (!opp->available) + continue; - if (opp->u_volt_min < min_uV) - min_uV = opp->u_volt_min; - if (opp->u_volt_max > max_uV) - max_uV = opp->u_volt_max; + if (opp->supplies[i].u_volt_min < uV[i].min) + uV[i].min = opp->supplies[i].u_volt_min; + if (opp->supplies[i].u_volt_max > uV[i].max) + uV[i].max = opp->supplies[i].u_volt_max; + } } rcu_read_unlock(); @@ -258,9 +295,16 @@ unsigned long dev_pm_opp_get_max_volt_latency(struct device *dev) * The caller needs to ensure that opp_table (and hence the regulator) * isn't freed, while we are executing this routine. */ - ret = regulator_set_voltage_time(reg, min_uV, max_uV); - if (ret > 0) - latency_ns = ret * 1000; + for (i = 0; reg = regulators[i], i < count; i++) { + ret = regulator_set_voltage_time(reg, uV[i].min, uV[i].max); + if (ret > 0) + latency_ns += ret * 1000; + } + +free_uV: + kfree(uV); +free_regulators: + kfree(regulators); return latency_ns; } @@ -542,8 +586,7 @@ unlock: } static int _set_opp_voltage(struct device *dev, struct regulator *reg, - unsigned long u_volt, unsigned long u_volt_min, - unsigned long u_volt_max) + struct dev_pm_opp_supply *supply) { int ret; @@ -554,14 +597,78 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg, return 0; } - dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__, u_volt_min, - u_volt, u_volt_max); + dev_dbg(dev, "%s: voltages (mV): %lu %lu %lu\n", __func__, + supply->u_volt_min, supply->u_volt, supply->u_volt_max); - ret = regulator_set_voltage_triplet(reg, u_volt_min, u_volt, - u_volt_max); + ret = regulator_set_voltage_triplet(reg, supply->u_volt_min, + supply->u_volt, supply->u_volt_max); if (ret) dev_err(dev, "%s: failed to set voltage (%lu %lu %lu mV): %d\n", - __func__, u_volt_min, u_volt, u_volt_max, ret); + __func__, supply->u_volt_min, supply->u_volt, + supply->u_volt_max, ret); + + return ret; +} + +static inline int +_generic_set_opp_clk_only(struct device *dev, struct clk *clk, + unsigned long old_freq, unsigned long freq) +{ + int ret; + + ret = clk_set_rate(clk, freq); + if (ret) { + dev_err(dev, "%s: failed to set clock rate: %d\n", __func__, + ret); + } + + return ret; +} + +static int _generic_set_opp(struct dev_pm_set_opp_data *data) +{ + struct dev_pm_opp_supply *old_supply = data->old_opp.supplies; + struct dev_pm_opp_supply *new_supply = data->new_opp.supplies; + unsigned long old_freq = data->old_opp.rate, freq = data->new_opp.rate; + struct regulator *reg = data->regulators[0]; + struct device *dev= data->dev; + int ret; + + /* This function only supports single regulator per device */ + if (WARN_ON(data->regulator_count > 1)) { + dev_err(dev, "multiple regulators are not supported\n"); + return -EINVAL; + } + + /* Scaling up? Scale voltage before frequency */ + if (freq > old_freq) { + ret = _set_opp_voltage(dev, reg, new_supply); + if (ret) + goto restore_voltage; + } + + /* Change frequency */ + ret = _generic_set_opp_clk_only(dev, data->clk, old_freq, freq); + if (ret) + goto restore_voltage; + + /* Scaling down? Scale voltage after frequency */ + if (freq < old_freq) { + ret = _set_opp_voltage(dev, reg, new_supply); + if (ret) + goto restore_freq; + } + + return 0; + +restore_freq: + if (_generic_set_opp_clk_only(dev, data->clk, freq, old_freq)) + dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n", + __func__, old_freq); +restore_voltage: + /* This shouldn't harm even if the voltages weren't updated earlier */ + if (old_supply->u_volt) + _set_opp_voltage(dev, reg, old_supply); return ret; } @@ -579,12 +686,13 @@ static int _set_opp_voltage(struct device *dev, struct regulator *reg, int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) { struct opp_table *opp_table; + unsigned long freq, old_freq; + int (*set_opp)(struct dev_pm_set_opp_data *data); struct dev_pm_opp *old_opp, *opp; - struct regulator *reg; + struct regulator **regulators; + struct dev_pm_set_opp_data *data; struct clk *clk; - unsigned long freq, old_freq; - unsigned long u_volt, u_volt_min, u_volt_max; - int ret; + int ret, size; if (unlikely(!target_freq)) { dev_err(dev, "%s: Invalid target frequency %lu\n", __func__, @@ -633,55 +741,41 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) return ret; } - u_volt = opp->u_volt; - u_volt_min = opp->u_volt_min; - u_volt_max = opp->u_volt_max; + dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", __func__, + old_freq, freq); - reg = opp_table->regulator; + regulators = opp_table->regulators; - rcu_read_unlock(); - - /* Scaling up? Scale voltage before frequency */ - if (freq > old_freq) { - ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min, - u_volt_max); - if (ret) - goto restore_voltage; - } - - /* Change frequency */ - - dev_dbg(dev, "%s: switching OPP: %lu Hz --> %lu Hz\n", - __func__, old_freq, freq); - - ret = clk_set_rate(clk, freq); - if (ret) { - dev_err(dev, "%s: failed to set clock rate: %d\n", __func__, - ret); - goto restore_voltage; + /* Only frequency scaling */ + if (!regulators) { + rcu_read_unlock(); + return _generic_set_opp_clk_only(dev, clk, old_freq, freq); } - /* Scaling down? Scale voltage after frequency */ - if (freq < old_freq) { - ret = _set_opp_voltage(dev, reg, u_volt, u_volt_min, - u_volt_max); - if (ret) - goto restore_freq; - } + if (opp_table->set_opp) + set_opp = opp_table->set_opp; + else + set_opp = _generic_set_opp; + + data = opp_table->set_opp_data; + data->regulators = regulators; + data->regulator_count = opp_table->regulator_count; + data->clk = clk; + data->dev = dev; + + data->old_opp.rate = old_freq; + size = sizeof(*opp->supplies) * opp_table->regulator_count; + if (IS_ERR(old_opp)) + memset(data->old_opp.supplies, 0, size); + else + memcpy(data->old_opp.supplies, old_opp->supplies, size); - return 0; + data->new_opp.rate = freq; + memcpy(data->new_opp.supplies, opp->supplies, size); -restore_freq: - if (clk_set_rate(clk, old_freq)) - dev_err(dev, "%s: failed to restore old-freq (%lu Hz)\n", - __func__, old_freq); -restore_voltage: - /* This shouldn't harm even if the voltages weren't updated earlier */ - if (!IS_ERR(old_opp)) - _set_opp_voltage(dev, reg, old_opp->u_volt, - old_opp->u_volt_min, old_opp->u_volt_max); + rcu_read_unlock(); - return ret; + return set_opp(data); } EXPORT_SYMBOL_GPL(dev_pm_opp_set_rate); @@ -764,9 +858,6 @@ static struct opp_table *_add_opp_table(struct device *dev) _of_init_opp_table(opp_table, dev); - /* Set regulator to a non-NULL error value */ - opp_table->regulator = ERR_PTR(-ENXIO); - /* Find clk for the device */ opp_table->clk = clk_get(dev, NULL); if (IS_ERR(opp_table->clk)) { @@ -815,7 +906,10 @@ static void _remove_opp_table(struct opp_table *opp_table) if (opp_table->prop_name) return; - if (!IS_ERR(opp_table->regulator)) + if (opp_table->regulators) + return; + + if (opp_table->set_opp) return; /* Release clk */ @@ -924,34 +1018,50 @@ struct dev_pm_opp *_allocate_opp(struct device *dev, struct opp_table **opp_table) { struct dev_pm_opp *opp; + int count, supply_size; + struct opp_table *table; - /* allocate new OPP node */ - opp = kzalloc(sizeof(*opp), GFP_KERNEL); - if (!opp) + table = _add_opp_table(dev); + if (!table) return NULL; - INIT_LIST_HEAD(&opp->node); + /* Allocate space for at least one supply */ + count = table->regulator_count ? table->regulator_count : 1; + supply_size = sizeof(*opp->supplies) * count; - *opp_table = _add_opp_table(dev); - if (!*opp_table) { - kfree(opp); + /* allocate new OPP node and supplies structures */ + opp = kzalloc(sizeof(*opp) + supply_size, GFP_KERNEL); + if (!opp) { + kfree(table); return NULL; } + /* Put the supplies at the end of the OPP structure as an empty array */ + opp->supplies = (struct dev_pm_opp_supply *)(opp + 1); + INIT_LIST_HEAD(&opp->node); + + *opp_table = table; + return opp; } static bool _opp_supported_by_regulators(struct dev_pm_opp *opp, struct opp_table *opp_table) { - struct regulator *reg = opp_table->regulator; - - if (!IS_ERR(reg) && - !regulator_is_supported_voltage(reg, opp->u_volt_min, - opp->u_volt_max)) { - pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n", - __func__, opp->u_volt_min, opp->u_volt_max); - return false; + struct regulator *reg; + int i; + + for (i = 0; i < opp_table->regulator_count; i++) { + reg = opp_table->regulators[i]; + + if (!regulator_is_supported_voltage(reg, + opp->supplies[i].u_volt_min, + opp->supplies[i].u_volt_max)) { + pr_warn("%s: OPP minuV: %lu maxuV: %lu, not supported by regulator\n", + __func__, opp->supplies[i].u_volt_min, + opp->supplies[i].u_volt_max); + return false; + } } return true; @@ -983,11 +1093,13 @@ int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, /* Duplicate OPPs */ dev_warn(dev, "%s: duplicate OPPs detected. Existing: freq: %lu, volt: %lu, enabled: %d. New: freq: %lu, volt: %lu, enabled: %d\n", - __func__, opp->rate, opp->u_volt, opp->available, - new_opp->rate, new_opp->u_volt, new_opp->available); + __func__, opp->rate, opp->supplies[0].u_volt, + opp->available, new_opp->rate, + new_opp->supplies[0].u_volt, new_opp->available); - return opp->available && new_opp->u_volt == opp->u_volt ? - 0 : -EEXIST; + /* Should we compare voltages for all regulators here ? */ + return opp->available && + new_opp->supplies[0].u_volt == opp->supplies[0].u_volt ? 0 : -EEXIST; } new_opp->opp_table = opp_table; @@ -1054,9 +1166,9 @@ int _opp_add_v1(struct device *dev, unsigned long freq, long u_volt, /* populate the opp table */ new_opp->rate = freq; tol = u_volt * opp_table->voltage_tolerance_v1 / 100; - new_opp->u_volt = u_volt; - new_opp->u_volt_min = u_volt - tol; - new_opp->u_volt_max = u_volt + tol; + new_opp->supplies[0].u_volt = u_volt; + new_opp->supplies[0].u_volt_min = u_volt - tol; + new_opp->supplies[0].u_volt_max = u_volt + tol; new_opp->available = true; new_opp->dynamic = dynamic; @@ -1300,13 +1412,47 @@ unlock: } EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name); +static int _allocate_set_opp_data(struct opp_table *opp_table) +{ + struct dev_pm_set_opp_data *data; + int len, count = opp_table->regulator_count; + + if (WARN_ON(!count)) + return -EINVAL; + + /* space for set_opp_data */ + len = sizeof(*data); + + /* space for old_opp.supplies and new_opp.supplies */ + len += 2 * sizeof(struct dev_pm_opp_supply) * count; + + data = kzalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->old_opp.supplies = (void *)(data + 1); + data->new_opp.supplies = data->old_opp.supplies + count; + + opp_table->set_opp_data = data; + + return 0; +} + +static void _free_set_opp_data(struct opp_table *opp_table) +{ + kfree(opp_table->set_opp_data); + opp_table->set_opp_data = NULL; +} + /** - * dev_pm_opp_set_regulator() - Set regulator name for the device + * dev_pm_opp_set_regulators() - Set regulator names for the device * @dev: Device for which regulator name is being set. - * @name: Name of the regulator. + * @names: Array of pointers to the names of the regulator. + * @count: Number of regulators. * * In order to support OPP switching, OPP layer needs to know the name of the - * device's regulator, as the core would be required to switch voltages as well. + * device's regulators, as the core would be required to switch voltages as + * well. * * This must be called before any OPPs are initialized for the device. * @@ -1316,11 +1462,13 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_put_prop_name); * that this function is *NOT* called under RCU protection or in contexts where * mutex cannot be locked. */ -int dev_pm_opp_set_regulator(struct device *dev, const char *name) +struct opp_table *dev_pm_opp_set_regulators(struct device *dev, + const char * const names[], + unsigned int count) { struct opp_table *opp_table; struct regulator *reg; - int ret; + int ret, i; mutex_lock(&opp_table_lock); @@ -1336,22 +1484,146 @@ int dev_pm_opp_set_regulator(struct device *dev, const char *name) goto err; } - /* Already have a regulator set */ - if (WARN_ON(!IS_ERR(opp_table->regulator))) { + /* Already have regulators set */ + if (opp_table->regulators) { ret = -EBUSY; goto err; } - /* Allocate the regulator */ - reg = regulator_get_optional(dev, name); - if (IS_ERR(reg)) { - ret = PTR_ERR(reg); - if (ret != -EPROBE_DEFER) - dev_err(dev, "%s: no regulator (%s) found: %d\n", - __func__, name, ret); + + opp_table->regulators = kmalloc_array(count, + sizeof(*opp_table->regulators), + GFP_KERNEL); + if (!opp_table->regulators) { + ret = -ENOMEM; + goto err; + } + + for (i = 0; i < count; i++) { + reg = regulator_get_optional(dev, names[i]); + if (IS_ERR(reg)) { + ret = PTR_ERR(reg); + if (ret != -EPROBE_DEFER) + dev_err(dev, "%s: no regulator (%s) found: %d\n", + __func__, names[i], ret); + goto free_regulators; + } + + opp_table->regulators[i] = reg; + } + + opp_table->regulator_count = count; + + /* Allocate block only once to pass to set_opp() routines */ + ret = _allocate_set_opp_data(opp_table); + if (ret) + goto free_regulators; + + mutex_unlock(&opp_table_lock); + return opp_table; + +free_regulators: + while (i != 0) + regulator_put(opp_table->regulators[--i]); + + kfree(opp_table->regulators); + opp_table->regulators = NULL; + opp_table->regulator_count = 0; +err: + _remove_opp_table(opp_table); +unlock: + mutex_unlock(&opp_table_lock); + + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulators); + +/** + * dev_pm_opp_put_regulators() - Releases resources blocked for regulator + * @opp_table: OPP table returned from dev_pm_opp_set_regulators(). + * + * Locking: The internal opp_table and opp structures are RCU protected. + * Hence this function internally uses RCU updater strategy with mutex locks + * to keep the integrity of the internal data structures. Callers should ensure + * that this function is *NOT* called under RCU protection or in contexts where + * mutex cannot be locked. + */ +void dev_pm_opp_put_regulators(struct opp_table *opp_table) +{ + int i; + + mutex_lock(&opp_table_lock); + + if (!opp_table->regulators) { + pr_err("%s: Doesn't have regulators set\n", __func__); + goto unlock; + } + + /* Make sure there are no concurrent readers while updating opp_table */ + WARN_ON(!list_empty(&opp_table->opp_list)); + + for (i = opp_table->regulator_count - 1; i >= 0; i--) + regulator_put(opp_table->regulators[i]); + + _free_set_opp_data(opp_table); + + kfree(opp_table->regulators); + opp_table->regulators = NULL; + opp_table->regulator_count = 0; + + /* Try freeing opp_table if this was the last blocking resource */ + _remove_opp_table(opp_table); + +unlock: + mutex_unlock(&opp_table_lock); +} +EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulators); + +/** + * dev_pm_opp_register_set_opp_helper() - Register custom set OPP helper + * @dev: Device for which the helper is getting registered. + * @set_opp: Custom set OPP helper. + * + * This is useful to support complex platforms (like platforms with multiple + * regulators per device), instead of the generic OPP set rate helper. + * + * This must be called before any OPPs are initialized for the device. + * + * Locking: The internal opp_table and opp structures are RCU protected. + * Hence this function internally uses RCU updater strategy with mutex locks + * to keep the integrity of the internal data structures. Callers should ensure + * that this function is *NOT* called under RCU protection or in contexts where + * mutex cannot be locked. + */ +int dev_pm_opp_register_set_opp_helper(struct device *dev, + int (*set_opp)(struct dev_pm_set_opp_data *data)) +{ + struct opp_table *opp_table; + int ret; + + if (!set_opp) + return -EINVAL; + + mutex_lock(&opp_table_lock); + + opp_table = _add_opp_table(dev); + if (!opp_table) { + ret = -ENOMEM; + goto unlock; + } + + /* This should be called before OPPs are initialized */ + if (WARN_ON(!list_empty(&opp_table->opp_list))) { + ret = -EBUSY; goto err; } - opp_table->regulator = reg; + /* Already have custom set_opp helper */ + if (WARN_ON(opp_table->set_opp)) { + ret = -EBUSY; + goto err; + } + + opp_table->set_opp = set_opp; mutex_unlock(&opp_table_lock); return 0; @@ -1363,11 +1635,12 @@ unlock: return ret; } -EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator); +EXPORT_SYMBOL_GPL(dev_pm_opp_register_set_opp_helper); /** - * dev_pm_opp_put_regulator() - Releases resources blocked for regulator - * @dev: Device for which regulator was set. + * dev_pm_opp_register_put_opp_helper() - Releases resources blocked for + * set_opp helper + * @dev: Device for which custom set_opp helper has to be cleared. * * Locking: The internal opp_table and opp structures are RCU protected. * Hence this function internally uses RCU updater strategy with mutex locks @@ -1375,7 +1648,7 @@ EXPORT_SYMBOL_GPL(dev_pm_opp_set_regulator); * that this function is *NOT* called under RCU protection or in contexts where * mutex cannot be locked. */ -void dev_pm_opp_put_regulator(struct device *dev) +void dev_pm_opp_register_put_opp_helper(struct device *dev) { struct opp_table *opp_table; @@ -1389,16 +1662,16 @@ void dev_pm_opp_put_regulator(struct device *dev) goto unlock; } - if (IS_ERR(opp_table->regulator)) { - dev_err(dev, "%s: Doesn't have regulator set\n", __func__); + if (!opp_table->set_opp) { + dev_err(dev, "%s: Doesn't have custom set_opp helper set\n", + __func__); goto unlock; } /* Make sure there are no concurrent readers while updating opp_table */ WARN_ON(!list_empty(&opp_table->opp_list)); - regulator_put(opp_table->regulator); - opp_table->regulator = ERR_PTR(-ENXIO); + opp_table->set_opp = NULL; /* Try freeing opp_table if this was the last blocking resource */ _remove_opp_table(opp_table); @@ -1406,7 +1679,7 @@ void dev_pm_opp_put_regulator(struct device *dev) unlock: mutex_unlock(&opp_table_lock); } -EXPORT_SYMBOL_GPL(dev_pm_opp_put_regulator); +EXPORT_SYMBOL_GPL(dev_pm_opp_register_put_opp_helper); /** * dev_pm_opp_add() - Add an OPP table from a table definitions diff --git a/drivers/base/power/opp/debugfs.c b/drivers/base/power/opp/debugfs.c index ef1ae6b52042..95f433db4ac7 100644 --- a/drivers/base/power/opp/debugfs.c +++ b/drivers/base/power/opp/debugfs.c @@ -15,6 +15,7 @@ #include <linux/err.h> #include <linux/init.h> #include <linux/limits.h> +#include <linux/slab.h> #include "opp.h" @@ -34,6 +35,46 @@ void opp_debug_remove_one(struct dev_pm_opp *opp) debugfs_remove_recursive(opp->dentry); } +static bool opp_debug_create_supplies(struct dev_pm_opp *opp, + struct opp_table *opp_table, + struct dentry *pdentry) +{ + struct dentry *d; + int i = 0; + char *name; + + /* Always create at least supply-0 directory */ + do { + name = kasprintf(GFP_KERNEL, "supply-%d", i); + + /* Create per-opp directory */ + d = debugfs_create_dir(name, pdentry); + + kfree(name); + + if (!d) + return false; + + if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, + &opp->supplies[i].u_volt)) + return false; + + if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, + &opp->supplies[i].u_volt_min)) + return false; + + if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, + &opp->supplies[i].u_volt_max)) + return false; + + if (!debugfs_create_ulong("u_amp", S_IRUGO, d, + &opp->supplies[i].u_amp)) + return false; + } while (++i < opp_table->regulator_count); + + return true; +} + int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table) { struct dentry *pdentry = opp_table->dentry; @@ -63,16 +104,7 @@ int opp_debug_create_one(struct dev_pm_opp *opp, struct opp_table *opp_table) if (!debugfs_create_ulong("rate_hz", S_IRUGO, d, &opp->rate)) return -ENOMEM; - if (!debugfs_create_ulong("u_volt_target", S_IRUGO, d, &opp->u_volt)) - return -ENOMEM; - - if (!debugfs_create_ulong("u_volt_min", S_IRUGO, d, &opp->u_volt_min)) - return -ENOMEM; - - if (!debugfs_create_ulong("u_volt_max", S_IRUGO, d, &opp->u_volt_max)) - return -ENOMEM; - - if (!debugfs_create_ulong("u_amp", S_IRUGO, d, &opp->u_amp)) + if (!opp_debug_create_supplies(opp, opp_table, d)) return -ENOMEM; if (!debugfs_create_ulong("clock_latency_ns", S_IRUGO, d, diff --git a/drivers/base/power/opp/of.c b/drivers/base/power/opp/of.c index 5552211e6fcd..3f7d2591b173 100644 --- a/drivers/base/power/opp/of.c +++ b/drivers/base/power/opp/of.c @@ -17,6 +17,7 @@ #include <linux/errno.h> #include <linux/device.h> #include <linux/of.h> +#include <linux/slab.h> #include <linux/export.h> #include "opp.h" @@ -101,16 +102,16 @@ static bool _opp_is_supported(struct device *dev, struct opp_table *opp_table, return true; } -/* TODO: Support multiple regulators */ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, struct opp_table *opp_table) { - u32 microvolt[3] = {0}; - u32 val; - int count, ret; + u32 *microvolt, *microamp = NULL; + int supplies, vcount, icount, ret, i, j; struct property *prop = NULL; char name[NAME_MAX]; + supplies = opp_table->regulator_count ? opp_table->regulator_count : 1; + /* Search for "opp-microvolt-<name>" */ if (opp_table->prop_name) { snprintf(name, sizeof(name), "opp-microvolt-%s", @@ -128,34 +129,29 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, return 0; } - count = of_property_count_u32_elems(opp->np, name); - if (count < 0) { + vcount = of_property_count_u32_elems(opp->np, name); + if (vcount < 0) { dev_err(dev, "%s: Invalid %s property (%d)\n", - __func__, name, count); - return count; + __func__, name, vcount); + return vcount; } - /* There can be one or three elements here */ - if (count != 1 && count != 3) { - dev_err(dev, "%s: Invalid number of elements in %s property (%d)\n", - __func__, name, count); + /* There can be one or three elements per supply */ + if (vcount != supplies && vcount != supplies * 3) { + dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n", + __func__, name, vcount, supplies); return -EINVAL; } - ret = of_property_read_u32_array(opp->np, name, microvolt, count); + microvolt = kmalloc_array(vcount, sizeof(*microvolt), GFP_KERNEL); + if (!microvolt) + return -ENOMEM; + + ret = of_property_read_u32_array(opp->np, name, microvolt, vcount); if (ret) { dev_err(dev, "%s: error parsing %s: %d\n", __func__, name, ret); - return -EINVAL; - } - - opp->u_volt = microvolt[0]; - - if (count == 1) { - opp->u_volt_min = opp->u_volt; - opp->u_volt_max = opp->u_volt; - } else { - opp->u_volt_min = microvolt[1]; - opp->u_volt_max = microvolt[2]; + ret = -EINVAL; + goto free_microvolt; } /* Search for "opp-microamp-<name>" */ @@ -172,10 +168,59 @@ static int opp_parse_supplies(struct dev_pm_opp *opp, struct device *dev, prop = of_find_property(opp->np, name, NULL); } - if (prop && !of_property_read_u32(opp->np, name, &val)) - opp->u_amp = val; + if (prop) { + icount = of_property_count_u32_elems(opp->np, name); + if (icount < 0) { + dev_err(dev, "%s: Invalid %s property (%d)\n", __func__, + name, icount); + ret = icount; + goto free_microvolt; + } - return 0; + if (icount != supplies) { + dev_err(dev, "%s: Invalid number of elements in %s property (%d) with supplies (%d)\n", + __func__, name, icount, supplies); + ret = -EINVAL; + goto free_microvolt; + } + + microamp = kmalloc_array(icount, sizeof(*microamp), GFP_KERNEL); + if (!microamp) { + ret = -EINVAL; + goto free_microvolt; + } + + ret = of_property_read_u32_array(opp->np, name, microamp, + icount); + if (ret) { + dev_err(dev, "%s: error parsing %s: %d\n", __func__, + name, ret); + ret = -EINVAL; + goto free_microamp; + } + } + + for (i = 0, j = 0; i < supplies; i++) { + opp->supplies[i].u_volt = microvolt[j++]; + + if (vcount == supplies) { + opp->supplies[i].u_volt_min = opp->supplies[i].u_volt; + opp->supplies[i].u_volt_max = opp->supplies[i].u_volt; + } else { + opp->supplies[i].u_volt_min = microvolt[j++]; + opp->supplies[i].u_volt_max = microvolt[j++]; + } + + if (microamp) + opp->supplies[i].u_amp = microamp[i]; + } + +free_microamp: + kfree(microamp); +free_microvolt: + kfree(microvolt); + + return ret; } /** @@ -198,7 +243,7 @@ void dev_pm_opp_of_remove_table(struct device *dev) EXPORT_SYMBOL_GPL(dev_pm_opp_of_remove_table); /* Returns opp descriptor node for a device, caller must do of_node_put() */ -struct device_node *_of_get_opp_desc_node(struct device *dev) +static struct device_node *_of_get_opp_desc_node(struct device *dev) { /* * TODO: Support for multiple OPP tables. @@ -303,9 +348,9 @@ static int _opp_add_static_v2(struct device *dev, struct device_node *np) mutex_unlock(&opp_table_lock); pr_debug("%s: turbo:%d rate:%lu uv:%lu uvmin:%lu uvmax:%lu latency:%lu\n", - __func__, new_opp->turbo, new_opp->rate, new_opp->u_volt, - new_opp->u_volt_min, new_opp->u_volt_max, - new_opp->clock_latency_ns); + __func__, new_opp->turbo, new_opp->rate, + new_opp->supplies[0].u_volt, new_opp->supplies[0].u_volt_min, + new_opp->supplies[0].u_volt_max, new_opp->clock_latency_ns); /* * Notify the changes in the availability of the operable @@ -562,7 +607,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, /* Get OPP descriptor node */ np = _of_get_opp_desc_node(cpu_dev); if (!np) { - dev_dbg(cpu_dev, "%s: Couldn't find cpu_dev node.\n", __func__); + dev_dbg(cpu_dev, "%s: Couldn't find opp node.\n", __func__); return -ENOENT; } @@ -587,7 +632,7 @@ int dev_pm_opp_of_get_sharing_cpus(struct device *cpu_dev, /* Get OPP descriptor node */ tmp_np = _of_get_opp_desc_node(tcpu_dev); if (!tmp_np) { - dev_err(tcpu_dev, "%s: Couldn't find tcpu_dev node.\n", + dev_err(tcpu_dev, "%s: Couldn't find opp node.\n", __func__); ret = -ENOENT; goto put_cpu_node; diff --git a/drivers/base/power/opp/opp.h b/drivers/base/power/opp/opp.h index fabd5ca1a083..af9f2b849a66 100644 --- a/drivers/base/power/opp/opp.h +++ b/drivers/base/power/opp/opp.h @@ -61,10 +61,7 @@ extern struct list_head opp_tables; * @turbo: true if turbo (boost) OPP * @suspend: true if suspend OPP * @rate: Frequency in hertz - * @u_volt: Target voltage in microvolts corresponding to this OPP - * @u_volt_min: Minimum voltage in microvolts corresponding to this OPP - * @u_volt_max: Maximum voltage in microvolts corresponding to this OPP - * @u_amp: Maximum current drawn by the device in microamperes + * @supplies: Power supplies voltage/current values * @clock_latency_ns: Latency (in nanoseconds) of switching to this OPP's * frequency from any other OPP's frequency. * @opp_table: points back to the opp_table struct this opp belongs to @@ -83,10 +80,8 @@ struct dev_pm_opp { bool suspend; unsigned long rate; - unsigned long u_volt; - unsigned long u_volt_min; - unsigned long u_volt_max; - unsigned long u_amp; + struct dev_pm_opp_supply *supplies; + unsigned long clock_latency_ns; struct opp_table *opp_table; @@ -144,7 +139,10 @@ enum opp_table_access { * @supported_hw_count: Number of elements in supported_hw array. * @prop_name: A name to postfix to many DT properties, while parsing them. * @clk: Device's clock handle - * @regulator: Supply regulator + * @regulators: Supply regulators + * @regulator_count: Number of power supply regulators + * @set_opp: Platform specific set_opp callback + * @set_opp_data: Data to be passed to set_opp callback * @dentry: debugfs dentry pointer of the real device directory (not links). * @dentry_name: Name of the real dentry. * @@ -179,7 +177,11 @@ struct opp_table { unsigned int supported_hw_count; const char *prop_name; struct clk *clk; - struct regulator *regulator; + struct regulator **regulators; + unsigned int regulator_count; + + int (*set_opp)(struct dev_pm_set_opp_data *data); + struct dev_pm_set_opp_data *set_opp_data; #ifdef CONFIG_DEBUG_FS struct dentry *dentry; @@ -190,7 +192,6 @@ struct opp_table { /* Routines internal to opp core */ struct opp_table *_find_opp_table(struct device *dev); struct opp_device *_add_opp_dev(const struct device *dev, struct opp_table *opp_table); -struct device_node *_of_get_opp_desc_node(struct device *dev); void _dev_pm_opp_remove_table(struct device *dev, bool remove_all); struct dev_pm_opp *_allocate_opp(struct device *dev, struct opp_table **opp_table); int _opp_add(struct device *dev, struct dev_pm_opp *new_opp, struct opp_table *opp_table); diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index 50e30e7b059d..a84332aefc2d 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -21,14 +21,22 @@ extern void pm_runtime_init(struct device *dev); extern void pm_runtime_reinit(struct device *dev); extern void pm_runtime_remove(struct device *dev); +#define WAKE_IRQ_DEDICATED_ALLOCATED BIT(0) +#define WAKE_IRQ_DEDICATED_MANAGED BIT(1) +#define WAKE_IRQ_DEDICATED_MASK (WAKE_IRQ_DEDICATED_ALLOCATED | \ + WAKE_IRQ_DEDICATED_MANAGED) + struct wake_irq { struct device *dev; + unsigned int status; int irq; - bool dedicated_irq:1; }; extern void dev_pm_arm_wake_irq(struct wake_irq *wirq); extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq); +extern void dev_pm_enable_wake_irq_check(struct device *dev, + bool can_change_status); +extern void dev_pm_disable_wake_irq_check(struct device *dev); #ifdef CONFIG_PM_SLEEP @@ -104,6 +112,15 @@ static inline void dev_pm_disarm_wake_irq(struct wake_irq *wirq) { } +static inline void dev_pm_enable_wake_irq_check(struct device *dev, + bool can_change_status) +{ +} + +static inline void dev_pm_disable_wake_irq_check(struct device *dev) +{ +} + #endif #ifdef CONFIG_PM_SLEEP diff --git a/drivers/base/power/qos.c b/drivers/base/power/qos.c index 7f3646e459cb..58fcc758334e 100644 --- a/drivers/base/power/qos.c +++ b/drivers/base/power/qos.c @@ -856,7 +856,10 @@ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) struct dev_pm_qos_request *req; if (val < 0) { - ret = -EINVAL; + if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT) + ret = 0; + else + ret = -EINVAL; goto out; } req = kzalloc(sizeof(*req), GFP_KERNEL); @@ -883,6 +886,7 @@ int dev_pm_qos_update_user_latency_tolerance(struct device *dev, s32 val) mutex_unlock(&dev_pm_qos_mtx); return ret; } +EXPORT_SYMBOL_GPL(dev_pm_qos_update_user_latency_tolerance); /** * dev_pm_qos_expose_latency_tolerance - Expose latency tolerance to userspace diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c index 82a081ea4317..26856d050037 100644 --- a/drivers/base/power/runtime.c +++ b/drivers/base/power/runtime.c @@ -241,7 +241,8 @@ static int rpm_check_suspend_allowed(struct device *dev) retval = -EACCES; else if (atomic_read(&dev->power.usage_count) > 0) retval = -EAGAIN; - else if (!pm_children_suspended(dev)) + else if (!dev->power.ignore_children && + atomic_read(&dev->power.child_count)) retval = -EBUSY; /* Pending resume requests take precedence over suspends. */ @@ -515,7 +516,7 @@ static int rpm_suspend(struct device *dev, int rpmflags) callback = RPM_GET_CALLBACK(dev, runtime_suspend); - dev_pm_enable_wake_irq(dev); + dev_pm_enable_wake_irq_check(dev, true); retval = rpm_callback(callback, dev); if (retval) goto fail; @@ -554,7 +555,7 @@ static int rpm_suspend(struct device *dev, int rpmflags) return retval; fail: - dev_pm_disable_wake_irq(dev); + dev_pm_disable_wake_irq_check(dev); __update_runtime_status(dev, RPM_ACTIVE); dev->power.deferred_resume = false; wake_up_all(&dev->power.wait_queue); @@ -712,8 +713,8 @@ static int rpm_resume(struct device *dev, int rpmflags) spin_lock(&parent->power.lock); /* - * We can resume if the parent's runtime PM is disabled or it - * is set to ignore children. + * Resume the parent if it has runtime PM enabled and not been + * set to ignore its children. */ if (!parent->power.disable_depth && !parent->power.ignore_children) { @@ -737,12 +738,12 @@ static int rpm_resume(struct device *dev, int rpmflags) callback = RPM_GET_CALLBACK(dev, runtime_resume); - dev_pm_disable_wake_irq(dev); + dev_pm_disable_wake_irq_check(dev); retval = rpm_callback(callback, dev); if (retval) { __update_runtime_status(dev, RPM_SUSPENDED); pm_runtime_cancel_pending(dev); - dev_pm_enable_wake_irq(dev); + dev_pm_enable_wake_irq_check(dev, false); } else { no_callback: __update_runtime_status(dev, RPM_ACTIVE); @@ -1027,7 +1028,17 @@ int __pm_runtime_set_status(struct device *dev, unsigned int status) goto out_set; if (status == RPM_SUSPENDED) { - /* It always is possible to set the status to 'suspended'. */ + /* + * It is invalid to suspend a device with an active child, + * unless it has been set to ignore its children. + */ + if (!dev->power.ignore_children && + atomic_read(&dev->power.child_count)) { + dev_err(dev, "runtime PM trying to suspend device but active child\n"); + error = -EBUSY; + goto out; + } + if (parent) { atomic_add_unless(&parent->power.child_count, -1, 0); notify_parent = !parent->power.ignore_children; @@ -1478,6 +1489,16 @@ int pm_runtime_force_suspend(struct device *dev) if (ret) goto err; + /* + * Increase the runtime PM usage count for the device's parent, in case + * when we find the device being used when system suspend was invoked. + * This informs pm_runtime_force_resume() to resume the parent + * immediately, which is needed to be able to resume its children, + * when not deferring the resume to be managed via runtime PM. + */ + if (dev->parent && atomic_read(&dev->power.usage_count) > 1) + pm_runtime_get_noresume(dev->parent); + pm_runtime_set_suspended(dev); return 0; err: @@ -1487,16 +1508,20 @@ err: EXPORT_SYMBOL_GPL(pm_runtime_force_suspend); /** - * pm_runtime_force_resume - Force a device into resume state. + * pm_runtime_force_resume - Force a device into resume state if needed. * @dev: Device to resume. * * Prior invoking this function we expect the user to have brought the device * into low power state by a call to pm_runtime_force_suspend(). Here we reverse - * those actions and brings the device into full power. We update the runtime PM - * status and re-enables runtime PM. + * those actions and brings the device into full power, if it is expected to be + * used on system resume. To distinguish that, we check whether the runtime PM + * usage count is greater than 1 (the PM core increases the usage count in the + * system PM prepare phase), as that indicates a real user (such as a subsystem, + * driver, userspace, etc.) is using it. If that is the case, the device is + * expected to be used on system resume as well, so then we resume it. In the + * other case, we defer the resume to be managed via runtime PM. * - * Typically this function may be invoked from a system resume callback to make - * sure the device is put into full power state. + * Typically this function may be invoked from a system resume callback. */ int pm_runtime_force_resume(struct device *dev) { @@ -1513,6 +1538,17 @@ int pm_runtime_force_resume(struct device *dev) if (!pm_runtime_status_suspended(dev)) goto out; + /* + * Decrease the parent's runtime PM usage count, if we increased it + * during system suspend in pm_runtime_force_suspend(). + */ + if (atomic_read(&dev->power.usage_count) > 1) { + if (dev->parent) + pm_runtime_put_noidle(dev->parent); + } else { + goto out; + } + ret = pm_runtime_set_active(dev); if (ret) goto out; diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c index a7b46798c81d..33b4b902741a 100644 --- a/drivers/base/power/sysfs.c +++ b/drivers/base/power/sysfs.c @@ -263,7 +263,11 @@ static ssize_t pm_qos_latency_tolerance_store(struct device *dev, s32 value; int ret; - if (kstrtos32(buf, 0, &value)) { + if (kstrtos32(buf, 0, &value) == 0) { + /* Users can't write negative values directly */ + if (value < 0) + return -EINVAL; + } else { if (!strcmp(buf, "auto") || !strcmp(buf, "auto\n")) value = PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT; else if (!strcmp(buf, "any") || !strcmp(buf, "any\n")) diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c index 0d77cd6fd8d1..404d94c6c8bc 100644 --- a/drivers/base/power/wakeirq.c +++ b/drivers/base/power/wakeirq.c @@ -110,8 +110,10 @@ void dev_pm_clear_wake_irq(struct device *dev) dev->power.wakeirq = NULL; spin_unlock_irqrestore(&dev->power.lock, flags); - if (wirq->dedicated_irq) + if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED) { free_irq(wirq->irq, wirq); + wirq->status &= ~WAKE_IRQ_DEDICATED_MASK; + } kfree(wirq); } EXPORT_SYMBOL_GPL(dev_pm_clear_wake_irq); @@ -179,7 +181,6 @@ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq) wirq->dev = dev; wirq->irq = irq; - wirq->dedicated_irq = true; irq_set_status_flags(irq, IRQ_NOAUTOEN); /* @@ -195,6 +196,8 @@ int dev_pm_set_dedicated_wake_irq(struct device *dev, int irq) if (err) goto err_free_irq; + wirq->status = WAKE_IRQ_DEDICATED_ALLOCATED; + return err; err_free_irq: @@ -210,9 +213,9 @@ EXPORT_SYMBOL_GPL(dev_pm_set_dedicated_wake_irq); * dev_pm_enable_wake_irq - Enable device wake-up interrupt * @dev: Device * - * Called from the bus code or the device driver for - * runtime_suspend() to enable the wake-up interrupt while - * the device is running. + * Optionally called from the bus code or the device driver for + * runtime_resume() to override the PM runtime core managed wake-up + * interrupt handling to enable the wake-up interrupt. * * Note that for runtime_suspend()) the wake-up interrupts * should be unconditionally enabled unlike for suspend() @@ -222,7 +225,7 @@ void dev_pm_enable_wake_irq(struct device *dev) { struct wake_irq *wirq = dev->power.wakeirq; - if (wirq && wirq->dedicated_irq) + if (wirq && (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED)) enable_irq(wirq->irq); } EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq); @@ -231,20 +234,73 @@ EXPORT_SYMBOL_GPL(dev_pm_enable_wake_irq); * dev_pm_disable_wake_irq - Disable device wake-up interrupt * @dev: Device * - * Called from the bus code or the device driver for - * runtime_resume() to disable the wake-up interrupt while - * the device is running. + * Optionally called from the bus code or the device driver for + * runtime_suspend() to override the PM runtime core managed wake-up + * interrupt handling to disable the wake-up interrupt. */ void dev_pm_disable_wake_irq(struct device *dev) { struct wake_irq *wirq = dev->power.wakeirq; - if (wirq && wirq->dedicated_irq) + if (wirq && (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED)) disable_irq_nosync(wirq->irq); } EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq); /** + * dev_pm_enable_wake_irq_check - Checks and enables wake-up interrupt + * @dev: Device + * @can_change_status: Can change wake-up interrupt status + * + * Enables wakeirq conditionally. We need to enable wake-up interrupt + * lazily on the first rpm_suspend(). This is needed as the consumer device + * starts in RPM_SUSPENDED state, and the the first pm_runtime_get() would + * otherwise try to disable already disabled wakeirq. The wake-up interrupt + * starts disabled with IRQ_NOAUTOEN set. + * + * Should be only called from rpm_suspend() and rpm_resume() path. + * Caller must hold &dev->power.lock to change wirq->status + */ +void dev_pm_enable_wake_irq_check(struct device *dev, + bool can_change_status) +{ + struct wake_irq *wirq = dev->power.wakeirq; + + if (!wirq || !((wirq->status & WAKE_IRQ_DEDICATED_MASK))) + return; + + if (likely(wirq->status & WAKE_IRQ_DEDICATED_MANAGED)) { + goto enable; + } else if (can_change_status) { + wirq->status |= WAKE_IRQ_DEDICATED_MANAGED; + goto enable; + } + + return; + +enable: + enable_irq(wirq->irq); +} + +/** + * dev_pm_disable_wake_irq_check - Checks and disables wake-up interrupt + * @dev: Device + * + * Disables wake-up interrupt conditionally based on status. + * Should be only called from rpm_suspend() and rpm_resume() path. + */ +void dev_pm_disable_wake_irq_check(struct device *dev) +{ + struct wake_irq *wirq = dev->power.wakeirq; + + if (!wirq || !((wirq->status & WAKE_IRQ_DEDICATED_MASK))) + return; + + if (wirq->status & WAKE_IRQ_DEDICATED_MANAGED) + disable_irq_nosync(wirq->irq); +} + +/** * dev_pm_arm_wake_irq - Arm device wake-up * @wirq: Device wake-up interrupt * diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 62e4de2aa8d1..bf9ba26981a5 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -811,7 +811,7 @@ void pm_print_active_wakeup_sources(void) rcu_read_lock(); list_for_each_entry_rcu(ws, &wakeup_sources, entry) { if (ws->active) { - pr_info("active wakeup source: %s\n", ws->name); + pr_debug("active wakeup source: %s\n", ws->name); active = 1; } else if (!active && (!last_activity_ws || @@ -822,7 +822,7 @@ void pm_print_active_wakeup_sources(void) } if (!active && last_activity_ws) - pr_info("last active wakeup source: %s\n", + pr_debug("last active wakeup source: %s\n", last_activity_ws->name); rcu_read_unlock(); } @@ -905,7 +905,7 @@ bool pm_get_wakeup_count(unsigned int *count, bool block) split_counters(&cnt, &inpr); if (inpr == 0 || signal_pending(current)) break; - + pm_print_active_wakeup_sources(); schedule(); } finish_wait(&wakeup_count_wait_queue, &wait); diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index d89b8afe23b6..920c469f3953 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -12,6 +12,27 @@ config ARM_BIG_LITTLE_CPUFREQ help This enables the Generic CPUfreq driver for ARM big.LITTLE platforms. +config ARM_BRCMSTB_AVS_CPUFREQ + tristate "Broadcom STB AVS CPUfreq driver" + depends on ARCH_BRCMSTB || COMPILE_TEST + default y + help + Some Broadcom STB SoCs use a co-processor running proprietary firmware + ("AVS") to handle voltage and frequency scaling. This driver provides + a standard CPUfreq interface to to the firmware. + + Say Y, if you have a Broadcom SoC with AVS support for DFS or DVFS. + +config ARM_BRCMSTB_AVS_CPUFREQ_DEBUG + bool "Broadcom STB AVS CPUfreq driver sysfs debug capability" + depends on ARM_BRCMSTB_AVS_CPUFREQ + help + Enabling this option turns on debug support via sysfs under + /sys/kernel/debug/brcmstb-avs-cpufreq. It is possible to read all and + write some AVS mailbox registers through sysfs entries. + + If in doubt, say N. + config ARM_DT_BL_CPUFREQ tristate "Generic probing via DT for ARM big LITTLE CPUfreq driver" depends on ARM_BIG_LITTLE_CPUFREQ && OF @@ -60,14 +81,6 @@ config ARM_IMX6Q_CPUFREQ If in doubt, say N. -config ARM_INTEGRATOR - tristate "CPUfreq driver for ARM Integrator CPUs" - depends on ARCH_INTEGRATOR - default y - help - This enables the CPUfreq driver for ARM Integrator CPUs. - If in doubt, say Y. - config ARM_KIRKWOOD_CPUFREQ def_bool MACH_KIRKWOOD help diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile index 0a9b6a093646..1e46c3918e7a 100644 --- a/drivers/cpufreq/Makefile +++ b/drivers/cpufreq/Makefile @@ -51,12 +51,12 @@ obj-$(CONFIG_ARM_BIG_LITTLE_CPUFREQ) += arm_big_little.o # LITTLE drivers, so that it is probed last. obj-$(CONFIG_ARM_DT_BL_CPUFREQ) += arm_big_little_dt.o +obj-$(CONFIG_ARM_BRCMSTB_AVS_CPUFREQ) += brcmstb-avs-cpufreq.o obj-$(CONFIG_ARCH_DAVINCI) += davinci-cpufreq.o obj-$(CONFIG_UX500_SOC_DB8500) += dbx500-cpufreq.o obj-$(CONFIG_ARM_EXYNOS5440_CPUFREQ) += exynos5440-cpufreq.o obj-$(CONFIG_ARM_HIGHBANK_CPUFREQ) += highbank-cpufreq.o obj-$(CONFIG_ARM_IMX6Q_CPUFREQ) += imx6q-cpufreq.o -obj-$(CONFIG_ARM_INTEGRATOR) += integrator-cpufreq.o obj-$(CONFIG_ARM_KIRKWOOD_CPUFREQ) += kirkwood-cpufreq.o obj-$(CONFIG_ARM_MT8173_CPUFREQ) += mt8173-cpufreq.o obj-$(CONFIG_ARM_OMAP2PLUS_CPUFREQ) += omap-cpufreq.o diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 297e9128fe9f..3a98702b7445 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -84,7 +84,6 @@ static inline struct acpi_processor_performance *to_perf_data(struct acpi_cpufre static struct cpufreq_driver acpi_cpufreq_driver; static unsigned int acpi_pstate_strict; -static struct msr __percpu *msrs; static bool boost_state(unsigned int cpu) { @@ -104,11 +103,10 @@ static bool boost_state(unsigned int cpu) return false; } -static void boost_set_msrs(bool enable, const struct cpumask *cpumask) +static int boost_set_msr(bool enable) { - u32 cpu; u32 msr_addr; - u64 msr_mask; + u64 msr_mask, val; switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: @@ -120,26 +118,31 @@ static void boost_set_msrs(bool enable, const struct cpumask *cpumask) msr_mask = MSR_K7_HWCR_CPB_DIS; break; default: - return; + return -EINVAL; } - rdmsr_on_cpus(cpumask, msr_addr, msrs); + rdmsrl(msr_addr, val); - for_each_cpu(cpu, cpumask) { - struct msr *reg = per_cpu_ptr(msrs, cpu); - if (enable) - reg->q &= ~msr_mask; - else - reg->q |= msr_mask; - } + if (enable) + val &= ~msr_mask; + else + val |= msr_mask; + + wrmsrl(msr_addr, val); + return 0; +} + +static void boost_set_msr_each(void *p_en) +{ + bool enable = (bool) p_en; - wrmsr_on_cpus(cpumask, msr_addr, msrs); + boost_set_msr(enable); } static int set_boost(int val) { get_online_cpus(); - boost_set_msrs(val, cpu_online_mask); + on_each_cpu(boost_set_msr_each, (void *)(long)val, 1); put_online_cpus(); pr_debug("Core Boosting %sabled.\n", val ? "en" : "dis"); @@ -536,46 +539,24 @@ static void free_acpi_perf_data(void) free_percpu(acpi_perf_data); } -static int boost_notify(struct notifier_block *nb, unsigned long action, - void *hcpu) +static int cpufreq_boost_online(unsigned int cpu) { - unsigned cpu = (long)hcpu; - const struct cpumask *cpumask; - - cpumask = get_cpu_mask(cpu); + /* + * On the CPU_UP path we simply keep the boost-disable flag + * in sync with the current global state. + */ + return boost_set_msr(acpi_cpufreq_driver.boost_enabled); +} +static int cpufreq_boost_down_prep(unsigned int cpu) +{ /* * Clear the boost-disable bit on the CPU_DOWN path so that - * this cpu cannot block the remaining ones from boosting. On - * the CPU_UP path we simply keep the boost-disable flag in - * sync with the current global state. + * this cpu cannot block the remaining ones from boosting. */ - - switch (action) { - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - boost_set_msrs(acpi_cpufreq_driver.boost_enabled, cpumask); - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - boost_set_msrs(1, cpumask); - break; - - default: - break; - } - - return NOTIFY_OK; + return boost_set_msr(1); } - -static struct notifier_block boost_nb = { - .notifier_call = boost_notify, -}; - /* * acpi_cpufreq_early_init - initialize ACPI P-States library * @@ -922,37 +903,35 @@ static struct cpufreq_driver acpi_cpufreq_driver = { .attr = acpi_cpufreq_attr, }; +static enum cpuhp_state acpi_cpufreq_online; + static void __init acpi_cpufreq_boost_init(void) { - if (boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA)) { - msrs = msrs_alloc(); - - if (!msrs) - return; - - acpi_cpufreq_driver.set_boost = set_boost; - acpi_cpufreq_driver.boost_enabled = boost_state(0); - - cpu_notifier_register_begin(); + int ret; - /* Force all MSRs to the same value */ - boost_set_msrs(acpi_cpufreq_driver.boost_enabled, - cpu_online_mask); + if (!(boot_cpu_has(X86_FEATURE_CPB) || boot_cpu_has(X86_FEATURE_IDA))) + return; - __register_cpu_notifier(&boost_nb); + acpi_cpufreq_driver.set_boost = set_boost; + acpi_cpufreq_driver.boost_enabled = boost_state(0); - cpu_notifier_register_done(); + /* + * This calls the online callback on all online cpu and forces all + * MSRs to the same value. + */ + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "cpufreq/acpi:online", + cpufreq_boost_online, cpufreq_boost_down_prep); + if (ret < 0) { + pr_err("acpi_cpufreq: failed to register hotplug callbacks\n"); + return; } + acpi_cpufreq_online = ret; } static void acpi_cpufreq_boost_exit(void) { - if (msrs) { - unregister_cpu_notifier(&boost_nb); - - msrs_free(msrs); - msrs = NULL; - } + if (acpi_cpufreq_online >= 0) + cpuhp_remove_state_nocalls(acpi_cpufreq_online); } static int __init acpi_cpufreq_init(void) diff --git a/drivers/cpufreq/brcmstb-avs-cpufreq.c b/drivers/cpufreq/brcmstb-avs-cpufreq.c new file mode 100644 index 000000000000..4fda623e55bb --- /dev/null +++ b/drivers/cpufreq/brcmstb-avs-cpufreq.c @@ -0,0 +1,1057 @@ +/* + * CPU frequency scaling for Broadcom SoCs with AVS firmware that + * supports DVS or DVFS + * + * Copyright (c) 2016 Broadcom + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation version 2. + * + * This program is distributed "as is" WITHOUT ANY WARRANTY of any + * kind, whether express or implied; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +/* + * "AVS" is the name of a firmware developed at Broadcom. It derives + * its name from the technique called "Adaptive Voltage Scaling". + * Adaptive voltage scaling was the original purpose of this firmware. + * The AVS firmware still supports "AVS mode", where all it does is + * adaptive voltage scaling. However, on some newer Broadcom SoCs, the + * AVS Firmware, despite its unchanged name, also supports DFS mode and + * DVFS mode. + * + * In the context of this document and the related driver, "AVS" by + * itself always means the Broadcom firmware and never refers to the + * technique called "Adaptive Voltage Scaling". + * + * The Broadcom STB AVS CPUfreq driver provides voltage and frequency + * scaling on Broadcom SoCs using AVS firmware with support for DFS and + * DVFS. The AVS firmware is running on its own co-processor. The + * driver supports both uniprocessor (UP) and symmetric multiprocessor + * (SMP) systems which share clock and voltage across all CPUs. + * + * Actual voltage and frequency scaling is done solely by the AVS + * firmware. This driver does not change frequency or voltage itself. + * It provides a standard CPUfreq interface to the rest of the kernel + * and to userland. It interfaces with the AVS firmware to effect the + * requested changes and to report back the current system status in a + * way that is expected by existing tools. + */ + +#include <linux/cpufreq.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/of_address.h> +#include <linux/platform_device.h> +#include <linux/semaphore.h> + +#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG +#include <linux/ctype.h> +#include <linux/debugfs.h> +#include <linux/slab.h> +#include <linux/uaccess.h> +#endif + +/* Max number of arguments AVS calls take */ +#define AVS_MAX_CMD_ARGS 4 +/* + * This macro is used to generate AVS parameter register offsets. For + * x >= AVS_MAX_CMD_ARGS, it returns 0 to protect against accidental memory + * access outside of the parameter range. (Offset 0 is the first parameter.) + */ +#define AVS_PARAM_MULT(x) ((x) < AVS_MAX_CMD_ARGS ? (x) : 0) + +/* AVS Mailbox Register offsets */ +#define AVS_MBOX_COMMAND 0x00 +#define AVS_MBOX_STATUS 0x04 +#define AVS_MBOX_VOLTAGE0 0x08 +#define AVS_MBOX_TEMP0 0x0c +#define AVS_MBOX_PV0 0x10 +#define AVS_MBOX_MV0 0x14 +#define AVS_MBOX_PARAM(x) (0x18 + AVS_PARAM_MULT(x) * sizeof(u32)) +#define AVS_MBOX_REVISION 0x28 +#define AVS_MBOX_PSTATE 0x2c +#define AVS_MBOX_HEARTBEAT 0x30 +#define AVS_MBOX_MAGIC 0x34 +#define AVS_MBOX_SIGMA_HVT 0x38 +#define AVS_MBOX_SIGMA_SVT 0x3c +#define AVS_MBOX_VOLTAGE1 0x40 +#define AVS_MBOX_TEMP1 0x44 +#define AVS_MBOX_PV1 0x48 +#define AVS_MBOX_MV1 0x4c +#define AVS_MBOX_FREQUENCY 0x50 + +/* AVS Commands */ +#define AVS_CMD_AVAILABLE 0x00 +#define AVS_CMD_DISABLE 0x10 +#define AVS_CMD_ENABLE 0x11 +#define AVS_CMD_S2_ENTER 0x12 +#define AVS_CMD_S2_EXIT 0x13 +#define AVS_CMD_BBM_ENTER 0x14 +#define AVS_CMD_BBM_EXIT 0x15 +#define AVS_CMD_S3_ENTER 0x16 +#define AVS_CMD_S3_EXIT 0x17 +#define AVS_CMD_BALANCE 0x18 +/* PMAP and P-STATE commands */ +#define AVS_CMD_GET_PMAP 0x30 +#define AVS_CMD_SET_PMAP 0x31 +#define AVS_CMD_GET_PSTATE 0x40 +#define AVS_CMD_SET_PSTATE 0x41 + +/* Different modes AVS supports (for GET_PMAP/SET_PMAP) */ +#define AVS_MODE_AVS 0x0 +#define AVS_MODE_DFS 0x1 +#define AVS_MODE_DVS 0x2 +#define AVS_MODE_DVFS 0x3 + +/* + * PMAP parameter p1 + * unused:31-24, mdiv_p0:23-16, unused:15-14, pdiv:13-10 , ndiv_int:9-0 + */ +#define NDIV_INT_SHIFT 0 +#define NDIV_INT_MASK 0x3ff +#define PDIV_SHIFT 10 +#define PDIV_MASK 0xf +#define MDIV_P0_SHIFT 16 +#define MDIV_P0_MASK 0xff +/* + * PMAP parameter p2 + * mdiv_p4:31-24, mdiv_p3:23-16, mdiv_p2:15:8, mdiv_p1:7:0 + */ +#define MDIV_P1_SHIFT 0 +#define MDIV_P1_MASK 0xff +#define MDIV_P2_SHIFT 8 +#define MDIV_P2_MASK 0xff +#define MDIV_P3_SHIFT 16 +#define MDIV_P3_MASK 0xff +#define MDIV_P4_SHIFT 24 +#define MDIV_P4_MASK 0xff + +/* Different P-STATES AVS supports (for GET_PSTATE/SET_PSTATE) */ +#define AVS_PSTATE_P0 0x0 +#define AVS_PSTATE_P1 0x1 +#define AVS_PSTATE_P2 0x2 +#define AVS_PSTATE_P3 0x3 +#define AVS_PSTATE_P4 0x4 +#define AVS_PSTATE_MAX AVS_PSTATE_P4 + +/* CPU L2 Interrupt Controller Registers */ +#define AVS_CPU_L2_SET0 0x04 +#define AVS_CPU_L2_INT_MASK BIT(31) + +/* AVS Command Status Values */ +#define AVS_STATUS_CLEAR 0x00 +/* Command/notification accepted */ +#define AVS_STATUS_SUCCESS 0xf0 +/* Command/notification rejected */ +#define AVS_STATUS_FAILURE 0xff +/* Invalid command/notification (unknown) */ +#define AVS_STATUS_INVALID 0xf1 +/* Non-AVS modes are not supported */ +#define AVS_STATUS_NO_SUPP 0xf2 +/* Cannot set P-State until P-Map supplied */ +#define AVS_STATUS_NO_MAP 0xf3 +/* Cannot change P-Map after initial P-Map set */ +#define AVS_STATUS_MAP_SET 0xf4 +/* Max AVS status; higher numbers are used for debugging */ +#define AVS_STATUS_MAX 0xff + +/* Other AVS related constants */ +#define AVS_LOOP_LIMIT 10000 +#define AVS_TIMEOUT 300 /* in ms; expected completion is < 10ms */ +#define AVS_FIRMWARE_MAGIC 0xa11600d1 + +#define BRCM_AVS_CPUFREQ_PREFIX "brcmstb-avs" +#define BRCM_AVS_CPUFREQ_NAME BRCM_AVS_CPUFREQ_PREFIX "-cpufreq" +#define BRCM_AVS_CPU_DATA "brcm,avs-cpu-data-mem" +#define BRCM_AVS_CPU_INTR "brcm,avs-cpu-l2-intr" +#define BRCM_AVS_HOST_INTR "sw_intr" + +struct pmap { + unsigned int mode; + unsigned int p1; + unsigned int p2; + unsigned int state; +}; + +struct private_data { + void __iomem *base; + void __iomem *avs_intr_base; + struct device *dev; +#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG + struct dentry *debugfs; +#endif + struct completion done; + struct semaphore sem; + struct pmap pmap; +}; + +#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG + +enum debugfs_format { + DEBUGFS_NORMAL, + DEBUGFS_FLOAT, + DEBUGFS_REV, +}; + +struct debugfs_data { + struct debugfs_entry *entry; + struct private_data *priv; +}; + +struct debugfs_entry { + char *name; + u32 offset; + fmode_t mode; + enum debugfs_format format; +}; + +#define DEBUGFS_ENTRY(name, mode, format) { \ + #name, AVS_MBOX_##name, mode, format \ +} + +/* + * These are used for debugfs only. Otherwise we use AVS_MBOX_PARAM() directly. + */ +#define AVS_MBOX_PARAM1 AVS_MBOX_PARAM(0) +#define AVS_MBOX_PARAM2 AVS_MBOX_PARAM(1) +#define AVS_MBOX_PARAM3 AVS_MBOX_PARAM(2) +#define AVS_MBOX_PARAM4 AVS_MBOX_PARAM(3) + +/* + * This table stores the name, access permissions and offset for each hardware + * register and is used to generate debugfs entries. + */ +static struct debugfs_entry debugfs_entries[] = { + DEBUGFS_ENTRY(COMMAND, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(STATUS, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(VOLTAGE0, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(TEMP0, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(PV0, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(MV0, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(PARAM1, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(PARAM2, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(PARAM3, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(PARAM4, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(REVISION, 0, DEBUGFS_REV), + DEBUGFS_ENTRY(PSTATE, 0, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(HEARTBEAT, 0, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(MAGIC, S_IWUSR, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(SIGMA_HVT, 0, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(SIGMA_SVT, 0, DEBUGFS_NORMAL), + DEBUGFS_ENTRY(VOLTAGE1, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(TEMP1, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(PV1, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(MV1, 0, DEBUGFS_FLOAT), + DEBUGFS_ENTRY(FREQUENCY, 0, DEBUGFS_NORMAL), +}; + +static int brcm_avs_target_index(struct cpufreq_policy *, unsigned int); + +static char *__strtolower(char *s) +{ + char *p; + + for (p = s; *p; p++) + *p = tolower(*p); + + return s; +} + +#endif /* CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG */ + +static void __iomem *__map_region(const char *name) +{ + struct device_node *np; + void __iomem *ptr; + + np = of_find_compatible_node(NULL, NULL, name); + if (!np) + return NULL; + + ptr = of_iomap(np, 0); + of_node_put(np); + + return ptr; +} + +static int __issue_avs_command(struct private_data *priv, int cmd, bool is_send, + u32 args[]) +{ + unsigned long time_left = msecs_to_jiffies(AVS_TIMEOUT); + void __iomem *base = priv->base; + unsigned int i; + int ret; + u32 val; + + ret = down_interruptible(&priv->sem); + if (ret) + return ret; + + /* + * Make sure no other command is currently running: cmd is 0 if AVS + * co-processor is idle. Due to the guard above, we should almost never + * have to wait here. + */ + for (i = 0, val = 1; val != 0 && i < AVS_LOOP_LIMIT; i++) + val = readl(base + AVS_MBOX_COMMAND); + + /* Give the caller a chance to retry if AVS is busy. */ + if (i == AVS_LOOP_LIMIT) { + ret = -EAGAIN; + goto out; + } + + /* Clear status before we begin. */ + writel(AVS_STATUS_CLEAR, base + AVS_MBOX_STATUS); + + /* We need to send arguments for this command. */ + if (args && is_send) { + for (i = 0; i < AVS_MAX_CMD_ARGS; i++) + writel(args[i], base + AVS_MBOX_PARAM(i)); + } + + /* Protect from spurious interrupts. */ + reinit_completion(&priv->done); + + /* Now issue the command & tell firmware to wake up to process it. */ + writel(cmd, base + AVS_MBOX_COMMAND); + writel(AVS_CPU_L2_INT_MASK, priv->avs_intr_base + AVS_CPU_L2_SET0); + + /* Wait for AVS co-processor to finish processing the command. */ + time_left = wait_for_completion_timeout(&priv->done, time_left); + + /* + * If the AVS status is not in the expected range, it means AVS didn't + * complete our command in time, and we return an error. Also, if there + * is no "time left", we timed out waiting for the interrupt. + */ + val = readl(base + AVS_MBOX_STATUS); + if (time_left == 0 || val == 0 || val > AVS_STATUS_MAX) { + dev_err(priv->dev, "AVS command %#x didn't complete in time\n", + cmd); + dev_err(priv->dev, " Time left: %u ms, AVS status: %#x\n", + jiffies_to_msecs(time_left), val); + ret = -ETIMEDOUT; + goto out; + } + + /* This command returned arguments, so we read them back. */ + if (args && !is_send) { + for (i = 0; i < AVS_MAX_CMD_ARGS; i++) + args[i] = readl(base + AVS_MBOX_PARAM(i)); + } + + /* Clear status to tell AVS co-processor we are done. */ + writel(AVS_STATUS_CLEAR, base + AVS_MBOX_STATUS); + + /* Convert firmware errors to errno's as much as possible. */ + switch (val) { + case AVS_STATUS_INVALID: + ret = -EINVAL; + break; + case AVS_STATUS_NO_SUPP: + ret = -ENOTSUPP; + break; + case AVS_STATUS_NO_MAP: + ret = -ENOENT; + break; + case AVS_STATUS_MAP_SET: + ret = -EEXIST; + break; + case AVS_STATUS_FAILURE: + ret = -EIO; + break; + } + +out: + up(&priv->sem); + + return ret; +} + +static irqreturn_t irq_handler(int irq, void *data) +{ + struct private_data *priv = data; + + /* AVS command completed execution. Wake up __issue_avs_command(). */ + complete(&priv->done); + + return IRQ_HANDLED; +} + +static char *brcm_avs_mode_to_string(unsigned int mode) +{ + switch (mode) { + case AVS_MODE_AVS: + return "AVS"; + case AVS_MODE_DFS: + return "DFS"; + case AVS_MODE_DVS: + return "DVS"; + case AVS_MODE_DVFS: + return "DVFS"; + } + return NULL; +} + +static void brcm_avs_parse_p1(u32 p1, unsigned int *mdiv_p0, unsigned int *pdiv, + unsigned int *ndiv) +{ + *mdiv_p0 = (p1 >> MDIV_P0_SHIFT) & MDIV_P0_MASK; + *pdiv = (p1 >> PDIV_SHIFT) & PDIV_MASK; + *ndiv = (p1 >> NDIV_INT_SHIFT) & NDIV_INT_MASK; +} + +static void brcm_avs_parse_p2(u32 p2, unsigned int *mdiv_p1, + unsigned int *mdiv_p2, unsigned int *mdiv_p3, + unsigned int *mdiv_p4) +{ + *mdiv_p4 = (p2 >> MDIV_P4_SHIFT) & MDIV_P4_MASK; + *mdiv_p3 = (p2 >> MDIV_P3_SHIFT) & MDIV_P3_MASK; + *mdiv_p2 = (p2 >> MDIV_P2_SHIFT) & MDIV_P2_MASK; + *mdiv_p1 = (p2 >> MDIV_P1_SHIFT) & MDIV_P1_MASK; +} + +static int brcm_avs_get_pmap(struct private_data *priv, struct pmap *pmap) +{ + u32 args[AVS_MAX_CMD_ARGS]; + int ret; + + ret = __issue_avs_command(priv, AVS_CMD_GET_PMAP, false, args); + if (ret || !pmap) + return ret; + + pmap->mode = args[0]; + pmap->p1 = args[1]; + pmap->p2 = args[2]; + pmap->state = args[3]; + + return 0; +} + +static int brcm_avs_set_pmap(struct private_data *priv, struct pmap *pmap) +{ + u32 args[AVS_MAX_CMD_ARGS]; + + args[0] = pmap->mode; + args[1] = pmap->p1; + args[2] = pmap->p2; + args[3] = pmap->state; + + return __issue_avs_command(priv, AVS_CMD_SET_PMAP, true, args); +} + +static int brcm_avs_get_pstate(struct private_data *priv, unsigned int *pstate) +{ + u32 args[AVS_MAX_CMD_ARGS]; + int ret; + + ret = __issue_avs_command(priv, AVS_CMD_GET_PSTATE, false, args); + if (ret) + return ret; + *pstate = args[0]; + + return 0; +} + +static int brcm_avs_set_pstate(struct private_data *priv, unsigned int pstate) +{ + u32 args[AVS_MAX_CMD_ARGS]; + + args[0] = pstate; + + return __issue_avs_command(priv, AVS_CMD_SET_PSTATE, true, args); +} + +static unsigned long brcm_avs_get_voltage(void __iomem *base) +{ + return readl(base + AVS_MBOX_VOLTAGE1); +} + +static unsigned long brcm_avs_get_frequency(void __iomem *base) +{ + return readl(base + AVS_MBOX_FREQUENCY) * 1000; /* in kHz */ +} + +/* + * We determine which frequencies are supported by cycling through all P-states + * and reading back what frequency we are running at for each P-state. + */ +static struct cpufreq_frequency_table * +brcm_avs_get_freq_table(struct device *dev, struct private_data *priv) +{ + struct cpufreq_frequency_table *table; + unsigned int pstate; + int i, ret; + + /* Remember P-state for later */ + ret = brcm_avs_get_pstate(priv, &pstate); + if (ret) + return ERR_PTR(ret); + + table = devm_kzalloc(dev, (AVS_PSTATE_MAX + 1) * sizeof(*table), + GFP_KERNEL); + if (!table) + return ERR_PTR(-ENOMEM); + + for (i = AVS_PSTATE_P0; i <= AVS_PSTATE_MAX; i++) { + ret = brcm_avs_set_pstate(priv, i); + if (ret) + return ERR_PTR(ret); + table[i].frequency = brcm_avs_get_frequency(priv->base); + table[i].driver_data = i; + } + table[i].frequency = CPUFREQ_TABLE_END; + + /* Restore P-state */ + ret = brcm_avs_set_pstate(priv, pstate); + if (ret) + return ERR_PTR(ret); + + return table; +} + +#ifdef CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG + +#define MANT(x) (unsigned int)(abs((x)) / 1000) +#define FRAC(x) (unsigned int)(abs((x)) - abs((x)) / 1000 * 1000) + +static int brcm_avs_debug_show(struct seq_file *s, void *data) +{ + struct debugfs_data *dbgfs = s->private; + void __iomem *base; + u32 val, offset; + + if (!dbgfs) { + seq_puts(s, "No device pointer\n"); + return 0; + } + + base = dbgfs->priv->base; + offset = dbgfs->entry->offset; + val = readl(base + offset); + switch (dbgfs->entry->format) { + case DEBUGFS_NORMAL: + seq_printf(s, "%u\n", val); + break; + case DEBUGFS_FLOAT: + seq_printf(s, "%d.%03d\n", MANT(val), FRAC(val)); + break; + case DEBUGFS_REV: + seq_printf(s, "%c.%c.%c.%c\n", (val >> 24 & 0xff), + (val >> 16 & 0xff), (val >> 8 & 0xff), + val & 0xff); + break; + } + seq_printf(s, "0x%08x\n", val); + + return 0; +} + +#undef MANT +#undef FRAC + +static ssize_t brcm_avs_seq_write(struct file *file, const char __user *buf, + size_t size, loff_t *ppos) +{ + struct seq_file *s = file->private_data; + struct debugfs_data *dbgfs = s->private; + struct private_data *priv = dbgfs->priv; + void __iomem *base, *avs_intr_base; + bool use_issue_command = false; + unsigned long val, offset; + char str[128]; + int ret; + char *str_ptr = str; + + if (size >= sizeof(str)) + return -E2BIG; + + memset(str, 0, sizeof(str)); + ret = copy_from_user(str, buf, size); + if (ret) + return ret; + + base = priv->base; + avs_intr_base = priv->avs_intr_base; + offset = dbgfs->entry->offset; + /* + * Special case writing to "command" entry only: if the string starts + * with a 'c', we use the driver's __issue_avs_command() function. + * Otherwise, we perform a raw write. This should allow testing of raw + * access as well as using the higher level function. (Raw access + * doesn't clear the firmware return status after issuing the command.) + */ + if (str_ptr[0] == 'c' && offset == AVS_MBOX_COMMAND) { + use_issue_command = true; + str_ptr++; + } + if (kstrtoul(str_ptr, 0, &val) != 0) + return -EINVAL; + + /* + * Setting the P-state is a special case. We need to update the CPU + * frequency we report. + */ + if (val == AVS_CMD_SET_PSTATE) { + struct cpufreq_policy *policy; + unsigned int pstate; + + policy = cpufreq_cpu_get(smp_processor_id()); + /* Read back the P-state we are about to set */ + pstate = readl(base + AVS_MBOX_PARAM(0)); + if (use_issue_command) { + ret = brcm_avs_target_index(policy, pstate); + return ret ? ret : size; + } + policy->cur = policy->freq_table[pstate].frequency; + } + + if (use_issue_command) { + ret = __issue_avs_command(priv, val, false, NULL); + } else { + /* Locking here is not perfect, but is only for debug. */ + ret = down_interruptible(&priv->sem); + if (ret) + return ret; + + writel(val, base + offset); + /* We have to wake up the firmware to process a command. */ + if (offset == AVS_MBOX_COMMAND) + writel(AVS_CPU_L2_INT_MASK, + avs_intr_base + AVS_CPU_L2_SET0); + up(&priv->sem); + } + + return ret ? ret : size; +} + +static struct debugfs_entry *__find_debugfs_entry(const char *name) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(debugfs_entries); i++) + if (strcasecmp(debugfs_entries[i].name, name) == 0) + return &debugfs_entries[i]; + + return NULL; +} + +static int brcm_avs_debug_open(struct inode *inode, struct file *file) +{ + struct debugfs_data *data; + fmode_t fmode; + int ret; + + /* + * seq_open(), which is called by single_open(), clears "write" access. + * We need write access to some files, so we preserve our access mode + * and restore it. + */ + fmode = file->f_mode; + /* + * Check access permissions even for root. We don't want to be writing + * to read-only registers. Access for regular users has already been + * checked by the VFS layer. + */ + if ((fmode & FMODE_WRITER) && !(inode->i_mode & S_IWUSR)) + return -EACCES; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + /* + * We use the same file system operations for all our debug files. To + * produce specific output, we look up the file name upon opening a + * debugfs entry and map it to a memory offset. This offset is then used + * in the generic "show" function to read a specific register. + */ + data->entry = __find_debugfs_entry(file->f_path.dentry->d_iname); + data->priv = inode->i_private; + + ret = single_open(file, brcm_avs_debug_show, data); + if (ret) + kfree(data); + file->f_mode = fmode; + + return ret; +} + +static int brcm_avs_debug_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq_priv = file->private_data; + struct debugfs_data *data = seq_priv->private; + + kfree(data); + return single_release(inode, file); +} + +static const struct file_operations brcm_avs_debug_ops = { + .open = brcm_avs_debug_open, + .read = seq_read, + .write = brcm_avs_seq_write, + .llseek = seq_lseek, + .release = brcm_avs_debug_release, +}; + +static void brcm_avs_cpufreq_debug_init(struct platform_device *pdev) +{ + struct private_data *priv = platform_get_drvdata(pdev); + struct dentry *dir; + int i; + + if (!priv) + return; + + dir = debugfs_create_dir(BRCM_AVS_CPUFREQ_NAME, NULL); + if (IS_ERR_OR_NULL(dir)) + return; + priv->debugfs = dir; + + for (i = 0; i < ARRAY_SIZE(debugfs_entries); i++) { + /* + * The DEBUGFS_ENTRY macro generates uppercase strings. We + * convert them to lowercase before creating the debugfs + * entries. + */ + char *entry = __strtolower(debugfs_entries[i].name); + fmode_t mode = debugfs_entries[i].mode; + + if (!debugfs_create_file(entry, S_IFREG | S_IRUGO | mode, + dir, priv, &brcm_avs_debug_ops)) { + priv->debugfs = NULL; + debugfs_remove_recursive(dir); + break; + } + } +} + +static void brcm_avs_cpufreq_debug_exit(struct platform_device *pdev) +{ + struct private_data *priv = platform_get_drvdata(pdev); + + if (priv && priv->debugfs) { + debugfs_remove_recursive(priv->debugfs); + priv->debugfs = NULL; + } +} + +#else + +static void brcm_avs_cpufreq_debug_init(struct platform_device *pdev) {} +static void brcm_avs_cpufreq_debug_exit(struct platform_device *pdev) {} + +#endif /* CONFIG_ARM_BRCMSTB_AVS_CPUFREQ_DEBUG */ + +/* + * To ensure the right firmware is running we need to + * - check the MAGIC matches what we expect + * - brcm_avs_get_pmap() doesn't return -ENOTSUPP or -EINVAL + * We need to set up our interrupt handling before calling brcm_avs_get_pmap()! + */ +static bool brcm_avs_is_firmware_loaded(struct private_data *priv) +{ + u32 magic; + int rc; + + rc = brcm_avs_get_pmap(priv, NULL); + magic = readl(priv->base + AVS_MBOX_MAGIC); + + return (magic == AVS_FIRMWARE_MAGIC) && (rc != -ENOTSUPP) && + (rc != -EINVAL); +} + +static unsigned int brcm_avs_cpufreq_get(unsigned int cpu) +{ + struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + struct private_data *priv = policy->driver_data; + + return brcm_avs_get_frequency(priv->base); +} + +static int brcm_avs_target_index(struct cpufreq_policy *policy, + unsigned int index) +{ + return brcm_avs_set_pstate(policy->driver_data, + policy->freq_table[index].driver_data); +} + +static int brcm_avs_suspend(struct cpufreq_policy *policy) +{ + struct private_data *priv = policy->driver_data; + + return brcm_avs_get_pmap(priv, &priv->pmap); +} + +static int brcm_avs_resume(struct cpufreq_policy *policy) +{ + struct private_data *priv = policy->driver_data; + int ret; + + ret = brcm_avs_set_pmap(priv, &priv->pmap); + if (ret == -EEXIST) { + struct platform_device *pdev = cpufreq_get_driver_data(); + struct device *dev = &pdev->dev; + + dev_warn(dev, "PMAP was already set\n"); + ret = 0; + } + + return ret; +} + +/* + * All initialization code that we only want to execute once goes here. Setup + * code that can be re-tried on every core (if it failed before) can go into + * brcm_avs_cpufreq_init(). + */ +static int brcm_avs_prepare_init(struct platform_device *pdev) +{ + struct private_data *priv; + struct device *dev; + int host_irq, ret; + + dev = &pdev->dev; + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->dev = dev; + sema_init(&priv->sem, 1); + init_completion(&priv->done); + platform_set_drvdata(pdev, priv); + + priv->base = __map_region(BRCM_AVS_CPU_DATA); + if (!priv->base) { + dev_err(dev, "Couldn't find property %s in device tree.\n", + BRCM_AVS_CPU_DATA); + return -ENOENT; + } + + priv->avs_intr_base = __map_region(BRCM_AVS_CPU_INTR); + if (!priv->avs_intr_base) { + dev_err(dev, "Couldn't find property %s in device tree.\n", + BRCM_AVS_CPU_INTR); + ret = -ENOENT; + goto unmap_base; + } + + host_irq = platform_get_irq_byname(pdev, BRCM_AVS_HOST_INTR); + if (host_irq < 0) { + dev_err(dev, "Couldn't find interrupt %s -- %d\n", + BRCM_AVS_HOST_INTR, host_irq); + ret = host_irq; + goto unmap_intr_base; + } + + ret = devm_request_irq(dev, host_irq, irq_handler, IRQF_TRIGGER_RISING, + BRCM_AVS_HOST_INTR, priv); + if (ret) { + dev_err(dev, "IRQ request failed: %s (%d) -- %d\n", + BRCM_AVS_HOST_INTR, host_irq, ret); + goto unmap_intr_base; + } + + if (brcm_avs_is_firmware_loaded(priv)) + return 0; + + dev_err(dev, "AVS firmware is not loaded or doesn't support DVFS\n"); + ret = -ENODEV; + +unmap_intr_base: + iounmap(priv->avs_intr_base); +unmap_base: + iounmap(priv->base); + platform_set_drvdata(pdev, NULL); + + return ret; +} + +static int brcm_avs_cpufreq_init(struct cpufreq_policy *policy) +{ + struct cpufreq_frequency_table *freq_table; + struct platform_device *pdev; + struct private_data *priv; + struct device *dev; + int ret; + + pdev = cpufreq_get_driver_data(); + priv = platform_get_drvdata(pdev); + policy->driver_data = priv; + dev = &pdev->dev; + + freq_table = brcm_avs_get_freq_table(dev, priv); + if (IS_ERR(freq_table)) { + ret = PTR_ERR(freq_table); + dev_err(dev, "Couldn't determine frequency table (%d).\n", ret); + return ret; + } + + ret = cpufreq_table_validate_and_show(policy, freq_table); + if (ret) { + dev_err(dev, "invalid frequency table: %d\n", ret); + return ret; + } + + /* All cores share the same clock and thus the same policy. */ + cpumask_setall(policy->cpus); + + ret = __issue_avs_command(priv, AVS_CMD_ENABLE, false, NULL); + if (!ret) { + unsigned int pstate; + + ret = brcm_avs_get_pstate(priv, &pstate); + if (!ret) { + policy->cur = freq_table[pstate].frequency; + dev_info(dev, "registered\n"); + return 0; + } + } + + dev_err(dev, "couldn't initialize driver (%d)\n", ret); + + return ret; +} + +static ssize_t show_brcm_avs_pstate(struct cpufreq_policy *policy, char *buf) +{ + struct private_data *priv = policy->driver_data; + unsigned int pstate; + + if (brcm_avs_get_pstate(priv, &pstate)) + return sprintf(buf, "<unknown>\n"); + + return sprintf(buf, "%u\n", pstate); +} + +static ssize_t show_brcm_avs_mode(struct cpufreq_policy *policy, char *buf) +{ + struct private_data *priv = policy->driver_data; + struct pmap pmap; + + if (brcm_avs_get_pmap(priv, &pmap)) + return sprintf(buf, "<unknown>\n"); + + return sprintf(buf, "%s %u\n", brcm_avs_mode_to_string(pmap.mode), + pmap.mode); +} + +static ssize_t show_brcm_avs_pmap(struct cpufreq_policy *policy, char *buf) +{ + unsigned int mdiv_p0, mdiv_p1, mdiv_p2, mdiv_p3, mdiv_p4; + struct private_data *priv = policy->driver_data; + unsigned int ndiv, pdiv; + struct pmap pmap; + + if (brcm_avs_get_pmap(priv, &pmap)) + return sprintf(buf, "<unknown>\n"); + + brcm_avs_parse_p1(pmap.p1, &mdiv_p0, &pdiv, &ndiv); + brcm_avs_parse_p2(pmap.p2, &mdiv_p1, &mdiv_p2, &mdiv_p3, &mdiv_p4); + + return sprintf(buf, "0x%08x 0x%08x %u %u %u %u %u %u %u\n", + pmap.p1, pmap.p2, ndiv, pdiv, mdiv_p0, mdiv_p1, mdiv_p2, + mdiv_p3, mdiv_p4); +} + +static ssize_t show_brcm_avs_voltage(struct cpufreq_policy *policy, char *buf) +{ + struct private_data *priv = policy->driver_data; + + return sprintf(buf, "0x%08lx\n", brcm_avs_get_voltage(priv->base)); +} + +static ssize_t show_brcm_avs_frequency(struct cpufreq_policy *policy, char *buf) +{ + struct private_data *priv = policy->driver_data; + + return sprintf(buf, "0x%08lx\n", brcm_avs_get_frequency(priv->base)); +} + +cpufreq_freq_attr_ro(brcm_avs_pstate); +cpufreq_freq_attr_ro(brcm_avs_mode); +cpufreq_freq_attr_ro(brcm_avs_pmap); +cpufreq_freq_attr_ro(brcm_avs_voltage); +cpufreq_freq_attr_ro(brcm_avs_frequency); + +static struct freq_attr *brcm_avs_cpufreq_attr[] = { + &cpufreq_freq_attr_scaling_available_freqs, + &brcm_avs_pstate, + &brcm_avs_mode, + &brcm_avs_pmap, + &brcm_avs_voltage, + &brcm_avs_frequency, + NULL +}; + +static struct cpufreq_driver brcm_avs_driver = { + .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, + .verify = cpufreq_generic_frequency_table_verify, + .target_index = brcm_avs_target_index, + .get = brcm_avs_cpufreq_get, + .suspend = brcm_avs_suspend, + .resume = brcm_avs_resume, + .init = brcm_avs_cpufreq_init, + .attr = brcm_avs_cpufreq_attr, + .name = BRCM_AVS_CPUFREQ_PREFIX, +}; + +static int brcm_avs_cpufreq_probe(struct platform_device *pdev) +{ + int ret; + + ret = brcm_avs_prepare_init(pdev); + if (ret) + return ret; + + brcm_avs_driver.driver_data = pdev; + ret = cpufreq_register_driver(&brcm_avs_driver); + if (!ret) + brcm_avs_cpufreq_debug_init(pdev); + + return ret; +} + +static int brcm_avs_cpufreq_remove(struct platform_device *pdev) +{ + struct private_data *priv; + int ret; + + ret = cpufreq_unregister_driver(&brcm_avs_driver); + if (ret) + return ret; + + brcm_avs_cpufreq_debug_exit(pdev); + + priv = platform_get_drvdata(pdev); + iounmap(priv->base); + iounmap(priv->avs_intr_base); + platform_set_drvdata(pdev, NULL); + + return 0; +} + +static const struct of_device_id brcm_avs_cpufreq_match[] = { + { .compatible = BRCM_AVS_CPU_DATA }, + { } +}; +MODULE_DEVICE_TABLE(of, brcm_avs_cpufreq_match); + +static struct platform_driver brcm_avs_cpufreq_platdrv = { + .driver = { + .name = BRCM_AVS_CPUFREQ_NAME, + .of_match_table = brcm_avs_cpufreq_match, + }, + .probe = brcm_avs_cpufreq_probe, + .remove = brcm_avs_cpufreq_remove, +}; +module_platform_driver(brcm_avs_cpufreq_platdrv); + +MODULE_AUTHOR("Markus Mayer <mmayer@broadcom.com>"); +MODULE_DESCRIPTION("CPUfreq driver for Broadcom STB AVS"); +MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 4852d9efe74e..e82bb3c30b92 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -247,3 +247,10 @@ MODULE_DESCRIPTION("CPUFreq driver based on the ACPI CPPC v5.0+ spec"); MODULE_LICENSE("GPL"); late_initcall(cppc_cpufreq_init); + +static const struct acpi_device_id cppc_acpi_ids[] = { + {ACPI_PROCESSOR_DEVICE_HID, }, + {} +}; + +MODULE_DEVICE_TABLE(acpi, cppc_acpi_ids); diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 71267626456b..bc97b6a4b1cf 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -26,6 +26,9 @@ static const struct of_device_id machines[] __initconst = { { .compatible = "allwinner,sun8i-a83t", }, { .compatible = "allwinner,sun8i-h3", }, + { .compatible = "arm,integrator-ap", }, + { .compatible = "arm,integrator-cp", }, + { .compatible = "hisilicon,hi6220", }, { .compatible = "fsl,imx27", }, @@ -34,6 +37,8 @@ static const struct of_device_id machines[] __initconst = { { .compatible = "fsl,imx7d", }, { .compatible = "marvell,berlin", }, + { .compatible = "marvell,pxa250", }, + { .compatible = "marvell,pxa270", }, { .compatible = "samsung,exynos3250", }, { .compatible = "samsung,exynos4210", }, @@ -50,6 +55,8 @@ static const struct of_device_id machines[] __initconst = { { .compatible = "renesas,r7s72100", }, { .compatible = "renesas,r8a73a4", }, { .compatible = "renesas,r8a7740", }, + { .compatible = "renesas,r8a7743", }, + { .compatible = "renesas,r8a7745", }, { .compatible = "renesas,r8a7778", }, { .compatible = "renesas,r8a7779", }, { .compatible = "renesas,r8a7790", }, @@ -72,6 +79,12 @@ static const struct of_device_id machines[] __initconst = { { .compatible = "sigma,tango4" }, + { .compatible = "socionext,uniphier-pro5", }, + { .compatible = "socionext,uniphier-pxs2", }, + { .compatible = "socionext,uniphier-ld6b", }, + { .compatible = "socionext,uniphier-ld11", }, + { .compatible = "socionext,uniphier-ld20", }, + { .compatible = "ti,am33xx", }, { .compatible = "ti,dra7", }, { .compatible = "ti,omap2", }, @@ -81,6 +94,8 @@ static const struct of_device_id machines[] __initconst = { { .compatible = "xlnx,zynq-7000", }, + { .compatible = "zte,zx296718", }, + { } }; diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 5c07ae05d69a..269013311e79 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -28,6 +28,7 @@ #include "cpufreq-dt.h" struct private_data { + struct opp_table *opp_table; struct device *cpu_dev; struct thermal_cooling_device *cdev; const char *reg_name; @@ -143,6 +144,7 @@ static int resources_available(void) static int cpufreq_init(struct cpufreq_policy *policy) { struct cpufreq_frequency_table *freq_table; + struct opp_table *opp_table = NULL; struct private_data *priv; struct device *cpu_dev; struct clk *cpu_clk; @@ -186,8 +188,9 @@ static int cpufreq_init(struct cpufreq_policy *policy) */ name = find_supply_name(cpu_dev); if (name) { - ret = dev_pm_opp_set_regulator(cpu_dev, name); - if (ret) { + opp_table = dev_pm_opp_set_regulators(cpu_dev, &name, 1); + if (IS_ERR(opp_table)) { + ret = PTR_ERR(opp_table); dev_err(cpu_dev, "Failed to set regulator for cpu%d: %d\n", policy->cpu, ret); goto out_put_clk; @@ -237,6 +240,7 @@ static int cpufreq_init(struct cpufreq_policy *policy) } priv->reg_name = name; + priv->opp_table = opp_table; ret = dev_pm_opp_init_cpufreq_table(cpu_dev, &freq_table); if (ret) { @@ -285,7 +289,7 @@ out_free_priv: out_free_opp: dev_pm_opp_of_cpumask_remove_table(policy->cpus); if (name) - dev_pm_opp_put_regulator(cpu_dev); + dev_pm_opp_put_regulators(opp_table); out_put_clk: clk_put(cpu_clk); @@ -300,7 +304,7 @@ static int cpufreq_exit(struct cpufreq_policy *policy) dev_pm_opp_free_cpufreq_table(priv->cpu_dev, &policy->freq_table); dev_pm_opp_of_cpumask_remove_table(policy->related_cpus); if (priv->reg_name) - dev_pm_opp_put_regulator(priv->cpu_dev); + dev_pm_opp_put_regulators(priv->opp_table); clk_put(policy->clk); kfree(priv); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 6e6c1fb60fbc..cc475eff90b3 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1526,7 +1526,10 @@ unsigned int cpufreq_get(unsigned int cpu) if (policy) { down_read(&policy->rwsem); - ret_freq = __cpufreq_get(policy); + + if (!policy_is_inactive(policy)) + ret_freq = __cpufreq_get(policy); + up_read(&policy->rwsem); cpufreq_cpu_put(policy); @@ -2254,17 +2257,19 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, * Useful for policy notifiers which have different necessities * at different times. */ -int cpufreq_update_policy(unsigned int cpu) +void cpufreq_update_policy(unsigned int cpu) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); struct cpufreq_policy new_policy; - int ret; if (!policy) - return -ENODEV; + return; down_write(&policy->rwsem); + if (policy_is_inactive(policy)) + goto unlock; + pr_debug("updating policy for CPU %u\n", cpu); memcpy(&new_policy, policy, sizeof(*policy)); new_policy.min = policy->user_policy.min; @@ -2275,24 +2280,20 @@ int cpufreq_update_policy(unsigned int cpu) * -> ask driver for current freq and notify governors about a change */ if (cpufreq_driver->get && !cpufreq_driver->setpolicy) { - if (cpufreq_suspended) { - ret = -EAGAIN; + if (cpufreq_suspended) goto unlock; - } + new_policy.cur = cpufreq_update_current_freq(policy); - if (WARN_ON(!new_policy.cur)) { - ret = -EIO; + if (WARN_ON(!new_policy.cur)) goto unlock; - } } - ret = cpufreq_set_policy(policy, &new_policy); + cpufreq_set_policy(policy, &new_policy); unlock: up_write(&policy->rwsem); cpufreq_cpu_put(policy); - return ret; } EXPORT_SYMBOL(cpufreq_update_policy); diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 13475890d792..992f7c20760f 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -37,16 +37,16 @@ struct cs_dbs_tuners { #define DEF_SAMPLING_DOWN_FACTOR (1) #define MAX_SAMPLING_DOWN_FACTOR (10) -static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, - struct cpufreq_policy *policy) +static inline unsigned int get_freq_step(struct cs_dbs_tuners *cs_tuners, + struct cpufreq_policy *policy) { - unsigned int freq_target = (cs_tuners->freq_step * policy->max) / 100; + unsigned int freq_step = (cs_tuners->freq_step * policy->max) / 100; /* max freq cannot be less than 100. But who knows... */ - if (unlikely(freq_target == 0)) - freq_target = DEF_FREQUENCY_STEP; + if (unlikely(freq_step == 0)) + freq_step = DEF_FREQUENCY_STEP; - return freq_target; + return freq_step; } /* @@ -55,10 +55,10 @@ static inline unsigned int get_freq_target(struct cs_dbs_tuners *cs_tuners, * sampling_down_factor, we check, if current idle time is more than 80% * (default), then we try to decrease frequency * - * Any frequency increase takes it to the maximum frequency. Frequency reduction - * happens at minimum steps of 5% (default) of maximum frequency + * Frequency updates happen at minimum steps of 5% (default) of maximum + * frequency */ -static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) +static unsigned int cs_dbs_update(struct cpufreq_policy *policy) { struct policy_dbs_info *policy_dbs = policy->governor_data; struct cs_policy_dbs_info *dbs_info = to_dbs_info(policy_dbs); @@ -66,6 +66,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) struct dbs_data *dbs_data = policy_dbs->dbs_data; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int load = dbs_update(policy); + unsigned int freq_step; /* * break out if we 'cannot' reduce the speed as the user might @@ -82,6 +83,23 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) if (requested_freq > policy->max || requested_freq < policy->min) requested_freq = policy->cur; + freq_step = get_freq_step(cs_tuners, policy); + + /* + * Decrease requested_freq one freq_step for each idle period that + * we didn't update the frequency. + */ + if (policy_dbs->idle_periods < UINT_MAX) { + unsigned int freq_steps = policy_dbs->idle_periods * freq_step; + + if (requested_freq > freq_steps) + requested_freq -= freq_steps; + else + requested_freq = policy->min; + + policy_dbs->idle_periods = UINT_MAX; + } + /* Check for frequency increase */ if (load > dbs_data->up_threshold) { dbs_info->down_skip = 0; @@ -90,7 +108,7 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) if (requested_freq == policy->max) goto out; - requested_freq += get_freq_target(cs_tuners, policy); + requested_freq += freq_step; if (requested_freq > policy->max) requested_freq = policy->max; @@ -106,16 +124,14 @@ static unsigned int cs_dbs_timer(struct cpufreq_policy *policy) /* Check for frequency decrease */ if (load < cs_tuners->down_threshold) { - unsigned int freq_target; /* * if we cannot reduce the frequency anymore, break out early */ if (requested_freq == policy->min) goto out; - freq_target = get_freq_target(cs_tuners, policy); - if (requested_freq > freq_target) - requested_freq -= freq_target; + if (requested_freq > freq_step) + requested_freq -= freq_step; else requested_freq = policy->min; @@ -305,7 +321,7 @@ static void cs_start(struct cpufreq_policy *policy) static struct dbs_governor cs_governor = { .gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("conservative"), .kobj_type = { .default_attrs = cs_attributes }, - .gov_dbs_timer = cs_dbs_timer, + .gov_dbs_update = cs_dbs_update, .alloc = cs_alloc, .free = cs_free, .init = cs_init, diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 642dd0f183a8..0196467280bd 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -61,7 +61,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf, * entries can't be freed concurrently. */ list_for_each_entry(policy_dbs, &attr_set->policy_list, list) { - mutex_lock(&policy_dbs->timer_mutex); + mutex_lock(&policy_dbs->update_mutex); /* * On 32-bit architectures this may race with the * sample_delay_ns read in dbs_update_util_handler(), but that @@ -76,7 +76,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf, * taken, so it shouldn't be significant. */ gov_update_sample_delay(policy_dbs, 0); - mutex_unlock(&policy_dbs->timer_mutex); + mutex_unlock(&policy_dbs->update_mutex); } return count; @@ -117,7 +117,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy) struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; unsigned int ignore_nice = dbs_data->ignore_nice_load; - unsigned int max_load = 0; + unsigned int max_load = 0, idle_periods = UINT_MAX; unsigned int sampling_rate, io_busy, j; /* @@ -215,9 +215,19 @@ unsigned int dbs_update(struct cpufreq_policy *policy) j_cdbs->prev_load = load; } + if (time_elapsed > 2 * sampling_rate) { + unsigned int periods = time_elapsed / sampling_rate; + + if (periods < idle_periods) + idle_periods = periods; + } + if (load > max_load) max_load = load; } + + policy_dbs->idle_periods = idle_periods; + return max_load; } EXPORT_SYMBOL_GPL(dbs_update); @@ -236,9 +246,9 @@ static void dbs_work_handler(struct work_struct *work) * Make sure cpufreq_governor_limits() isn't evaluating load or the * ondemand governor isn't updating the sampling rate in parallel. */ - mutex_lock(&policy_dbs->timer_mutex); - gov_update_sample_delay(policy_dbs, gov->gov_dbs_timer(policy)); - mutex_unlock(&policy_dbs->timer_mutex); + mutex_lock(&policy_dbs->update_mutex); + gov_update_sample_delay(policy_dbs, gov->gov_dbs_update(policy)); + mutex_unlock(&policy_dbs->update_mutex); /* Allow the utilization update handler to queue up more work. */ atomic_set(&policy_dbs->work_count, 0); @@ -348,7 +358,7 @@ static struct policy_dbs_info *alloc_policy_dbs_info(struct cpufreq_policy *poli return NULL; policy_dbs->policy = policy; - mutex_init(&policy_dbs->timer_mutex); + mutex_init(&policy_dbs->update_mutex); atomic_set(&policy_dbs->work_count, 0); init_irq_work(&policy_dbs->irq_work, dbs_irq_work); INIT_WORK(&policy_dbs->work, dbs_work_handler); @@ -367,7 +377,7 @@ static void free_policy_dbs_info(struct policy_dbs_info *policy_dbs, { int j; - mutex_destroy(&policy_dbs->timer_mutex); + mutex_destroy(&policy_dbs->update_mutex); for_each_cpu(j, policy_dbs->policy->related_cpus) { struct cpu_dbs_info *j_cdbs = &per_cpu(cpu_dbs, j); @@ -547,10 +557,10 @@ void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy) { struct policy_dbs_info *policy_dbs = policy->governor_data; - mutex_lock(&policy_dbs->timer_mutex); + mutex_lock(&policy_dbs->update_mutex); cpufreq_policy_apply_limits(policy); gov_update_sample_delay(policy_dbs, 0); - mutex_unlock(&policy_dbs->timer_mutex); + mutex_unlock(&policy_dbs->update_mutex); } EXPORT_SYMBOL_GPL(cpufreq_dbs_governor_limits); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index ef1037e9c92b..f5717ca070cc 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -85,7 +85,7 @@ struct policy_dbs_info { * Per policy mutex that serializes load evaluation from limit-change * and work-handler. */ - struct mutex timer_mutex; + struct mutex update_mutex; u64 last_sample_time; s64 sample_delay_ns; @@ -97,6 +97,7 @@ struct policy_dbs_info { struct list_head list; /* Multiplier for increasing sample delay temporarily. */ unsigned int rate_mult; + unsigned int idle_periods; /* For conservative */ /* Status indicators */ bool is_shared; /* This object is used by multiple CPUs */ bool work_in_progress; /* Work is being queued up or in progress */ @@ -135,7 +136,7 @@ struct dbs_governor { */ struct dbs_data *gdbs_data; - unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy); + unsigned int (*gov_dbs_update)(struct cpufreq_policy *policy); struct policy_dbs_info *(*alloc)(void); void (*free)(struct policy_dbs_info *policy_dbs); int (*init)(struct dbs_data *dbs_data); diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index 3a1f49f5f4c6..4a017e895296 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -25,7 +25,7 @@ #define MAX_SAMPLING_DOWN_FACTOR (100000) #define MICRO_FREQUENCY_UP_THRESHOLD (95) #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) -#define MIN_FREQUENCY_UP_THRESHOLD (11) +#define MIN_FREQUENCY_UP_THRESHOLD (1) #define MAX_FREQUENCY_UP_THRESHOLD (100) static struct od_ops od_ops; @@ -169,7 +169,7 @@ static void od_update(struct cpufreq_policy *policy) } } -static unsigned int od_dbs_timer(struct cpufreq_policy *policy) +static unsigned int od_dbs_update(struct cpufreq_policy *policy) { struct policy_dbs_info *policy_dbs = policy->governor_data; struct dbs_data *dbs_data = policy_dbs->dbs_data; @@ -191,7 +191,7 @@ static unsigned int od_dbs_timer(struct cpufreq_policy *policy) od_update(policy); if (dbs_info->freq_lo) { - /* Setup timer for SUB_SAMPLE */ + /* Setup SUB_SAMPLE */ dbs_info->sample_type = OD_SUB_SAMPLE; return dbs_info->freq_hi_delay_us; } @@ -255,11 +255,11 @@ static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set, list_for_each_entry(policy_dbs, &attr_set->policy_list, list) { /* * Doing this without locking might lead to using different - * rate_mult values in od_update() and od_dbs_timer(). + * rate_mult values in od_update() and od_dbs_update(). */ - mutex_lock(&policy_dbs->timer_mutex); + mutex_lock(&policy_dbs->update_mutex); policy_dbs->rate_mult = 1; - mutex_unlock(&policy_dbs->timer_mutex); + mutex_unlock(&policy_dbs->update_mutex); } return count; @@ -374,8 +374,7 @@ static int od_init(struct dbs_data *dbs_data) dbs_data->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; /* * In nohz/micro accounting case we set the minimum frequency - * not depending on HZ, but fixed (very low). The deferred - * timer might skip some samples if idle/sleeping as needed. + * not depending on HZ, but fixed (very low). */ dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE; } else { @@ -415,7 +414,7 @@ static struct od_ops od_ops = { static struct dbs_governor od_dbs_gov = { .gov = CPUFREQ_DBS_GOVERNOR_INITIALIZER("ondemand"), .kobj_type = { .default_attrs = od_attributes }, - .gov_dbs_timer = od_dbs_timer, + .gov_dbs_update = od_dbs_update, .alloc = od_alloc, .free = od_free, .init = od_init, diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index 06d3abdffd3a..ac284e66839c 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -41,6 +41,18 @@ static int cpufreq_stats_update(struct cpufreq_stats *stats) return 0; } +static void cpufreq_stats_clear_table(struct cpufreq_stats *stats) +{ + unsigned int count = stats->max_state; + + memset(stats->time_in_state, 0, count * sizeof(u64)); +#ifdef CONFIG_CPU_FREQ_STAT_DETAILS + memset(stats->trans_table, 0, count * count * sizeof(int)); +#endif + stats->last_time = get_jiffies_64(); + stats->total_trans = 0; +} + static ssize_t show_total_trans(struct cpufreq_policy *policy, char *buf) { return sprintf(buf, "%d\n", policy->stats->total_trans); @@ -64,6 +76,14 @@ static ssize_t show_time_in_state(struct cpufreq_policy *policy, char *buf) return len; } +static ssize_t store_reset(struct cpufreq_policy *policy, const char *buf, + size_t count) +{ + /* We don't care what is written to the attribute. */ + cpufreq_stats_clear_table(policy->stats); + return count; +} + #ifdef CONFIG_CPU_FREQ_STAT_DETAILS static ssize_t show_trans_table(struct cpufreq_policy *policy, char *buf) { @@ -113,10 +133,12 @@ cpufreq_freq_attr_ro(trans_table); cpufreq_freq_attr_ro(total_trans); cpufreq_freq_attr_ro(time_in_state); +cpufreq_freq_attr_wo(reset); static struct attribute *default_attrs[] = { &total_trans.attr, &time_in_state.attr, + &reset.attr, #ifdef CONFIG_CPU_FREQ_STAT_DETAILS &trans_table.attr, #endif diff --git a/drivers/cpufreq/integrator-cpufreq.c b/drivers/cpufreq/integrator-cpufreq.c deleted file mode 100644 index 79e3ff2771a6..000000000000 --- a/drivers/cpufreq/integrator-cpufreq.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (C) 2001-2002 Deep Blue Solutions Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * CPU support functions - */ -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/cpufreq.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/init.h> -#include <linux/io.h> -#include <linux/platform_device.h> -#include <linux/of.h> -#include <linux/of_address.h> - -#include <asm/mach-types.h> -#include <asm/hardware/icst.h> - -static void __iomem *cm_base; -/* The cpufreq driver only use the OSC register */ -#define INTEGRATOR_HDR_OSC_OFFSET 0x08 -#define INTEGRATOR_HDR_LOCK_OFFSET 0x14 - -static struct cpufreq_driver integrator_driver; - -static const struct icst_params lclk_params = { - .ref = 24000000, - .vco_max = ICST525_VCO_MAX_5V, - .vco_min = ICST525_VCO_MIN, - .vd_min = 8, - .vd_max = 132, - .rd_min = 24, - .rd_max = 24, - .s2div = icst525_s2div, - .idx2s = icst525_idx2s, -}; - -static const struct icst_params cclk_params = { - .ref = 24000000, - .vco_max = ICST525_VCO_MAX_5V, - .vco_min = ICST525_VCO_MIN, - .vd_min = 12, - .vd_max = 160, - .rd_min = 24, - .rd_max = 24, - .s2div = icst525_s2div, - .idx2s = icst525_idx2s, -}; - -/* - * Validate the speed policy. - */ -static int integrator_verify_policy(struct cpufreq_policy *policy) -{ - struct icst_vco vco; - - cpufreq_verify_within_cpu_limits(policy); - - vco = icst_hz_to_vco(&cclk_params, policy->max * 1000); - policy->max = icst_hz(&cclk_params, vco) / 1000; - - vco = icst_hz_to_vco(&cclk_params, policy->min * 1000); - policy->min = icst_hz(&cclk_params, vco) / 1000; - - cpufreq_verify_within_cpu_limits(policy); - return 0; -} - - -static int integrator_set_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - cpumask_t cpus_allowed; - int cpu = policy->cpu; - struct icst_vco vco; - struct cpufreq_freqs freqs; - u_int cm_osc; - - /* - * Save this threads cpus_allowed mask. - */ - cpus_allowed = current->cpus_allowed; - - /* - * Bind to the specified CPU. When this call returns, - * we should be running on the right CPU. - */ - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - BUG_ON(cpu != smp_processor_id()); - - /* get current setting */ - cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET); - - if (machine_is_integrator()) - vco.s = (cm_osc >> 8) & 7; - else if (machine_is_cintegrator()) - vco.s = 1; - vco.v = cm_osc & 255; - vco.r = 22; - freqs.old = icst_hz(&cclk_params, vco) / 1000; - - /* icst_hz_to_vco rounds down -- so we need the next - * larger freq in case of CPUFREQ_RELATION_L. - */ - if (relation == CPUFREQ_RELATION_L) - target_freq += 999; - if (target_freq > policy->max) - target_freq = policy->max; - vco = icst_hz_to_vco(&cclk_params, target_freq * 1000); - freqs.new = icst_hz(&cclk_params, vco) / 1000; - - if (freqs.old == freqs.new) { - set_cpus_allowed_ptr(current, &cpus_allowed); - return 0; - } - - cpufreq_freq_transition_begin(policy, &freqs); - - cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET); - - if (machine_is_integrator()) { - cm_osc &= 0xfffff800; - cm_osc |= vco.s << 8; - } else if (machine_is_cintegrator()) { - cm_osc &= 0xffffff00; - } - cm_osc |= vco.v; - - __raw_writel(0xa05f, cm_base + INTEGRATOR_HDR_LOCK_OFFSET); - __raw_writel(cm_osc, cm_base + INTEGRATOR_HDR_OSC_OFFSET); - __raw_writel(0, cm_base + INTEGRATOR_HDR_LOCK_OFFSET); - - /* - * Restore the CPUs allowed mask. - */ - set_cpus_allowed_ptr(current, &cpus_allowed); - - cpufreq_freq_transition_end(policy, &freqs, 0); - - return 0; -} - -static unsigned int integrator_get(unsigned int cpu) -{ - cpumask_t cpus_allowed; - unsigned int current_freq; - u_int cm_osc; - struct icst_vco vco; - - cpus_allowed = current->cpus_allowed; - - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - BUG_ON(cpu != smp_processor_id()); - - /* detect memory etc. */ - cm_osc = __raw_readl(cm_base + INTEGRATOR_HDR_OSC_OFFSET); - - if (machine_is_integrator()) - vco.s = (cm_osc >> 8) & 7; - else - vco.s = 1; - vco.v = cm_osc & 255; - vco.r = 22; - - current_freq = icst_hz(&cclk_params, vco) / 1000; /* current freq */ - - set_cpus_allowed_ptr(current, &cpus_allowed); - - return current_freq; -} - -static int integrator_cpufreq_init(struct cpufreq_policy *policy) -{ - - /* set default policy and cpuinfo */ - policy->max = policy->cpuinfo.max_freq = 160000; - policy->min = policy->cpuinfo.min_freq = 12000; - policy->cpuinfo.transition_latency = 1000000; /* 1 ms, assumed */ - - return 0; -} - -static struct cpufreq_driver integrator_driver = { - .flags = CPUFREQ_NEED_INITIAL_FREQ_CHECK, - .verify = integrator_verify_policy, - .target = integrator_set_target, - .get = integrator_get, - .init = integrator_cpufreq_init, - .name = "integrator", -}; - -static int __init integrator_cpufreq_probe(struct platform_device *pdev) -{ - struct resource *res; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - - cm_base = devm_ioremap(&pdev->dev, res->start, resource_size(res)); - if (!cm_base) - return -ENODEV; - - return cpufreq_register_driver(&integrator_driver); -} - -static int __exit integrator_cpufreq_remove(struct platform_device *pdev) -{ - return cpufreq_unregister_driver(&integrator_driver); -} - -static const struct of_device_id integrator_cpufreq_match[] = { - { .compatible = "arm,core-module-integrator"}, - { }, -}; - -MODULE_DEVICE_TABLE(of, integrator_cpufreq_match); - -static struct platform_driver integrator_cpufreq_driver = { - .driver = { - .name = "integrator-cpufreq", - .of_match_table = integrator_cpufreq_match, - }, - .remove = __exit_p(integrator_cpufreq_remove), -}; - -module_platform_driver_probe(integrator_cpufreq_driver, - integrator_cpufreq_probe); - -MODULE_AUTHOR("Russell M. King"); -MODULE_DESCRIPTION("cpufreq driver for ARM Integrator CPUs"); -MODULE_LICENSE("GPL"); diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index e8dc42fc0915..6acbd4af632e 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -37,6 +37,8 @@ #include <asm/cpufeature.h> #include <asm/intel-family.h> +#define INTEL_CPUFREQ_TRANSITION_LATENCY 20000 + #define ATOM_RATIOS 0x66a #define ATOM_VIDS 0x66b #define ATOM_TURBO_RATIOS 0x66c @@ -53,6 +55,8 @@ #define EXT_BITS 6 #define EXT_FRAC_BITS (EXT_BITS + FRAC_BITS) +#define fp_ext_toint(X) ((X) >> EXT_FRAC_BITS) +#define int_ext_tofp(X) ((int64_t)(X) << EXT_FRAC_BITS) static inline int32_t mul_fp(int32_t x, int32_t y) { @@ -123,6 +127,8 @@ struct sample { * @scaling: Scaling factor to convert frequency to cpufreq * frequency units * @turbo_pstate: Max Turbo P state possible for this platform + * @max_freq: @max_pstate frequency in cpufreq units + * @turbo_freq: @turbo_pstate frequency in cpufreq units * * Stores the per cpu model P state limits and current P state. */ @@ -133,6 +139,8 @@ struct pstate_data { int max_pstate_physical; int scaling; int turbo_pstate; + unsigned int max_freq; + unsigned int turbo_freq; }; /** @@ -178,6 +186,48 @@ struct _pid { }; /** + * struct perf_limits - Store user and policy limits + * @no_turbo: User requested turbo state from intel_pstate sysfs + * @turbo_disabled: Platform turbo status either from msr + * MSR_IA32_MISC_ENABLE or when maximum available pstate + * matches the maximum turbo pstate + * @max_perf_pct: Effective maximum performance limit in percentage, this + * is minimum of either limits enforced by cpufreq policy + * or limits from user set limits via intel_pstate sysfs + * @min_perf_pct: Effective minimum performance limit in percentage, this + * is maximum of either limits enforced by cpufreq policy + * or limits from user set limits via intel_pstate sysfs + * @max_perf: This is a scaled value between 0 to 255 for max_perf_pct + * This value is used to limit max pstate + * @min_perf: This is a scaled value between 0 to 255 for min_perf_pct + * This value is used to limit min pstate + * @max_policy_pct: The maximum performance in percentage enforced by + * cpufreq setpolicy interface + * @max_sysfs_pct: The maximum performance in percentage enforced by + * intel pstate sysfs interface, unused when per cpu + * controls are enforced + * @min_policy_pct: The minimum performance in percentage enforced by + * cpufreq setpolicy interface + * @min_sysfs_pct: The minimum performance in percentage enforced by + * intel pstate sysfs interface, unused when per cpu + * controls are enforced + * + * Storage for user and policy defined limits. + */ +struct perf_limits { + int no_turbo; + int turbo_disabled; + int max_perf_pct; + int min_perf_pct; + int32_t max_perf; + int32_t min_perf; + int max_policy_pct; + int max_sysfs_pct; + int min_policy_pct; + int min_sysfs_pct; +}; + +/** * struct cpudata - Per CPU instance data storage * @cpu: CPU number for this instance data * @policy: CPUFreq policy value @@ -195,8 +245,19 @@ struct _pid { * @prev_cummulative_iowait: IO Wait time difference from last and * current sample * @sample: Storage for storing last Sample data + * @perf_limits: Pointer to perf_limit unique to this CPU + * Not all field in the structure are applicable + * when per cpu controls are enforced * @acpi_perf_data: Stores ACPI perf information read from _PSS * @valid_pss_table: Set to true for valid ACPI _PSS entries found + * @epp_powersave: Last saved HWP energy performance preference + * (EPP) or energy performance bias (EPB), + * when policy switched to performance + * @epp_policy: Last saved policy used to set EPP/EPB + * @epp_default: Power on default HWP energy performance + * preference/bias + * @epp_saved: Saved EPP/EPB during system suspend or CPU offline + * operation * * This structure stores per CPU instance data for all CPUs. */ @@ -218,11 +279,16 @@ struct cpudata { u64 prev_tsc; u64 prev_cummulative_iowait; struct sample sample; + struct perf_limits *perf_limits; #ifdef CONFIG_ACPI struct acpi_processor_performance acpi_perf_data; bool valid_pss_table; #endif unsigned int iowait_boost; + s16 epp_powersave; + s16 epp_policy; + s16 epp_default; + s16 epp_saved; }; static struct cpudata **all_cpu_data; @@ -236,7 +302,6 @@ static struct cpudata **all_cpu_data; * @p_gain_pct: PID proportional gain * @i_gain_pct: PID integral gain * @d_gain_pct: PID derivative gain - * @boost_iowait: Whether or not to use iowait boosting. * * Stores per CPU model static PID configuration data. */ @@ -248,7 +313,6 @@ struct pstate_adjust_policy { int p_gain_pct; int d_gain_pct; int i_gain_pct; - bool boost_iowait; }; /** @@ -292,58 +356,19 @@ static inline int32_t get_target_pstate_use_cpu_load(struct cpudata *cpu); static struct pstate_adjust_policy pid_params __read_mostly; static struct pstate_funcs pstate_funcs __read_mostly; static int hwp_active __read_mostly; +static bool per_cpu_limits __read_mostly; #ifdef CONFIG_ACPI static bool acpi_ppc; #endif -/** - * struct perf_limits - Store user and policy limits - * @no_turbo: User requested turbo state from intel_pstate sysfs - * @turbo_disabled: Platform turbo status either from msr - * MSR_IA32_MISC_ENABLE or when maximum available pstate - * matches the maximum turbo pstate - * @max_perf_pct: Effective maximum performance limit in percentage, this - * is minimum of either limits enforced by cpufreq policy - * or limits from user set limits via intel_pstate sysfs - * @min_perf_pct: Effective minimum performance limit in percentage, this - * is maximum of either limits enforced by cpufreq policy - * or limits from user set limits via intel_pstate sysfs - * @max_perf: This is a scaled value between 0 to 255 for max_perf_pct - * This value is used to limit max pstate - * @min_perf: This is a scaled value between 0 to 255 for min_perf_pct - * This value is used to limit min pstate - * @max_policy_pct: The maximum performance in percentage enforced by - * cpufreq setpolicy interface - * @max_sysfs_pct: The maximum performance in percentage enforced by - * intel pstate sysfs interface - * @min_policy_pct: The minimum performance in percentage enforced by - * cpufreq setpolicy interface - * @min_sysfs_pct: The minimum performance in percentage enforced by - * intel pstate sysfs interface - * - * Storage for user and policy defined limits. - */ -struct perf_limits { - int no_turbo; - int turbo_disabled; - int max_perf_pct; - int min_perf_pct; - int32_t max_perf; - int32_t min_perf; - int max_policy_pct; - int max_sysfs_pct; - int min_policy_pct; - int min_sysfs_pct; -}; - static struct perf_limits performance_limits = { .no_turbo = 0, .turbo_disabled = 0, .max_perf_pct = 100, - .max_perf = int_tofp(1), + .max_perf = int_ext_tofp(1), .min_perf_pct = 100, - .min_perf = int_tofp(1), + .min_perf = int_ext_tofp(1), .max_policy_pct = 100, .max_sysfs_pct = 100, .min_policy_pct = 0, @@ -354,7 +379,7 @@ static struct perf_limits powersave_limits = { .no_turbo = 0, .turbo_disabled = 0, .max_perf_pct = 100, - .max_perf = int_tofp(1), + .max_perf = int_ext_tofp(1), .min_perf_pct = 0, .min_perf = 0, .max_policy_pct = 100, @@ -369,6 +394,8 @@ static struct perf_limits *limits = &performance_limits; static struct perf_limits *limits = &powersave_limits; #endif +static DEFINE_MUTEX(intel_pstate_limits_lock); + #ifdef CONFIG_ACPI static bool intel_pstate_get_ppc_enable_status(void) @@ -513,11 +540,11 @@ static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) } #else -static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) +static inline void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy) { } -static void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) +static inline void intel_pstate_exit_perf_limits(struct cpufreq_policy *policy) { } #endif @@ -613,24 +640,252 @@ static inline void update_turbo_state(void) cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); } +static s16 intel_pstate_get_epb(struct cpudata *cpu_data) +{ + u64 epb; + int ret; + + if (!static_cpu_has(X86_FEATURE_EPB)) + return -ENXIO; + + ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret) + return (s16)ret; + + return (s16)(epb & 0x0f); +} + +static s16 intel_pstate_get_epp(struct cpudata *cpu_data, u64 hwp_req_data) +{ + s16 epp; + + if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + /* + * When hwp_req_data is 0, means that caller didn't read + * MSR_HWP_REQUEST, so need to read and get EPP. + */ + if (!hwp_req_data) { + epp = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, + &hwp_req_data); + if (epp) + return epp; + } + epp = (hwp_req_data >> 24) & 0xff; + } else { + /* When there is no EPP present, HWP uses EPB settings */ + epp = intel_pstate_get_epb(cpu_data); + } + + return epp; +} + +static int intel_pstate_set_epb(int cpu, s16 pref) +{ + u64 epb; + int ret; + + if (!static_cpu_has(X86_FEATURE_EPB)) + return -ENXIO; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret) + return ret; + + epb = (epb & ~0x0f) | pref; + wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, epb); + + return 0; +} + +/* + * EPP/EPB display strings corresponding to EPP index in the + * energy_perf_strings[] + * index String + *------------------------------------- + * 0 default + * 1 performance + * 2 balance_performance + * 3 balance_power + * 4 power + */ +static const char * const energy_perf_strings[] = { + "default", + "performance", + "balance_performance", + "balance_power", + "power", + NULL +}; + +static int intel_pstate_get_energy_pref_index(struct cpudata *cpu_data) +{ + s16 epp; + int index = -EINVAL; + + epp = intel_pstate_get_epp(cpu_data, 0); + if (epp < 0) + return epp; + + if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + /* + * Range: + * 0x00-0x3F : Performance + * 0x40-0x7F : Balance performance + * 0x80-0xBF : Balance power + * 0xC0-0xFF : Power + * The EPP is a 8 bit value, but our ranges restrict the + * value which can be set. Here only using top two bits + * effectively. + */ + index = (epp >> 6) + 1; + } else if (static_cpu_has(X86_FEATURE_EPB)) { + /* + * Range: + * 0x00-0x03 : Performance + * 0x04-0x07 : Balance performance + * 0x08-0x0B : Balance power + * 0x0C-0x0F : Power + * The EPB is a 4 bit value, but our ranges restrict the + * value which can be set. Here only using top two bits + * effectively. + */ + index = (epp >> 2) + 1; + } + + return index; +} + +static int intel_pstate_set_energy_pref_index(struct cpudata *cpu_data, + int pref_index) +{ + int epp = -EINVAL; + int ret; + + if (!pref_index) + epp = cpu_data->epp_default; + + mutex_lock(&intel_pstate_limits_lock); + + if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + u64 value; + + ret = rdmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, &value); + if (ret) + goto return_pref; + + value &= ~GENMASK_ULL(31, 24); + + /* + * If epp is not default, convert from index into + * energy_perf_strings to epp value, by shifting 6 + * bits left to use only top two bits in epp. + * The resultant epp need to shifted by 24 bits to + * epp position in MSR_HWP_REQUEST. + */ + if (epp == -EINVAL) + epp = (pref_index - 1) << 6; + + value |= (u64)epp << 24; + ret = wrmsrl_on_cpu(cpu_data->cpu, MSR_HWP_REQUEST, value); + } else { + if (epp == -EINVAL) + epp = (pref_index - 1) << 2; + ret = intel_pstate_set_epb(cpu_data->cpu, epp); + } +return_pref: + mutex_unlock(&intel_pstate_limits_lock); + + return ret; +} + +static ssize_t show_energy_performance_available_preferences( + struct cpufreq_policy *policy, char *buf) +{ + int i = 0; + int ret = 0; + + while (energy_perf_strings[i] != NULL) + ret += sprintf(&buf[ret], "%s ", energy_perf_strings[i++]); + + ret += sprintf(&buf[ret], "\n"); + + return ret; +} + +cpufreq_freq_attr_ro(energy_performance_available_preferences); + +static ssize_t store_energy_performance_preference( + struct cpufreq_policy *policy, const char *buf, size_t count) +{ + struct cpudata *cpu_data = all_cpu_data[policy->cpu]; + char str_preference[21]; + int ret, i = 0; + + ret = sscanf(buf, "%20s", str_preference); + if (ret != 1) + return -EINVAL; + + while (energy_perf_strings[i] != NULL) { + if (!strcmp(str_preference, energy_perf_strings[i])) { + intel_pstate_set_energy_pref_index(cpu_data, i); + return count; + } + ++i; + } + + return -EINVAL; +} + +static ssize_t show_energy_performance_preference( + struct cpufreq_policy *policy, char *buf) +{ + struct cpudata *cpu_data = all_cpu_data[policy->cpu]; + int preference; + + preference = intel_pstate_get_energy_pref_index(cpu_data); + if (preference < 0) + return preference; + + return sprintf(buf, "%s\n", energy_perf_strings[preference]); +} + +cpufreq_freq_attr_rw(energy_performance_preference); + +static struct freq_attr *hwp_cpufreq_attrs[] = { + &energy_performance_preference, + &energy_performance_available_preferences, + NULL, +}; + static void intel_pstate_hwp_set(const struct cpumask *cpumask) { int min, hw_min, max, hw_max, cpu, range, adj_range; + struct perf_limits *perf_limits = limits; u64 value, cap; for_each_cpu(cpu, cpumask) { + int max_perf_pct, min_perf_pct; + struct cpudata *cpu_data = all_cpu_data[cpu]; + s16 epp; + + if (per_cpu_limits) + perf_limits = all_cpu_data[cpu]->perf_limits; + rdmsrl_on_cpu(cpu, MSR_HWP_CAPABILITIES, &cap); hw_min = HWP_LOWEST_PERF(cap); hw_max = HWP_HIGHEST_PERF(cap); range = hw_max - hw_min; + max_perf_pct = perf_limits->max_perf_pct; + min_perf_pct = perf_limits->min_perf_pct; + rdmsrl_on_cpu(cpu, MSR_HWP_REQUEST, &value); - adj_range = limits->min_perf_pct * range / 100; + adj_range = min_perf_pct * range / 100; min = hw_min + adj_range; value &= ~HWP_MIN_PERF(~0L); value |= HWP_MIN_PERF(min); - adj_range = limits->max_perf_pct * range / 100; + adj_range = max_perf_pct * range / 100; max = hw_min + adj_range; if (limits->no_turbo) { hw_max = HWP_GUARANTEED_PERF(cap); @@ -640,6 +895,53 @@ static void intel_pstate_hwp_set(const struct cpumask *cpumask) value &= ~HWP_MAX_PERF(~0L); value |= HWP_MAX_PERF(max); + + if (cpu_data->epp_policy == cpu_data->policy) + goto skip_epp; + + cpu_data->epp_policy = cpu_data->policy; + + if (cpu_data->epp_saved >= 0) { + epp = cpu_data->epp_saved; + cpu_data->epp_saved = -EINVAL; + goto update_epp; + } + + if (cpu_data->policy == CPUFREQ_POLICY_PERFORMANCE) { + epp = intel_pstate_get_epp(cpu_data, value); + cpu_data->epp_powersave = epp; + /* If EPP read was failed, then don't try to write */ + if (epp < 0) + goto skip_epp; + + + epp = 0; + } else { + /* skip setting EPP, when saved value is invalid */ + if (cpu_data->epp_powersave < 0) + goto skip_epp; + + /* + * No need to restore EPP when it is not zero. This + * means: + * - Policy is not changed + * - user has manually changed + * - Error reading EPB + */ + epp = intel_pstate_get_epp(cpu_data, value); + if (epp) + goto skip_epp; + + epp = cpu_data->epp_powersave; + } +update_epp: + if (static_cpu_has(X86_FEATURE_HWP_EPP)) { + value &= ~GENMASK_ULL(31, 24); + value |= (u64)epp << 24; + } else { + intel_pstate_set_epb(cpu, epp); + } +skip_epp: wrmsrl_on_cpu(cpu, MSR_HWP_REQUEST, value); } } @@ -652,6 +954,28 @@ static int intel_pstate_hwp_set_policy(struct cpufreq_policy *policy) return 0; } +static int intel_pstate_hwp_save_state(struct cpufreq_policy *policy) +{ + struct cpudata *cpu_data = all_cpu_data[policy->cpu]; + + if (!hwp_active) + return 0; + + cpu_data->epp_saved = intel_pstate_get_epp(cpu_data, 0); + + return 0; +} + +static int intel_pstate_resume(struct cpufreq_policy *policy) +{ + if (!hwp_active) + return 0; + + all_cpu_data[policy->cpu]->epp_policy = 0; + + return intel_pstate_hwp_set_policy(policy); +} + static void intel_pstate_hwp_set_online_cpus(void) { get_online_cpus(); @@ -694,8 +1018,10 @@ static void __init intel_pstate_debug_expose_params(void) struct dentry *debugfs_parent; int i = 0; - if (hwp_active) + if (hwp_active || + pstate_funcs.get_target_pstate == get_target_pstate_use_cpu_load) return; + debugfs_parent = debugfs_create_dir("pstate_snb", NULL); if (IS_ERR_OR_NULL(debugfs_parent)) return; @@ -768,9 +1094,12 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, if (ret != 1) return -EINVAL; + mutex_lock(&intel_pstate_limits_lock); + update_turbo_state(); if (limits->turbo_disabled) { pr_warn("Turbo disabled by BIOS or unavailable on processor\n"); + mutex_unlock(&intel_pstate_limits_lock); return -EPERM; } @@ -779,6 +1108,8 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, if (hwp_active) intel_pstate_hwp_set_online_cpus(); + mutex_unlock(&intel_pstate_limits_lock); + return count; } @@ -792,6 +1123,8 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, if (ret != 1) return -EINVAL; + mutex_lock(&intel_pstate_limits_lock); + limits->max_sysfs_pct = clamp_t(int, input, 0 , 100); limits->max_perf_pct = min(limits->max_policy_pct, limits->max_sysfs_pct); @@ -799,10 +1132,13 @@ static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, limits->max_perf_pct); limits->max_perf_pct = max(limits->min_perf_pct, limits->max_perf_pct); - limits->max_perf = div_fp(limits->max_perf_pct, 100); + limits->max_perf = div_ext_fp(limits->max_perf_pct, 100); if (hwp_active) intel_pstate_hwp_set_online_cpus(); + + mutex_unlock(&intel_pstate_limits_lock); + return count; } @@ -816,6 +1152,8 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, if (ret != 1) return -EINVAL; + mutex_lock(&intel_pstate_limits_lock); + limits->min_sysfs_pct = clamp_t(int, input, 0 , 100); limits->min_perf_pct = max(limits->min_policy_pct, limits->min_sysfs_pct); @@ -823,10 +1161,13 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, limits->min_perf_pct); limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct); - limits->min_perf = div_fp(limits->min_perf_pct, 100); + limits->min_perf = div_ext_fp(limits->min_perf_pct, 100); if (hwp_active) intel_pstate_hwp_set_online_cpus(); + + mutex_unlock(&intel_pstate_limits_lock); + return count; } @@ -841,8 +1182,6 @@ define_one_global_ro(num_pstates); static struct attribute *intel_pstate_attributes[] = { &no_turbo.attr, - &max_perf_pct.attr, - &min_perf_pct.attr, &turbo_pct.attr, &num_pstates.attr, NULL @@ -859,9 +1198,26 @@ static void __init intel_pstate_sysfs_expose_params(void) intel_pstate_kobject = kobject_create_and_add("intel_pstate", &cpu_subsys.dev_root->kobj); - BUG_ON(!intel_pstate_kobject); + if (WARN_ON(!intel_pstate_kobject)) + return; + rc = sysfs_create_group(intel_pstate_kobject, &intel_pstate_attr_group); - BUG_ON(rc); + if (WARN_ON(rc)) + return; + + /* + * If per cpu limits are enforced there are no global limits, so + * return without creating max/min_perf_pct attributes + */ + if (per_cpu_limits) + return; + + rc = sysfs_create_file(intel_pstate_kobject, &max_perf_pct.attr); + WARN_ON(rc); + + rc = sysfs_create_file(intel_pstate_kobject, &min_perf_pct.attr); + WARN_ON(rc); + } /************************** sysfs end ************************/ @@ -872,6 +1228,9 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata) wrmsrl_on_cpu(cpudata->cpu, MSR_HWP_INTERRUPT, 0x00); wrmsrl_on_cpu(cpudata->cpu, MSR_PM_ENABLE, 0x1); + cpudata->epp_policy = 0; + if (cpudata->epp_default == -EINVAL) + cpudata->epp_default = intel_pstate_get_epp(cpudata, 0); } static int atom_get_min_pstate(void) @@ -1099,7 +1458,6 @@ static const struct cpu_defaults silvermont_params = { .p_gain_pct = 14, .d_gain_pct = 0, .i_gain_pct = 4, - .boost_iowait = true, }, .funcs = { .get_max = atom_get_max_pstate, @@ -1121,7 +1479,6 @@ static const struct cpu_defaults airmont_params = { .p_gain_pct = 14, .d_gain_pct = 0, .i_gain_pct = 4, - .boost_iowait = true, }, .funcs = { .get_max = atom_get_max_pstate, @@ -1163,7 +1520,6 @@ static const struct cpu_defaults bxt_params = { .p_gain_pct = 14, .d_gain_pct = 0, .i_gain_pct = 4, - .boost_iowait = true, }, .funcs = { .get_max = core_get_max_pstate, @@ -1181,20 +1537,24 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) int max_perf = cpu->pstate.turbo_pstate; int max_perf_adj; int min_perf; + struct perf_limits *perf_limits = limits; if (limits->no_turbo || limits->turbo_disabled) max_perf = cpu->pstate.max_pstate; + if (per_cpu_limits) + perf_limits = cpu->perf_limits; + /* * performance can be limited by user through sysfs, by cpufreq * policy, or by cpu specific default values determined through * experimentation. */ - max_perf_adj = fp_toint(max_perf * limits->max_perf); + max_perf_adj = fp_ext_toint(max_perf * perf_limits->max_perf); *max = clamp_t(int, max_perf_adj, cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); - min_perf = fp_toint(max_perf * limits->min_perf); + min_perf = fp_ext_toint(max_perf * perf_limits->min_perf); *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); } @@ -1232,6 +1592,8 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) cpu->pstate.max_pstate_physical = pstate_funcs.get_max_physical(); cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); cpu->pstate.scaling = pstate_funcs.get_scaling(); + cpu->pstate.max_freq = cpu->pstate.max_pstate * cpu->pstate.scaling; + cpu->pstate.turbo_freq = cpu->pstate.turbo_pstate * cpu->pstate.scaling; if (pstate_funcs.get_vid) pstate_funcs.get_vid(cpu); @@ -1370,15 +1732,19 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) return cpu->pstate.current_pstate - pid_calc(&cpu->pid, perf_scaled); } -static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) +static int intel_pstate_prepare_request(struct cpudata *cpu, int pstate) { int max_perf, min_perf; - update_turbo_state(); - intel_pstate_get_min_max(cpu, &min_perf, &max_perf); pstate = clamp_t(int, pstate, min_perf, max_perf); trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); + return pstate; +} + +static void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) +{ + pstate = intel_pstate_prepare_request(cpu, pstate); if (pstate == cpu->pstate.current_pstate) return; @@ -1396,6 +1762,8 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) target_pstate = cpu->policy == CPUFREQ_POLICY_PERFORMANCE ? cpu->pstate.turbo_pstate : pstate_funcs.get_target_pstate(cpu); + update_turbo_state(); + intel_pstate_update_pstate(cpu, target_pstate); sample = &cpu->sample; @@ -1416,7 +1784,7 @@ static void intel_pstate_update_util(struct update_util_data *data, u64 time, struct cpudata *cpu = container_of(data, struct cpudata, update_util); u64 delta_ns; - if (pid_params.boost_iowait) { + if (pstate_funcs.get_target_pstate == get_target_pstate_use_cpu_load) { if (flags & SCHED_CPUFREQ_IOWAIT) { cpu->iowait_boost = int_tofp(1); } else if (cpu->iowait_boost) { @@ -1462,6 +1830,7 @@ static const struct x86_cpu_id intel_pstate_cpu_ids[] = { ICPU(INTEL_FAM6_SKYLAKE_DESKTOP, core_params), ICPU(INTEL_FAM6_BROADWELL_XEON_D, core_params), ICPU(INTEL_FAM6_XEON_PHI_KNL, knl_params), + ICPU(INTEL_FAM6_XEON_PHI_KNM, knl_params), ICPU(INTEL_FAM6_ATOM_GOLDMONT, bxt_params), {} }; @@ -1478,11 +1847,26 @@ static int intel_pstate_init_cpu(unsigned int cpunum) { struct cpudata *cpu; - if (!all_cpu_data[cpunum]) - all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), - GFP_KERNEL); - if (!all_cpu_data[cpunum]) - return -ENOMEM; + cpu = all_cpu_data[cpunum]; + + if (!cpu) { + unsigned int size = sizeof(struct cpudata); + + if (per_cpu_limits) + size += sizeof(struct perf_limits); + + cpu = kzalloc(size, GFP_KERNEL); + if (!cpu) + return -ENOMEM; + + all_cpu_data[cpunum] = cpu; + if (per_cpu_limits) + cpu->perf_limits = (struct perf_limits *)(cpu + 1); + + cpu->epp_default = -EINVAL; + cpu->epp_powersave = -EINVAL; + cpu->epp_saved = -EINVAL; + } cpu = all_cpu_data[cpunum]; @@ -1541,18 +1925,57 @@ static void intel_pstate_set_performance_limits(struct perf_limits *limits) limits->no_turbo = 0; limits->turbo_disabled = 0; limits->max_perf_pct = 100; - limits->max_perf = int_tofp(1); + limits->max_perf = int_ext_tofp(1); limits->min_perf_pct = 100; - limits->min_perf = int_tofp(1); + limits->min_perf = int_ext_tofp(1); limits->max_policy_pct = 100; limits->max_sysfs_pct = 100; limits->min_policy_pct = 0; limits->min_sysfs_pct = 0; } +static void intel_pstate_update_perf_limits(struct cpufreq_policy *policy, + struct perf_limits *limits) +{ + + limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100, + policy->cpuinfo.max_freq); + limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0, 100); + if (policy->max == policy->min) { + limits->min_policy_pct = limits->max_policy_pct; + } else { + limits->min_policy_pct = DIV_ROUND_UP(policy->min * 100, + policy->cpuinfo.max_freq); + limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, + 0, 100); + } + + /* Normalize user input to [min_policy_pct, max_policy_pct] */ + limits->min_perf_pct = max(limits->min_policy_pct, + limits->min_sysfs_pct); + limits->min_perf_pct = min(limits->max_policy_pct, + limits->min_perf_pct); + limits->max_perf_pct = min(limits->max_policy_pct, + limits->max_sysfs_pct); + limits->max_perf_pct = max(limits->min_policy_pct, + limits->max_perf_pct); + + /* Make sure min_perf_pct <= max_perf_pct */ + limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct); + + limits->min_perf = div_ext_fp(limits->min_perf_pct, 100); + limits->max_perf = div_ext_fp(limits->max_perf_pct, 100); + limits->max_perf = round_up(limits->max_perf, EXT_FRAC_BITS); + limits->min_perf = round_up(limits->min_perf, EXT_FRAC_BITS); + + pr_debug("cpu:%d max_perf_pct:%d min_perf_pct:%d\n", policy->cpu, + limits->max_perf_pct, limits->min_perf_pct); +} + static int intel_pstate_set_policy(struct cpufreq_policy *policy) { struct cpudata *cpu; + struct perf_limits *perf_limits = NULL; if (!policy->cpuinfo.max_freq) return -ENODEV; @@ -1570,41 +1993,31 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) policy->max = policy->cpuinfo.max_freq; } - if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) { - limits = &performance_limits; + if (per_cpu_limits) + perf_limits = cpu->perf_limits; + + mutex_lock(&intel_pstate_limits_lock); + + if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) { + if (!perf_limits) { + limits = &performance_limits; + perf_limits = limits; + } if (policy->max >= policy->cpuinfo.max_freq) { pr_debug("set performance\n"); - intel_pstate_set_performance_limits(limits); + intel_pstate_set_performance_limits(perf_limits); goto out; } } else { pr_debug("set powersave\n"); - limits = &powersave_limits; - } - - limits->min_policy_pct = (policy->min * 100) / policy->cpuinfo.max_freq; - limits->min_policy_pct = clamp_t(int, limits->min_policy_pct, 0 , 100); - limits->max_policy_pct = DIV_ROUND_UP(policy->max * 100, - policy->cpuinfo.max_freq); - limits->max_policy_pct = clamp_t(int, limits->max_policy_pct, 0 , 100); - - /* Normalize user input to [min_policy_pct, max_policy_pct] */ - limits->min_perf_pct = max(limits->min_policy_pct, - limits->min_sysfs_pct); - limits->min_perf_pct = min(limits->max_policy_pct, - limits->min_perf_pct); - limits->max_perf_pct = min(limits->max_policy_pct, - limits->max_sysfs_pct); - limits->max_perf_pct = max(limits->min_policy_pct, - limits->max_perf_pct); - - /* Make sure min_perf_pct <= max_perf_pct */ - limits->min_perf_pct = min(limits->max_perf_pct, limits->min_perf_pct); + if (!perf_limits) { + limits = &powersave_limits; + perf_limits = limits; + } - limits->min_perf = div_fp(limits->min_perf_pct, 100); - limits->max_perf = div_fp(limits->max_perf_pct, 100); - limits->max_perf = round_up(limits->max_perf, FRAC_BITS); + } + intel_pstate_update_perf_limits(policy, perf_limits); out: if (cpu->policy == CPUFREQ_POLICY_PERFORMANCE) { /* @@ -1619,6 +2032,8 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) intel_pstate_hwp_set_policy(policy); + mutex_unlock(&intel_pstate_limits_lock); + return 0; } @@ -1633,22 +2048,32 @@ static int intel_pstate_verify_policy(struct cpufreq_policy *policy) return 0; } +static void intel_cpufreq_stop_cpu(struct cpufreq_policy *policy) +{ + intel_pstate_set_min_pstate(all_cpu_data[policy->cpu]); +} + static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) { - int cpu_num = policy->cpu; - struct cpudata *cpu = all_cpu_data[cpu_num]; + pr_debug("CPU %d exiting\n", policy->cpu); - pr_debug("CPU %d exiting\n", cpu_num); + intel_pstate_clear_update_util_hook(policy->cpu); + if (hwp_active) + intel_pstate_hwp_save_state(policy); + else + intel_cpufreq_stop_cpu(policy); +} - intel_pstate_clear_update_util_hook(cpu_num); +static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) +{ + intel_pstate_exit_perf_limits(policy); - if (hwp_active) - return; + policy->fast_switch_possible = false; - intel_pstate_set_min_pstate(cpu); + return 0; } -static int intel_pstate_cpu_init(struct cpufreq_policy *policy) +static int __intel_pstate_cpu_init(struct cpufreq_policy *policy) { struct cpudata *cpu; int rc; @@ -1659,10 +2084,13 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy) cpu = all_cpu_data[policy->cpu]; - if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100) - policy->policy = CPUFREQ_POLICY_PERFORMANCE; - else - policy->policy = CPUFREQ_POLICY_POWERSAVE; + /* + * We need sane value in the cpu->perf_limits, so inherit from global + * perf_limits limits, which are seeded with values based on the + * CONFIG_CPU_FREQ_DEFAULT_GOV_*, during boot up. + */ + if (per_cpu_limits) + memcpy(cpu->perf_limits, limits, sizeof(struct perf_limits)); policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; @@ -1675,24 +2103,35 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy) policy->cpuinfo.max_freq *= cpu->pstate.scaling; intel_pstate_init_acpi_perf_limits(policy); - policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; cpumask_set_cpu(policy->cpu, policy->cpus); + policy->fast_switch_possible = true; + return 0; } -static int intel_pstate_cpu_exit(struct cpufreq_policy *policy) +static int intel_pstate_cpu_init(struct cpufreq_policy *policy) { - intel_pstate_exit_perf_limits(policy); + int ret = __intel_pstate_cpu_init(policy); + + if (ret) + return ret; + + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + if (limits->min_perf_pct == 100 && limits->max_perf_pct == 100) + policy->policy = CPUFREQ_POLICY_PERFORMANCE; + else + policy->policy = CPUFREQ_POLICY_POWERSAVE; return 0; } -static struct cpufreq_driver intel_pstate_driver = { +static struct cpufreq_driver intel_pstate = { .flags = CPUFREQ_CONST_LOOPS, .verify = intel_pstate_verify_policy, .setpolicy = intel_pstate_set_policy, - .resume = intel_pstate_hwp_set_policy, + .suspend = intel_pstate_hwp_save_state, + .resume = intel_pstate_resume, .get = intel_pstate_get, .init = intel_pstate_cpu_init, .exit = intel_pstate_cpu_exit, @@ -1700,6 +2139,118 @@ static struct cpufreq_driver intel_pstate_driver = { .name = "intel_pstate", }; +static int intel_cpufreq_verify_policy(struct cpufreq_policy *policy) +{ + struct cpudata *cpu = all_cpu_data[policy->cpu]; + struct perf_limits *perf_limits = limits; + + update_turbo_state(); + policy->cpuinfo.max_freq = limits->turbo_disabled ? + cpu->pstate.max_freq : cpu->pstate.turbo_freq; + + cpufreq_verify_within_cpu_limits(policy); + + if (per_cpu_limits) + perf_limits = cpu->perf_limits; + + intel_pstate_update_perf_limits(policy, perf_limits); + + return 0; +} + +static unsigned int intel_cpufreq_turbo_update(struct cpudata *cpu, + struct cpufreq_policy *policy, + unsigned int target_freq) +{ + unsigned int max_freq; + + update_turbo_state(); + + max_freq = limits->no_turbo || limits->turbo_disabled ? + cpu->pstate.max_freq : cpu->pstate.turbo_freq; + policy->cpuinfo.max_freq = max_freq; + if (policy->max > max_freq) + policy->max = max_freq; + + if (target_freq > max_freq) + target_freq = max_freq; + + return target_freq; +} + +static int intel_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct cpudata *cpu = all_cpu_data[policy->cpu]; + struct cpufreq_freqs freqs; + int target_pstate; + + freqs.old = policy->cur; + freqs.new = intel_cpufreq_turbo_update(cpu, policy, target_freq); + + cpufreq_freq_transition_begin(policy, &freqs); + switch (relation) { + case CPUFREQ_RELATION_L: + target_pstate = DIV_ROUND_UP(freqs.new, cpu->pstate.scaling); + break; + case CPUFREQ_RELATION_H: + target_pstate = freqs.new / cpu->pstate.scaling; + break; + default: + target_pstate = DIV_ROUND_CLOSEST(freqs.new, cpu->pstate.scaling); + break; + } + target_pstate = intel_pstate_prepare_request(cpu, target_pstate); + if (target_pstate != cpu->pstate.current_pstate) { + cpu->pstate.current_pstate = target_pstate; + wrmsrl_on_cpu(policy->cpu, MSR_IA32_PERF_CTL, + pstate_funcs.get_val(cpu, target_pstate)); + } + cpufreq_freq_transition_end(policy, &freqs, false); + + return 0; +} + +static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy, + unsigned int target_freq) +{ + struct cpudata *cpu = all_cpu_data[policy->cpu]; + int target_pstate; + + target_freq = intel_cpufreq_turbo_update(cpu, policy, target_freq); + target_pstate = DIV_ROUND_UP(target_freq, cpu->pstate.scaling); + intel_pstate_update_pstate(cpu, target_pstate); + return target_freq; +} + +static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + int ret = __intel_pstate_cpu_init(policy); + + if (ret) + return ret; + + policy->cpuinfo.transition_latency = INTEL_CPUFREQ_TRANSITION_LATENCY; + /* This reflects the intel_pstate_get_cpu_pstates() setting. */ + policy->cur = policy->cpuinfo.min_freq; + + return 0; +} + +static struct cpufreq_driver intel_cpufreq = { + .flags = CPUFREQ_CONST_LOOPS, + .verify = intel_cpufreq_verify_policy, + .target = intel_cpufreq_target, + .fast_switch = intel_cpufreq_fast_switch, + .init = intel_cpufreq_cpu_init, + .exit = intel_pstate_cpu_exit, + .stop_cpu = intel_cpufreq_stop_cpu, + .name = "intel_cpufreq", +}; + +static struct cpufreq_driver *intel_pstate_driver = &intel_pstate; + static int no_load __initdata; static int no_hwp __initdata; static int hwp_only __initdata; @@ -1726,6 +2277,19 @@ static void __init copy_pid_params(struct pstate_adjust_policy *policy) pid_params.setpoint = policy->setpoint; } +#ifdef CONFIG_ACPI +static void intel_pstate_use_acpi_profile(void) +{ + if (acpi_gbl_FADT.preferred_profile == PM_MOBILE) + pstate_funcs.get_target_pstate = + get_target_pstate_use_cpu_load; +} +#else +static void intel_pstate_use_acpi_profile(void) +{ +} +#endif + static void __init copy_cpu_funcs(struct pstate_funcs *funcs) { pstate_funcs.get_max = funcs->get_max; @@ -1737,6 +2301,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs *funcs) pstate_funcs.get_vid = funcs->get_vid; pstate_funcs.get_target_pstate = funcs->get_target_pstate; + intel_pstate_use_acpi_profile(); } #ifdef CONFIG_ACPI @@ -1850,9 +2415,20 @@ static bool __init intel_pstate_platform_pwr_mgmt_exists(void) return false; } + +static void intel_pstate_request_control_from_smm(void) +{ + /* + * It may be unsafe to request P-states control from SMM if _PPC support + * has not been enabled. + */ + if (acpi_ppc) + acpi_processor_pstate_control(); +} #else /* CONFIG_ACPI not enabled */ static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; } static inline bool intel_pstate_has_acpi_ppc(void) { return false; } +static inline void intel_pstate_request_control_from_smm(void) {} #endif /* CONFIG_ACPI */ static const struct x86_cpu_id hwp_support_ids[] __initconst = { @@ -1872,6 +2448,7 @@ static int __init intel_pstate_init(void) if (x86_match_cpu(hwp_support_ids) && !no_hwp) { copy_cpu_funcs(&core_params.funcs); hwp_active++; + intel_pstate.attr = hwp_cpufreq_attrs; goto hwp_cpu_matched; } @@ -1904,7 +2481,9 @@ hwp_cpu_matched: if (!hwp_active && hwp_only) goto out; - rc = cpufreq_register_driver(&intel_pstate_driver); + intel_pstate_request_control_from_smm(); + + rc = cpufreq_register_driver(intel_pstate_driver); if (rc) goto out; @@ -1919,7 +2498,9 @@ out: get_online_cpus(); for_each_online_cpu(cpu) { if (all_cpu_data[cpu]) { - intel_pstate_clear_update_util_hook(cpu); + if (intel_pstate_driver == &intel_pstate) + intel_pstate_clear_update_util_hook(cpu); + kfree(all_cpu_data[cpu]); } } @@ -1935,8 +2516,13 @@ static int __init intel_pstate_setup(char *str) if (!str) return -EINVAL; - if (!strcmp(str, "disable")) + if (!strcmp(str, "disable")) { no_load = 1; + } else if (!strcmp(str, "passive")) { + pr_info("Passive mode enabled\n"); + intel_pstate_driver = &intel_cpufreq; + no_hwp = 1; + } if (!strcmp(str, "no_hwp")) { pr_info("HWP disabled\n"); no_hwp = 1; @@ -1945,6 +2531,8 @@ static int __init intel_pstate_setup(char *str) force_load = 1; if (!strcmp(str, "hwp_only")) hwp_only = 1; + if (!strcmp(str, "per_cpu_perf_limits")) + per_cpu_limits = true; #ifdef CONFIG_ACPI if (!strcmp(str, "support_acpi_ppc")) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index d3ffde806629..37671b545880 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -42,6 +42,10 @@ #define PMSR_PSAFE_ENABLE (1UL << 30) #define PMSR_SPR_EM_DISABLE (1UL << 31) #define PMSR_MAX(x) ((x >> 32) & 0xFF) +#define LPSTATE_SHIFT 48 +#define GPSTATE_SHIFT 56 +#define GET_LPSTATE(x) (((x) >> LPSTATE_SHIFT) & 0xFF) +#define GET_GPSTATE(x) (((x) >> GPSTATE_SHIFT) & 0xFF) #define MAX_RAMP_DOWN_TIME 5120 /* @@ -592,7 +596,8 @@ void gpstate_timer_handler(unsigned long data) { struct cpufreq_policy *policy = (struct cpufreq_policy *)data; struct global_pstate_info *gpstates = policy->driver_data; - int gpstate_idx; + int gpstate_idx, lpstate_idx; + unsigned long val; unsigned int time_diff = jiffies_to_msecs(jiffies) - gpstates->last_sampled_time; struct powernv_smp_call_data freq_data; @@ -600,21 +605,37 @@ void gpstate_timer_handler(unsigned long data) if (!spin_trylock(&gpstates->gpstate_lock)) return; + /* + * If PMCR was last updated was using fast_swtich then + * We may have wrong in gpstate->last_lpstate_idx + * value. Hence, read from PMCR to get correct data. + */ + val = get_pmspr(SPRN_PMCR); + freq_data.gpstate_id = (s8)GET_GPSTATE(val); + freq_data.pstate_id = (s8)GET_LPSTATE(val); + if (freq_data.gpstate_id == freq_data.pstate_id) { + reset_gpstates(policy); + spin_unlock(&gpstates->gpstate_lock); + return; + } + gpstates->last_sampled_time += time_diff; gpstates->elapsed_time += time_diff; - freq_data.pstate_id = idx_to_pstate(gpstates->last_lpstate_idx); - if ((gpstates->last_gpstate_idx == gpstates->last_lpstate_idx) || - (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) { + if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) { gpstate_idx = pstate_to_idx(freq_data.pstate_id); + lpstate_idx = gpstate_idx; reset_gpstates(policy); gpstates->highest_lpstate_idx = gpstate_idx; } else { + lpstate_idx = pstate_to_idx(freq_data.pstate_id); gpstate_idx = calc_global_pstate(gpstates->elapsed_time, gpstates->highest_lpstate_idx, - gpstates->last_lpstate_idx); + lpstate_idx); } - + freq_data.gpstate_id = idx_to_pstate(gpstate_idx); + gpstates->last_gpstate_idx = gpstate_idx; + gpstates->last_lpstate_idx = lpstate_idx; /* * If local pstate is equal to global pstate, rampdown is over * So timer is not required to be queued. @@ -622,10 +643,6 @@ void gpstate_timer_handler(unsigned long data) if (gpstate_idx != gpstates->last_lpstate_idx) queue_gpstate_timer(gpstates); - freq_data.gpstate_id = idx_to_pstate(gpstate_idx); - gpstates->last_gpstate_idx = pstate_to_idx(freq_data.gpstate_id); - gpstates->last_lpstate_idx = pstate_to_idx(freq_data.pstate_id); - spin_unlock(&gpstates->gpstate_lock); /* Timer may get migrated to a different cpu on cpu hot unplug */ @@ -647,8 +664,14 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy, if (unlikely(rebooting) && new_index != get_nominal_index()) return 0; - if (!throttled) + if (!throttled) { + /* we don't want to be preempted while + * checking if the CPU frequency has been throttled + */ + preempt_disable(); powernv_cpufreq_throttle_check(NULL); + preempt_enable(); + } cur_msec = jiffies_to_msecs(get_jiffies_64()); @@ -752,9 +775,12 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) spin_lock_init(&gpstates->gpstate_lock); ret = cpufreq_table_validate_and_show(policy, powernv_freqs); - if (ret < 0) + if (ret < 0) { kfree(policy->driver_data); + return ret; + } + policy->fast_switch_possible = true; return ret; } @@ -897,6 +923,20 @@ static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy) del_timer_sync(&gpstates->timer); } +static unsigned int powernv_fast_switch(struct cpufreq_policy *policy, + unsigned int target_freq) +{ + int index; + struct powernv_smp_call_data freq_data; + + index = cpufreq_table_find_index_dl(policy, target_freq); + freq_data.pstate_id = powernv_freqs[index].driver_data; + freq_data.gpstate_id = powernv_freqs[index].driver_data; + set_pstate(&freq_data); + + return powernv_freqs[index].frequency; +} + static struct cpufreq_driver powernv_cpufreq_driver = { .name = "powernv-cpufreq", .flags = CPUFREQ_CONST_LOOPS, @@ -904,6 +944,7 @@ static struct cpufreq_driver powernv_cpufreq_driver = { .exit = powernv_cpufreq_cpu_exit, .verify = cpufreq_generic_frequency_table_verify, .target_index = powernv_cpufreq_target_index, + .fast_switch = powernv_fast_switch, .get = powernv_cpufreq_get, .stop_cpu = powernv_cpufreq_stop_cpu, .attr = powernv_cpu_freq_attr, diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 7fe442ca38f4..0835a37a5f3a 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -22,7 +22,7 @@ #define POWERNV_THRESHOLD_LATENCY_NS 200000 -struct cpuidle_driver powernv_idle_driver = { +static struct cpuidle_driver powernv_idle_driver = { .name = "powernv_idle", .owner = THIS_MODULE, }; diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c73207abb5a4..62810ff3b00f 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -97,7 +97,23 @@ static int find_deepest_state(struct cpuidle_driver *drv, return ret; } -#ifdef CONFIG_SUSPEND +/** + * cpuidle_use_deepest_state - Set/clear governor override flag. + * @enable: New value of the flag. + * + * Set/unset the current CPU to use the deepest idle state (override governors + * going forward if set). + */ +void cpuidle_use_deepest_state(bool enable) +{ + struct cpuidle_device *dev; + + preempt_disable(); + dev = cpuidle_get_device(); + dev->use_deepest_state = enable; + preempt_enable(); +} + /** * cpuidle_find_deepest_state - Find the deepest available idle state. * @drv: cpuidle driver for the given CPU. @@ -109,6 +125,7 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return find_deepest_state(drv, dev, UINT_MAX, 0, false); } +#ifdef CONFIG_SUSPEND static void enter_freeze_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { diff --git a/drivers/cpuidle/dt_idle_states.c b/drivers/cpuidle/dt_idle_states.c index a5c111b67f37..ffca4fc0061d 100644 --- a/drivers/cpuidle/dt_idle_states.c +++ b/drivers/cpuidle/dt_idle_states.c @@ -38,6 +38,12 @@ static int init_state_node(struct cpuidle_state *idle_state, * state enter function. */ idle_state->enter = match_id->data; + /* + * Since this is not a "coupled" state, it's safe to assume interrupts + * won't be enabled when it exits allowing the tick to be frozen + * safely. So enter() can be also enter_freeze() callback. + */ + idle_state->enter_freeze = match_id->data; err = of_property_read_u32(state_node, "wakeup-latency-us", &idle_state->exit_latency); diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c index fb9f511cca23..4e78263e34a4 100644 --- a/drivers/cpuidle/governor.c +++ b/drivers/cpuidle/governor.c @@ -9,7 +9,6 @@ */ #include <linux/mutex.h> -#include <linux/module.h> #include <linux/cpuidle.h> #include "cpuidle.h" @@ -53,14 +52,11 @@ int cpuidle_switch_governor(struct cpuidle_governor *gov) if (cpuidle_curr_governor) { list_for_each_entry(dev, &cpuidle_detected_devices, device_list) cpuidle_disable_device(dev); - module_put(cpuidle_curr_governor->owner); } cpuidle_curr_governor = gov; if (gov) { - if (!try_module_get(cpuidle_curr_governor->owner)) - return -EINVAL; list_for_each_entry(dev, &cpuidle_detected_devices, device_list) cpuidle_enable_device(dev); cpuidle_install_idle_handler(); diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c index 63bd5a403e22..fe8f08948fcb 100644 --- a/drivers/cpuidle/governors/ladder.c +++ b/drivers/cpuidle/governors/ladder.c @@ -15,7 +15,6 @@ #include <linux/kernel.h> #include <linux/cpuidle.h> #include <linux/pm_qos.h> -#include <linux/module.h> #include <linux/jiffies.h> #include <linux/tick.h> @@ -177,7 +176,6 @@ static struct cpuidle_governor ladder_governor = { .enable = ladder_enable_device, .select = ladder_select_state, .reflect = ladder_reflect, - .owner = THIS_MODULE, }; /** diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 03d38c291de6..d9b5b9398a0f 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -19,7 +19,6 @@ #include <linux/tick.h> #include <linux/sched.h> #include <linux/math64.h> -#include <linux/module.h> /* * Please note when changing the tuning values: @@ -484,7 +483,6 @@ static struct cpuidle_governor menu_governor = { .enable = menu_enable_device, .select = menu_select, .reflect = menu_reflect, - .owner = THIS_MODULE, }; /** diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index 832a2c3f01ff..c5adc8c9ac43 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -403,8 +403,10 @@ static int cpuidle_add_state_sysfs(struct cpuidle_device *device) /* state statistics */ for (i = 0; i < drv->state_count; i++) { kobj = kzalloc(sizeof(struct cpuidle_state_kobj), GFP_KERNEL); - if (!kobj) + if (!kobj) { + ret = -ENOMEM; goto error_state; + } kobj->state = &drv->states[i]; kobj->state_usage = &device->states_usage[i]; init_completion(&kobj->kobj_unregister); diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index bf3ea7603a58..a324801d6a66 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -850,7 +850,7 @@ err_out: EXPORT_SYMBOL(devfreq_add_governor); /** - * devfreq_remove_device() - Remove devfreq feature from a device. + * devfreq_remove_governor() - Remove devfreq feature from a device. * @governor: the devfreq governor to be removed */ int devfreq_remove_governor(struct devfreq_governor *governor) diff --git a/drivers/devfreq/event/exynos-nocp.c b/drivers/devfreq/event/exynos-nocp.c index 49e712aca0c1..5c3e7b11e8a6 100644 --- a/drivers/devfreq/event/exynos-nocp.c +++ b/drivers/devfreq/event/exynos-nocp.c @@ -190,6 +190,7 @@ static const struct of_device_id exynos_nocp_id_match[] = { { .compatible = "samsung,exynos5420-nocp", }, { /* sentinel */ }, }; +MODULE_DEVICE_TABLE(of, exynos_nocp_id_match); static struct regmap_config exynos_nocp_regmap_config = { .reg_bits = 32, diff --git a/drivers/devfreq/event/exynos-ppmu.c b/drivers/devfreq/event/exynos-ppmu.c index f55cf0eb2a66..107eb91a9415 100644 --- a/drivers/devfreq/event/exynos-ppmu.c +++ b/drivers/devfreq/event/exynos-ppmu.c @@ -15,7 +15,6 @@ #include <linux/io.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/mutex.h> #include <linux/of_address.h> #include <linux/platform_device.h> #include <linux/suspend.h> @@ -34,7 +33,6 @@ struct exynos_ppmu { unsigned int num_events; struct device *dev; - struct mutex lock; struct exynos_ppmu_data ppmu; }; @@ -90,8 +88,6 @@ struct __exynos_ppmu_events { PPMU_EVENT(d1-cpu), PPMU_EVENT(d1-general), PPMU_EVENT(d1-rt), - - { /* sentinel */ }, }; static int exynos_ppmu_find_ppmu_id(struct devfreq_event_dev *edev) @@ -351,6 +347,7 @@ static const struct of_device_id exynos_ppmu_id_match[] = { }, { /* sentinel */ }, }; +MODULE_DEVICE_TABLE(of, exynos_ppmu_id_match); static struct devfreq_event_ops *exynos_bus_get_ops(struct device_node *np) { @@ -463,7 +460,6 @@ static int exynos_ppmu_probe(struct platform_device *pdev) if (!info) return -ENOMEM; - mutex_init(&info->lock); info->dev = &pdev->dev; /* Parse dt data to get resource */ diff --git a/drivers/devfreq/event/rockchip-dfi.c b/drivers/devfreq/event/rockchip-dfi.c index 43fcc5a7f515..22b113363ffc 100644 --- a/drivers/devfreq/event/rockchip-dfi.c +++ b/drivers/devfreq/event/rockchip-dfi.c @@ -188,6 +188,7 @@ static const struct of_device_id rockchip_dfi_id_match[] = { { .compatible = "rockchip,rk3399-dfi" }, { }, }; +MODULE_DEVICE_TABLE(of, rockchip_dfi_id_match); static int rockchip_dfi_probe(struct platform_device *pdev) { diff --git a/drivers/devfreq/exynos-bus.c b/drivers/devfreq/exynos-bus.c index 29866f7e6d7e..a8ed7792ece2 100644 --- a/drivers/devfreq/exynos-bus.c +++ b/drivers/devfreq/exynos-bus.c @@ -35,7 +35,7 @@ struct exynos_bus { unsigned int edev_count; struct mutex lock; - struct dev_pm_opp *curr_opp; + unsigned long curr_freq; struct regulator *regulator; struct clk *clk; @@ -99,7 +99,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags) { struct exynos_bus *bus = dev_get_drvdata(dev); struct dev_pm_opp *new_opp; - unsigned long old_freq, new_freq, old_volt, new_volt, tol; + unsigned long old_freq, new_freq, new_volt, tol; int ret = 0; /* Get new opp-bus instance according to new bus clock */ @@ -113,8 +113,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags) new_freq = dev_pm_opp_get_freq(new_opp); new_volt = dev_pm_opp_get_voltage(new_opp); - old_freq = dev_pm_opp_get_freq(bus->curr_opp); - old_volt = dev_pm_opp_get_voltage(bus->curr_opp); + old_freq = bus->curr_freq; rcu_read_unlock(); if (old_freq == new_freq) @@ -146,7 +145,7 @@ static int exynos_bus_target(struct device *dev, unsigned long *freq, u32 flags) goto out; } } - bus->curr_opp = new_opp; + bus->curr_freq = new_freq; dev_dbg(dev, "Set the frequency of bus (%lukHz -> %lukHz)\n", old_freq/1000, new_freq/1000); @@ -163,9 +162,7 @@ static int exynos_bus_get_dev_status(struct device *dev, struct devfreq_event_data edata; int ret; - rcu_read_lock(); - stat->current_frequency = dev_pm_opp_get_freq(bus->curr_opp); - rcu_read_unlock(); + stat->current_frequency = bus->curr_freq; ret = exynos_bus_get_event(bus, &edata); if (ret < 0) { @@ -226,7 +223,7 @@ static int exynos_bus_passive_target(struct device *dev, unsigned long *freq, } new_freq = dev_pm_opp_get_freq(new_opp); - old_freq = dev_pm_opp_get_freq(bus->curr_opp); + old_freq = bus->curr_freq; rcu_read_unlock(); if (old_freq == new_freq) @@ -242,7 +239,7 @@ static int exynos_bus_passive_target(struct device *dev, unsigned long *freq, } *freq = new_freq; - bus->curr_opp = new_opp; + bus->curr_freq = new_freq; dev_dbg(dev, "Set the frequency of bus (%lukHz -> %lukHz)\n", old_freq/1000, new_freq/1000); @@ -335,6 +332,7 @@ static int exynos_bus_parse_of(struct device_node *np, struct exynos_bus *bus) { struct device *dev = bus->dev; + struct dev_pm_opp *opp; unsigned long rate; int ret; @@ -352,22 +350,23 @@ static int exynos_bus_parse_of(struct device_node *np, } /* Get the freq and voltage from OPP table to scale the bus freq */ - rcu_read_lock(); ret = dev_pm_opp_of_add_table(dev); if (ret < 0) { dev_err(dev, "failed to get OPP table\n"); - rcu_read_unlock(); goto err_clk; } rate = clk_get_rate(bus->clk); - bus->curr_opp = devfreq_recommended_opp(dev, &rate, 0); - if (IS_ERR(bus->curr_opp)) { + + rcu_read_lock(); + opp = devfreq_recommended_opp(dev, &rate, 0); + if (IS_ERR(opp)) { dev_err(dev, "failed to find dev_pm_opp\n"); rcu_read_unlock(); - ret = PTR_ERR(bus->curr_opp); + ret = PTR_ERR(opp); goto err_opp; } + bus->curr_freq = dev_pm_opp_get_freq(opp); rcu_read_unlock(); return 0; diff --git a/drivers/devfreq/rk3399_dmc.c b/drivers/devfreq/rk3399_dmc.c index e24b73d66659..27d2f349b53c 100644 --- a/drivers/devfreq/rk3399_dmc.c +++ b/drivers/devfreq/rk3399_dmc.c @@ -80,7 +80,6 @@ struct rk3399_dmcfreq { struct regulator *vdd_center; unsigned long rate, target_rate; unsigned long volt, target_volt; - struct dev_pm_opp *curr_opp; }; static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq, @@ -102,9 +101,6 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq, target_rate = dev_pm_opp_get_freq(opp); target_volt = dev_pm_opp_get_voltage(opp); - dmcfreq->rate = dev_pm_opp_get_freq(dmcfreq->curr_opp); - dmcfreq->volt = dev_pm_opp_get_voltage(dmcfreq->curr_opp); - rcu_read_unlock(); if (dmcfreq->rate == target_rate) @@ -165,7 +161,9 @@ static int rk3399_dmcfreq_target(struct device *dev, unsigned long *freq, if (err) dev_err(dev, "Cannot to set vol %lu uV\n", target_volt); - dmcfreq->curr_opp = opp; + dmcfreq->rate = target_rate; + dmcfreq->volt = target_volt; + out: mutex_unlock(&dmcfreq->lock); return err; @@ -414,7 +412,6 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev) */ if (dev_pm_opp_of_add_table(dev)) { dev_err(dev, "Invalid operating-points in device tree.\n"); - rcu_read_unlock(); return -EINVAL; } @@ -431,12 +428,13 @@ static int rk3399_dmcfreq_probe(struct platform_device *pdev) rcu_read_unlock(); return PTR_ERR(opp); } + data->rate = dev_pm_opp_get_freq(opp); + data->volt = dev_pm_opp_get_voltage(opp); rcu_read_unlock(); - data->curr_opp = opp; rk3399_devfreq_dmc_profile.initial_freq = data->rate; - data->devfreq = devfreq_add_device(dev, + data->devfreq = devm_devfreq_add_device(dev, &rk3399_devfreq_dmc_profile, "simple_ondemand", &data->ondemand_data); @@ -454,6 +452,7 @@ static const struct of_device_id rk3399dmc_devfreq_of_match[] = { { .compatible = "rockchip,rk3399-dmc" }, { }, }; +MODULE_DEVICE_TABLE(of, rk3399dmc_devfreq_of_match); static struct platform_driver rk3399_dmcfreq_driver = { .probe = rk3399_dmcfreq_probe, diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 4466a2f969d7..7d8ea3d5fda6 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -98,8 +98,6 @@ static int intel_idle(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); static void intel_idle_freeze(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); -static int intel_idle_cpu_init(int cpu); - static struct cpuidle_state *cpuidle_state_table; /* @@ -724,6 +722,50 @@ static struct cpuidle_state atom_cstates[] = { { .enter = NULL } }; +static struct cpuidle_state tangier_cstates[] = { + { + .name = "C1-TNG", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 4, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C4-TNG", + .desc = "MWAIT 0x30", + .flags = MWAIT2flg(0x30) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 100, + .target_residency = 400, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C6-TNG", + .desc = "MWAIT 0x52", + .flags = MWAIT2flg(0x52) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 140, + .target_residency = 560, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C7-TNG", + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 1200, + .target_residency = 4000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .name = "C9-TNG", + .desc = "MWAIT 0x64", + .flags = MWAIT2flg(0x64) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 10000, + .target_residency = 20000, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze, }, + { + .enter = NULL } +}; static struct cpuidle_state avn_cstates[] = { { .name = "C1-AVN", @@ -907,51 +949,15 @@ static void intel_idle_freeze(struct cpuidle_device *dev, mwait_idle_with_hints(eax, ecx); } -static void __setup_broadcast_timer(void *arg) +static void __setup_broadcast_timer(bool on) { - unsigned long on = (unsigned long)arg; - if (on) tick_broadcast_enable(); else tick_broadcast_disable(); } -static int cpu_hotplug_notify(struct notifier_block *n, - unsigned long action, void *hcpu) -{ - int hotcpu = (unsigned long)hcpu; - struct cpuidle_device *dev; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - - if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) - smp_call_function_single(hotcpu, __setup_broadcast_timer, - (void *)true, 1); - - /* - * Some systems can hotplug a cpu at runtime after - * the kernel has booted, we have to initialize the - * driver in this case - */ - dev = per_cpu_ptr(intel_idle_cpuidle_devices, hotcpu); - if (dev->registered) - break; - - if (intel_idle_cpu_init(hotcpu)) - return NOTIFY_BAD; - - break; - } - return NOTIFY_OK; -} - -static struct notifier_block cpu_hotplug_notifier = { - .notifier_call = cpu_hotplug_notify, -}; - -static void auto_demotion_disable(void *dummy) +static void auto_demotion_disable(void) { unsigned long long msr_bits; @@ -959,7 +965,7 @@ static void auto_demotion_disable(void *dummy) msr_bits &= ~(icpu->auto_demotion_disable_flags); wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits); } -static void c1e_promotion_disable(void *dummy) +static void c1e_promotion_disable(void) { unsigned long long msr_bits; @@ -978,6 +984,10 @@ static const struct idle_cpu idle_cpu_atom = { .state_table = atom_cstates, }; +static const struct idle_cpu idle_cpu_tangier = { + .state_table = tangier_cstates, +}; + static const struct idle_cpu idle_cpu_lincroft = { .state_table = atom_cstates, .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE, @@ -1066,6 +1076,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(INTEL_FAM6_SANDYBRIDGE_X, idle_cpu_snb), ICPU(INTEL_FAM6_ATOM_CEDARVIEW, idle_cpu_atom), ICPU(INTEL_FAM6_ATOM_SILVERMONT1, idle_cpu_byt), + ICPU(INTEL_FAM6_ATOM_MERRIFIELD, idle_cpu_tangier), ICPU(INTEL_FAM6_ATOM_AIRMONT, idle_cpu_cht), ICPU(INTEL_FAM6_IVYBRIDGE, idle_cpu_ivb), ICPU(INTEL_FAM6_IVYBRIDGE_X, idle_cpu_ivt), @@ -1084,6 +1095,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(INTEL_FAM6_KABYLAKE_DESKTOP, idle_cpu_skl), ICPU(INTEL_FAM6_SKYLAKE_X, idle_cpu_skx), ICPU(INTEL_FAM6_XEON_PHI_KNL, idle_cpu_knl), + ICPU(INTEL_FAM6_XEON_PHI_KNM, idle_cpu_knl), ICPU(INTEL_FAM6_ATOM_GOLDMONT, idle_cpu_bxt), ICPU(INTEL_FAM6_ATOM_DENVERTON, idle_cpu_dnv), {} @@ -1373,12 +1385,11 @@ static void __init intel_idle_cpuidle_driver_init(void) * allocate, initialize, register cpuidle_devices * @cpu: cpu/core to initialize */ -static int intel_idle_cpu_init(int cpu) +static int intel_idle_cpu_init(unsigned int cpu) { struct cpuidle_device *dev; dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); - dev->cpu = cpu; if (cpuidle_register_device(dev)) { @@ -1387,17 +1398,36 @@ static int intel_idle_cpu_init(int cpu) } if (icpu->auto_demotion_disable_flags) - smp_call_function_single(cpu, auto_demotion_disable, NULL, 1); + auto_demotion_disable(); if (icpu->disable_promotion_to_c1e) - smp_call_function_single(cpu, c1e_promotion_disable, NULL, 1); + c1e_promotion_disable(); + + return 0; +} + +static int intel_idle_cpu_online(unsigned int cpu) +{ + struct cpuidle_device *dev; + + if (lapic_timer_reliable_states != LAPIC_TIMER_ALWAYS_RELIABLE) + __setup_broadcast_timer(true); + + /* + * Some systems can hotplug a cpu at runtime after + * the kernel has booted, we have to initialize the + * driver in this case + */ + dev = per_cpu_ptr(intel_idle_cpuidle_devices, cpu); + if (!dev->registered) + return intel_idle_cpu_init(cpu); return 0; } static int __init intel_idle_init(void) { - int retval, i; + int retval; /* Do not load intel_idle at all for now if idle= is passed */ if (boot_option_idle_override != IDLE_NO_OVERRIDE) @@ -1417,35 +1447,29 @@ static int __init intel_idle_init(void) struct cpuidle_driver *drv = cpuidle_get_driver(); printk(KERN_DEBUG PREFIX "intel_idle yielding to %s", drv ? drv->name : "none"); - free_percpu(intel_idle_cpuidle_devices); - return retval; + goto init_driver_fail; } - cpu_notifier_register_begin(); - - for_each_online_cpu(i) { - retval = intel_idle_cpu_init(i); - if (retval) { - intel_idle_cpuidle_devices_uninit(); - cpu_notifier_register_done(); - cpuidle_unregister_driver(&intel_idle_driver); - free_percpu(intel_idle_cpuidle_devices); - return retval; - } - } - __register_cpu_notifier(&cpu_hotplug_notifier); - if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */ lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE; - else - on_each_cpu(__setup_broadcast_timer, (void *)true, 1); - cpu_notifier_register_done(); + retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online", + intel_idle_cpu_online, NULL); + if (retval < 0) + goto hp_setup_fail; pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n", lapic_timer_reliable_states); return 0; + +hp_setup_fail: + intel_idle_cpuidle_devices_uninit(); + cpuidle_unregister_driver(&intel_idle_driver); +init_driver_fail: + free_percpu(intel_idle_cpuidle_devices); + return retval; + } device_initcall(intel_idle_init); diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c index c48fc0c4abd9..fa5ca0992be6 100644 --- a/drivers/net/ethernet/smsc/smsc911x.c +++ b/drivers/net/ethernet/smsc/smsc911x.c @@ -2585,6 +2585,9 @@ static int smsc911x_suspend(struct device *dev) PMT_CTRL_PM_MODE_D1_ | PMT_CTRL_WOL_EN_ | PMT_CTRL_ED_EN_ | PMT_CTRL_PME_EN_); + pm_runtime_disable(dev); + pm_runtime_set_suspended(dev); + return 0; } @@ -2594,6 +2597,9 @@ static int smsc911x_resume(struct device *dev) struct smsc911x_data *pdata = netdev_priv(ndev); unsigned int to = 100; + pm_runtime_enable(dev); + pm_runtime_resume(dev); + /* Note 3.11 from the datasheet: * "When the LAN9220 is in a power saving state, a write of any * data to the BYTE_TEST register will wake-up the device." diff --git a/drivers/power/avs/rockchip-io-domain.c b/drivers/power/avs/rockchip-io-domain.c index 01b6d3f9b8fb..56bce1908be2 100644 --- a/drivers/power/avs/rockchip-io-domain.c +++ b/drivers/power/avs/rockchip-io-domain.c @@ -143,7 +143,7 @@ static int rockchip_iodomain_notify(struct notifier_block *nb, if (ret && event == REGULATOR_EVENT_PRE_VOLTAGE_CHANGE) return NOTIFY_BAD; - dev_info(supply->iod->dev, "Setting to %d done\n", uV); + dev_dbg(supply->iod->dev, "Setting to %d done\n", uV); return NOTIFY_OK; } diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c index 243b233ff31b..9a25110c4a46 100644 --- a/drivers/powercap/intel_rapl.c +++ b/drivers/powercap/intel_rapl.c @@ -189,14 +189,13 @@ struct rapl_package { unsigned int time_unit; struct rapl_domain *domains; /* array of domains, sized at runtime */ struct powercap_zone *power_zone; /* keep track of parent zone */ - int nr_cpus; /* active cpus on the package, topology info is lost during - * cpu hotplug. so we have to track ourselves. - */ unsigned long power_limit_irq; /* keep track of package power limit * notify interrupt enable status. */ struct list_head plist; int lead_cpu; /* one active cpu per package for access */ + /* Track active cpus */ + struct cpumask cpumask; }; struct rapl_defaults { @@ -275,18 +274,6 @@ static struct rapl_package *find_package_by_id(int id) return NULL; } -/* caller must hold cpu hotplug lock */ -static void rapl_cleanup_data(void) -{ - struct rapl_package *p, *tmp; - - list_for_each_entry_safe(p, tmp, &rapl_packages, plist) { - kfree(p->domains); - list_del(&p->plist); - kfree(p); - } -} - static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw) { struct rapl_domain *rd; @@ -442,6 +429,7 @@ static int contraint_to_pl(struct rapl_domain *rd, int cid) return i; } } + pr_err("Cannot find matching power limit for constraint %d\n", cid); return -EINVAL; } @@ -457,6 +445,10 @@ static int set_power_limit(struct powercap_zone *power_zone, int cid, get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); id = contraint_to_pl(rd, cid); + if (id < 0) { + ret = id; + goto set_exit; + } rp = rd->rp; @@ -496,6 +488,11 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid, get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); id = contraint_to_pl(rd, cid); + if (id < 0) { + ret = id; + goto get_exit; + } + switch (rd->rpl[id].prim_id) { case PL1_ENABLE: prim = POWER_LIMIT1; @@ -512,6 +509,7 @@ static int get_current_power_limit(struct powercap_zone *power_zone, int cid, else *data = val; +get_exit: put_online_cpus(); return ret; @@ -527,6 +525,10 @@ static int set_time_window(struct powercap_zone *power_zone, int cid, get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); id = contraint_to_pl(rd, cid); + if (id < 0) { + ret = id; + goto set_time_exit; + } switch (rd->rpl[id].prim_id) { case PL1_ENABLE: @@ -538,6 +540,8 @@ static int set_time_window(struct powercap_zone *power_zone, int cid, default: ret = -EINVAL; } + +set_time_exit: put_online_cpus(); return ret; } @@ -552,6 +556,10 @@ static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data) get_online_cpus(); rd = power_zone_to_rapl_domain(power_zone); id = contraint_to_pl(rd, cid); + if (id < 0) { + ret = id; + goto get_time_exit; + } switch (rd->rpl[id].prim_id) { case PL1_ENABLE: @@ -566,6 +574,8 @@ static int get_time_window(struct powercap_zone *power_zone, int cid, u64 *data) } if (!ret) *data = val; + +get_time_exit: put_online_cpus(); return ret; @@ -707,7 +717,7 @@ static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type, case ENERGY_UNIT: scale = ENERGY_UNIT_SCALE; /* per domain unit takes precedence */ - if (rd && rd->domain_energy_unit) + if (rd->domain_energy_unit) units = rd->domain_energy_unit; else units = rp->energy_unit; @@ -976,10 +986,20 @@ static void package_power_limit_irq_save(struct rapl_package *rp) smp_call_function_single(rp->lead_cpu, power_limit_irq_save_cpu, rp, 1); } -static void power_limit_irq_restore_cpu(void *info) +/* + * Restore per package power limit interrupt enable state. Called from cpu + * hotplug code on package removal. + */ +static void package_power_limit_irq_restore(struct rapl_package *rp) { - u32 l, h = 0; - struct rapl_package *rp = (struct rapl_package *)info; + u32 l, h; + + if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) + return; + + /* irq enable state not saved, nothing to restore */ + if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) + return; rdmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); @@ -991,19 +1011,6 @@ static void power_limit_irq_restore_cpu(void *info) wrmsr_safe(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); } -/* restore per package power limit interrupt enable state */ -static void package_power_limit_irq_restore(struct rapl_package *rp) -{ - if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) - return; - - /* irq enable state not saved, nothing to restore */ - if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) - return; - - smp_call_function_single(rp->lead_cpu, power_limit_irq_restore_cpu, rp, 1); -} - static void set_floor_freq_default(struct rapl_domain *rd, bool mode) { int nr_powerlimit = find_nr_power_limit(rd); @@ -1160,84 +1167,49 @@ static const struct x86_cpu_id rapl_ids[] __initconst = { RAPL_CPU(INTEL_FAM6_ATOM_DENVERTON, rapl_defaults_core), RAPL_CPU(INTEL_FAM6_XEON_PHI_KNL, rapl_defaults_hsw_server), + RAPL_CPU(INTEL_FAM6_XEON_PHI_KNM, rapl_defaults_hsw_server), {} }; MODULE_DEVICE_TABLE(x86cpu, rapl_ids); -/* read once for all raw primitive data for all packages, domains */ -static void rapl_update_domain_data(void) +/* Read once for all raw primitive data for domains */ +static void rapl_update_domain_data(struct rapl_package *rp) { int dmn, prim; u64 val; - struct rapl_package *rp; - list_for_each_entry(rp, &rapl_packages, plist) { - for (dmn = 0; dmn < rp->nr_domains; dmn++) { - pr_debug("update package %d domain %s data\n", rp->id, - rp->domains[dmn].name); - /* exclude non-raw primitives */ - for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) - if (!rapl_read_data_raw(&rp->domains[dmn], prim, - rpi[prim].unit, - &val)) - rp->domains[dmn].rdd.primitives[prim] = - val; + for (dmn = 0; dmn < rp->nr_domains; dmn++) { + pr_debug("update package %d domain %s data\n", rp->id, + rp->domains[dmn].name); + /* exclude non-raw primitives */ + for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) { + if (!rapl_read_data_raw(&rp->domains[dmn], prim, + rpi[prim].unit, &val)) + rp->domains[dmn].rdd.primitives[prim] = val; } } } -static int rapl_unregister_powercap(void) +static void rapl_unregister_powercap(void) { - struct rapl_package *rp; - struct rapl_domain *rd, *rd_package = NULL; - - /* unregister all active rapl packages from the powercap layer, - * hotplug lock held - */ - list_for_each_entry(rp, &rapl_packages, plist) { - package_power_limit_irq_restore(rp); - - for (rd = rp->domains; rd < rp->domains + rp->nr_domains; - rd++) { - pr_debug("remove package, undo power limit on %d: %s\n", - rp->id, rd->name); - rapl_write_data_raw(rd, PL1_ENABLE, 0); - rapl_write_data_raw(rd, PL1_CLAMP, 0); - if (find_nr_power_limit(rd) > 1) { - rapl_write_data_raw(rd, PL2_ENABLE, 0); - rapl_write_data_raw(rd, PL2_CLAMP, 0); - } - if (rd->id == RAPL_DOMAIN_PACKAGE) { - rd_package = rd; - continue; - } - powercap_unregister_zone(control_type, &rd->power_zone); - } - /* do the package zone last */ - if (rd_package) - powercap_unregister_zone(control_type, - &rd_package->power_zone); - } - if (platform_rapl_domain) { powercap_unregister_zone(control_type, &platform_rapl_domain->power_zone); kfree(platform_rapl_domain); } - powercap_unregister_control_type(control_type); - - return 0; } static int rapl_package_register_powercap(struct rapl_package *rp) { struct rapl_domain *rd; - int ret = 0; char dev_name[17]; /* max domain name = 7 + 1 + 8 for int + 1 for null*/ struct powercap_zone *power_zone = NULL; - int nr_pl; + int nr_pl, ret;; + + /* Update the domain data of the new package */ + rapl_update_domain_data(rp); /* first we register package domain as the parent zone*/ for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { @@ -1257,8 +1229,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp) if (IS_ERR(power_zone)) { pr_debug("failed to register package, %d\n", rp->id); - ret = PTR_ERR(power_zone); - goto exit_package; + return PTR_ERR(power_zone); } /* track parent zone in per package/socket data */ rp->power_zone = power_zone; @@ -1268,8 +1239,7 @@ static int rapl_package_register_powercap(struct rapl_package *rp) } if (!power_zone) { pr_err("no package domain found, unknown topology!\n"); - ret = -ENODEV; - goto exit_package; + return -ENODEV; } /* now register domains as children of the socket/package*/ for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { @@ -1290,11 +1260,11 @@ static int rapl_package_register_powercap(struct rapl_package *rp) goto err_cleanup; } } + return 0; -exit_package: - return ret; err_cleanup: - /* clean up previously initialized domains within the package if we + /* + * Clean up previously initialized domains within the package if we * failed after the first domain setup. */ while (--rd >= rp->domains) { @@ -1305,7 +1275,7 @@ err_cleanup: return ret; } -static int rapl_register_psys(void) +static int __init rapl_register_psys(void) { struct rapl_domain *rd; struct powercap_zone *power_zone; @@ -1346,40 +1316,14 @@ static int rapl_register_psys(void) return 0; } -static int rapl_register_powercap(void) +static int __init rapl_register_powercap(void) { - struct rapl_domain *rd; - struct rapl_package *rp; - int ret = 0; - control_type = powercap_register_control_type(NULL, "intel-rapl", NULL); if (IS_ERR(control_type)) { pr_debug("failed to register powercap control_type.\n"); return PTR_ERR(control_type); } - /* read the initial data */ - rapl_update_domain_data(); - list_for_each_entry(rp, &rapl_packages, plist) - if (rapl_package_register_powercap(rp)) - goto err_cleanup_package; - - /* Don't bail out if PSys is not supported */ - rapl_register_psys(); - - return ret; - -err_cleanup_package: - /* clean up previously initialized packages */ - list_for_each_entry_continue_reverse(rp, &rapl_packages, plist) { - for (rd = rp->domains; rd < rp->domains + rp->nr_domains; - rd++) { - pr_debug("unregister zone/package %d, %s domain\n", - rp->id, rd->name); - powercap_unregister_zone(control_type, &rd->power_zone); - } - } - - return ret; + return 0; } static int rapl_check_domain(int cpu, int domain) @@ -1452,9 +1396,8 @@ static void rapl_detect_powerlimit(struct rapl_domain *rd) */ static int rapl_detect_domains(struct rapl_package *rp, int cpu) { - int i; - int ret = 0; struct rapl_domain *rd; + int i; for (i = 0; i < RAPL_DOMAIN_MAX; i++) { /* use physical package id to read counters */ @@ -1466,84 +1409,20 @@ static int rapl_detect_domains(struct rapl_package *rp, int cpu) rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX); if (!rp->nr_domains) { pr_debug("no valid rapl domains found in package %d\n", rp->id); - ret = -ENODEV; - goto done; + return -ENODEV; } pr_debug("found %d domains on package %d\n", rp->nr_domains, rp->id); rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain), GFP_KERNEL); - if (!rp->domains) { - ret = -ENOMEM; - goto done; - } + if (!rp->domains) + return -ENOMEM; + rapl_init_domains(rp); for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) rapl_detect_powerlimit(rd); - - -done: - return ret; -} - -static bool is_package_new(int package) -{ - struct rapl_package *rp; - - /* caller prevents cpu hotplug, there will be no new packages added - * or deleted while traversing the package list, no need for locking. - */ - list_for_each_entry(rp, &rapl_packages, plist) - if (package == rp->id) - return false; - - return true; -} - -/* RAPL interface can be made of a two-level hierarchy: package level and domain - * level. We first detect the number of packages then domains of each package. - * We have to consider the possiblity of CPU online/offline due to hotplug and - * other scenarios. - */ -static int rapl_detect_topology(void) -{ - int i; - int phy_package_id; - struct rapl_package *new_package, *rp; - - for_each_online_cpu(i) { - phy_package_id = topology_physical_package_id(i); - if (is_package_new(phy_package_id)) { - new_package = kzalloc(sizeof(*rp), GFP_KERNEL); - if (!new_package) { - rapl_cleanup_data(); - return -ENOMEM; - } - /* add the new package to the list */ - new_package->id = phy_package_id; - new_package->nr_cpus = 1; - /* use the first active cpu of the package to access */ - new_package->lead_cpu = i; - /* check if the package contains valid domains */ - if (rapl_detect_domains(new_package, i) || - rapl_defaults->check_unit(new_package, i)) { - kfree(new_package->domains); - kfree(new_package); - /* free up the packages already initialized */ - rapl_cleanup_data(); - return -ENODEV; - } - INIT_LIST_HEAD(&new_package->plist); - list_add(&new_package->plist, &rapl_packages); - } else { - rp = find_package_by_id(phy_package_id); - if (rp) - ++rp->nr_cpus; - } - } - return 0; } @@ -1552,12 +1431,21 @@ static void rapl_remove_package(struct rapl_package *rp) { struct rapl_domain *rd, *rd_package = NULL; + package_power_limit_irq_restore(rp); + for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { + rapl_write_data_raw(rd, PL1_ENABLE, 0); + rapl_write_data_raw(rd, PL1_CLAMP, 0); + if (find_nr_power_limit(rd) > 1) { + rapl_write_data_raw(rd, PL2_ENABLE, 0); + rapl_write_data_raw(rd, PL2_CLAMP, 0); + } if (rd->id == RAPL_DOMAIN_PACKAGE) { rd_package = rd; continue; } - pr_debug("remove package %d, %s domain\n", rp->id, rd->name); + pr_debug("remove package, undo power limit on %d: %s\n", + rp->id, rd->name); powercap_unregister_zone(control_type, &rd->power_zone); } /* do parent zone last */ @@ -1567,20 +1455,17 @@ static void rapl_remove_package(struct rapl_package *rp) } /* called from CPU hotplug notifier, hotplug lock held */ -static int rapl_add_package(int cpu) +static struct rapl_package *rapl_add_package(int cpu, int pkgid) { - int ret = 0; - int phy_package_id; struct rapl_package *rp; + int ret; - phy_package_id = topology_physical_package_id(cpu); rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL); if (!rp) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* add the new package to the list */ - rp->id = phy_package_id; - rp->nr_cpus = 1; + rp->id = pkgid; rp->lead_cpu = cpu; /* check if the package contains valid domains */ @@ -1589,17 +1474,17 @@ static int rapl_add_package(int cpu) ret = -ENODEV; goto err_free_package; } - if (!rapl_package_register_powercap(rp)) { + ret = rapl_package_register_powercap(rp); + if (!ret) { INIT_LIST_HEAD(&rp->plist); list_add(&rp->plist, &rapl_packages); - return ret; + return rp; } err_free_package: kfree(rp->domains); kfree(rp); - - return ret; + return ERR_PTR(ret); } /* Handles CPU hotplug on multi-socket systems. @@ -1609,55 +1494,46 @@ err_free_package: * associated domains. Cooling devices are handled accordingly at * per-domain level. */ -static int rapl_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int rapl_cpu_online(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; - int phy_package_id; + int pkgid = topology_physical_package_id(cpu); struct rapl_package *rp; - int lead_cpu; - phy_package_id = topology_physical_package_id(cpu); - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - rp = find_package_by_id(phy_package_id); - if (rp) - ++rp->nr_cpus; - else - rapl_add_package(cpu); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - rp = find_package_by_id(phy_package_id); - if (!rp) - break; - if (--rp->nr_cpus == 0) - rapl_remove_package(rp); - else if (cpu == rp->lead_cpu) { - /* choose another active cpu in the package */ - lead_cpu = cpumask_any_but(topology_core_cpumask(cpu), cpu); - if (lead_cpu < nr_cpu_ids) - rp->lead_cpu = lead_cpu; - else /* should never go here */ - pr_err("no active cpu available for package %d\n", - phy_package_id); - } + rp = find_package_by_id(pkgid); + if (!rp) { + rp = rapl_add_package(cpu, pkgid); + if (IS_ERR(rp)) + return PTR_ERR(rp); } + cpumask_set_cpu(cpu, &rp->cpumask); + return 0; +} + +static int rapl_cpu_down_prep(unsigned int cpu) +{ + int pkgid = topology_physical_package_id(cpu); + struct rapl_package *rp; + int lead_cpu; + + rp = find_package_by_id(pkgid); + if (!rp) + return 0; - return NOTIFY_OK; + cpumask_clear_cpu(cpu, &rp->cpumask); + lead_cpu = cpumask_first(&rp->cpumask); + if (lead_cpu >= nr_cpu_ids) + rapl_remove_package(rp); + else if (rp->lead_cpu == cpu) + rp->lead_cpu = lead_cpu; + return 0; } -static struct notifier_block rapl_cpu_notifier = { - .notifier_call = rapl_cpu_callback, -}; +static enum cpuhp_state pcap_rapl_online; static int __init rapl_init(void) { - int ret = 0; const struct x86_cpu_id *id; + int ret; id = x86_match_cpu(rapl_ids); if (!id) { @@ -1669,36 +1545,29 @@ static int __init rapl_init(void) rapl_defaults = (struct rapl_defaults *)id->driver_data; - cpu_notifier_register_begin(); - - /* prevent CPU hotplug during detection */ - get_online_cpus(); - ret = rapl_detect_topology(); + ret = rapl_register_powercap(); if (ret) - goto done; + return ret; - if (rapl_register_powercap()) { - rapl_cleanup_data(); - ret = -ENODEV; - goto done; - } - __register_hotcpu_notifier(&rapl_cpu_notifier); -done: - put_online_cpus(); - cpu_notifier_register_done(); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powercap/rapl:online", + rapl_cpu_online, rapl_cpu_down_prep); + if (ret < 0) + goto err_unreg; + pcap_rapl_online = ret; + + /* Don't bail out if PSys is not supported */ + rapl_register_psys(); + return 0; +err_unreg: + rapl_unregister_powercap(); return ret; } static void __exit rapl_exit(void) { - cpu_notifier_register_begin(); - get_online_cpus(); - __unregister_hotcpu_notifier(&rapl_cpu_notifier); + cpuhp_remove_state(pcap_rapl_online); rapl_unregister_powercap(); - rapl_cleanup_data(); - put_online_cpus(); - cpu_notifier_register_done(); } module_init(rapl_init); diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index 350cb5e22ff3..df64692e9e64 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -43,7 +43,6 @@ #include <linux/kernel.h> #include <linux/delay.h> #include <linux/kthread.h> -#include <linux/freezer.h> #include <linux/cpu.h> #include <linux/thermal.h> #include <linux/slab.h> @@ -85,11 +84,26 @@ static unsigned int control_cpu; /* The cpu assigned to collect stat and update */ static bool clamping; +static const struct sched_param sparam = { + .sched_priority = MAX_USER_RT_PRIO / 2, +}; +struct powerclamp_worker_data { + struct kthread_worker *worker; + struct kthread_work balancing_work; + struct kthread_delayed_work idle_injection_work; + unsigned int cpu; + unsigned int count; + unsigned int guard; + unsigned int window_size_now; + unsigned int target_ratio; + unsigned int duration_jiffies; + bool clamping; +}; -static struct task_struct * __percpu *powerclamp_thread; +static struct powerclamp_worker_data * __percpu worker_data; static struct thermal_cooling_device *cooling_dev; static unsigned long *cpu_clamping_mask; /* bit map for tracking per cpu - * clamping thread + * clamping kthread worker */ static unsigned int duration; @@ -261,11 +275,6 @@ static u64 pkg_state_counter(void) return count; } -static void noop_timer(unsigned long foo) -{ - /* empty... just the fact that we get the interrupt wakes us up */ -} - static unsigned int get_compensation(int ratio) { unsigned int comp = 0; @@ -367,103 +376,79 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio, return set_target_ratio + guard <= current_ratio; } -static int clamp_thread(void *arg) +static void clamp_balancing_func(struct kthread_work *work) { - int cpunr = (unsigned long)arg; - DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0); - static const struct sched_param param = { - .sched_priority = MAX_USER_RT_PRIO/2, - }; - unsigned int count = 0; - unsigned int target_ratio; + struct powerclamp_worker_data *w_data; + int sleeptime; + unsigned long target_jiffies; + unsigned int compensated_ratio; + int interval; /* jiffies to sleep for each attempt */ - set_bit(cpunr, cpu_clamping_mask); - set_freezable(); - init_timer_on_stack(&wakeup_timer); - sched_setscheduler(current, SCHED_FIFO, ¶m); - - while (true == clamping && !kthread_should_stop() && - cpu_online(cpunr)) { - int sleeptime; - unsigned long target_jiffies; - unsigned int guard; - unsigned int compensated_ratio; - int interval; /* jiffies to sleep for each attempt */ - unsigned int duration_jiffies = msecs_to_jiffies(duration); - unsigned int window_size_now; - - try_to_freeze(); - /* - * make sure user selected ratio does not take effect until - * the next round. adjust target_ratio if user has changed - * target such that we can converge quickly. - */ - target_ratio = set_target_ratio; - guard = 1 + target_ratio/20; - window_size_now = window_size; - count++; - - /* - * systems may have different ability to enter package level - * c-states, thus we need to compensate the injected idle ratio - * to achieve the actual target reported by the HW. - */ - compensated_ratio = target_ratio + - get_compensation(target_ratio); - if (compensated_ratio <= 0) - compensated_ratio = 1; - interval = duration_jiffies * 100 / compensated_ratio; - - /* align idle time */ - target_jiffies = roundup(jiffies, interval); - sleeptime = target_jiffies - jiffies; - if (sleeptime <= 0) - sleeptime = 1; - schedule_timeout_interruptible(sleeptime); - /* - * only elected controlling cpu can collect stats and update - * control parameters. - */ - if (cpunr == control_cpu && !(count%window_size_now)) { - should_skip = - powerclamp_adjust_controls(target_ratio, - guard, window_size_now); - smp_mb(); - } + w_data = container_of(work, struct powerclamp_worker_data, + balancing_work); - if (should_skip) - continue; - - target_jiffies = jiffies + duration_jiffies; - mod_timer(&wakeup_timer, target_jiffies); - if (unlikely(local_softirq_pending())) - continue; - /* - * stop tick sched during idle time, interrupts are still - * allowed. thus jiffies are updated properly. - */ - preempt_disable(); - /* mwait until target jiffies is reached */ - while (time_before(jiffies, target_jiffies)) { - unsigned long ecx = 1; - unsigned long eax = target_mwait; - - /* - * REVISIT: may call enter_idle() to notify drivers who - * can save power during cpu idle. same for exit_idle() - */ - local_touch_nmi(); - stop_critical_timings(); - mwait_idle_with_hints(eax, ecx); - start_critical_timings(); - atomic_inc(&idle_wakeup_counter); - } - preempt_enable(); + /* + * make sure user selected ratio does not take effect until + * the next round. adjust target_ratio if user has changed + * target such that we can converge quickly. + */ + w_data->target_ratio = READ_ONCE(set_target_ratio); + w_data->guard = 1 + w_data->target_ratio / 20; + w_data->window_size_now = window_size; + w_data->duration_jiffies = msecs_to_jiffies(duration); + w_data->count++; + + /* + * systems may have different ability to enter package level + * c-states, thus we need to compensate the injected idle ratio + * to achieve the actual target reported by the HW. + */ + compensated_ratio = w_data->target_ratio + + get_compensation(w_data->target_ratio); + if (compensated_ratio <= 0) + compensated_ratio = 1; + interval = w_data->duration_jiffies * 100 / compensated_ratio; + + /* align idle time */ + target_jiffies = roundup(jiffies, interval); + sleeptime = target_jiffies - jiffies; + if (sleeptime <= 0) + sleeptime = 1; + + if (clamping && w_data->clamping && cpu_online(w_data->cpu)) + kthread_queue_delayed_work(w_data->worker, + &w_data->idle_injection_work, + sleeptime); +} + +static void clamp_idle_injection_func(struct kthread_work *work) +{ + struct powerclamp_worker_data *w_data; + + w_data = container_of(work, struct powerclamp_worker_data, + idle_injection_work.work); + + /* + * only elected controlling cpu can collect stats and update + * control parameters. + */ + if (w_data->cpu == control_cpu && + !(w_data->count % w_data->window_size_now)) { + should_skip = + powerclamp_adjust_controls(w_data->target_ratio, + w_data->guard, + w_data->window_size_now); + smp_mb(); } - del_timer_sync(&wakeup_timer); - clear_bit(cpunr, cpu_clamping_mask); - return 0; + if (should_skip) + goto balance; + + play_idle(jiffies_to_msecs(w_data->duration_jiffies)); + +balance: + if (clamping && w_data->clamping && cpu_online(w_data->cpu)) + kthread_queue_work(w_data->worker, &w_data->balancing_work); } /* @@ -507,10 +492,60 @@ static void poll_pkg_cstate(struct work_struct *dummy) schedule_delayed_work(&poll_pkg_cstate_work, HZ); } +static void start_power_clamp_worker(unsigned long cpu) +{ + struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); + struct kthread_worker *worker; + + worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu); + if (IS_ERR(worker)) + return; + + w_data->worker = worker; + w_data->count = 0; + w_data->cpu = cpu; + w_data->clamping = true; + set_bit(cpu, cpu_clamping_mask); + sched_setscheduler(worker->task, SCHED_FIFO, &sparam); + kthread_init_work(&w_data->balancing_work, clamp_balancing_func); + kthread_init_delayed_work(&w_data->idle_injection_work, + clamp_idle_injection_func); + kthread_queue_work(w_data->worker, &w_data->balancing_work); +} + +static void stop_power_clamp_worker(unsigned long cpu) +{ + struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu); + + if (!w_data->worker) + return; + + w_data->clamping = false; + /* + * Make sure that all works that get queued after this point see + * the clamping disabled. The counter part is not needed because + * there is an implicit memory barrier when the queued work + * is proceed. + */ + smp_wmb(); + kthread_cancel_work_sync(&w_data->balancing_work); + kthread_cancel_delayed_work_sync(&w_data->idle_injection_work); + /* + * The balancing work still might be queued here because + * the handling of the "clapming" variable, cancel, and queue + * operations are not synchronized via a lock. But it is not + * a big deal. The balancing work is fast and destroy kthread + * will wait for it. + */ + clear_bit(w_data->cpu, cpu_clamping_mask); + kthread_destroy_worker(w_data->worker); + + w_data->worker = NULL; +} + static int start_power_clamp(void) { unsigned long cpu; - struct task_struct *thread; set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1); /* prevent cpu hotplug */ @@ -524,22 +559,9 @@ static int start_power_clamp(void) clamping = true; schedule_delayed_work(&poll_pkg_cstate_work, 0); - /* start one thread per online cpu */ + /* start one kthread worker per online cpu */ for_each_online_cpu(cpu) { - struct task_struct **p = - per_cpu_ptr(powerclamp_thread, cpu); - - thread = kthread_create_on_node(clamp_thread, - (void *) cpu, - cpu_to_node(cpu), - "kidle_inject/%ld", cpu); - /* bind to cpu here */ - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - wake_up_process(thread); - *p = thread; - } - + start_power_clamp_worker(cpu); } put_online_cpus(); @@ -549,71 +571,49 @@ static int start_power_clamp(void) static void end_power_clamp(void) { int i; - struct task_struct *thread; - clamping = false; /* - * make clamping visible to other cpus and give per cpu clamping threads - * sometime to exit, or gets killed later. + * Block requeuing in all the kthread workers. They will flush and + * stop faster. */ - smp_mb(); - msleep(20); + clamping = false; if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) { for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) { - pr_debug("clamping thread for cpu %d alive, kill\n", i); - thread = *per_cpu_ptr(powerclamp_thread, i); - kthread_stop(thread); + pr_debug("clamping worker for cpu %d alive, destroy\n", + i); + stop_power_clamp_worker(i); } } } -static int powerclamp_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int powerclamp_cpu_online(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; - struct task_struct *thread; - struct task_struct **percpu_thread = - per_cpu_ptr(powerclamp_thread, cpu); - - if (false == clamping) - goto exit_ok; - - switch (action) { - case CPU_ONLINE: - thread = kthread_create_on_node(clamp_thread, - (void *) cpu, - cpu_to_node(cpu), - "kidle_inject/%lu", cpu); - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - wake_up_process(thread); - *percpu_thread = thread; - } - /* prefer BSP as controlling CPU */ - if (cpu == 0) { - control_cpu = 0; - smp_mb(); - } - break; - case CPU_DEAD: - if (test_bit(cpu, cpu_clamping_mask)) { - pr_err("cpu %lu dead but powerclamping thread is not\n", - cpu); - kthread_stop(*percpu_thread); - } - if (cpu == control_cpu) { - control_cpu = smp_processor_id(); - smp_mb(); - } + if (clamping == false) + return 0; + start_power_clamp_worker(cpu); + /* prefer BSP as controlling CPU */ + if (cpu == 0) { + control_cpu = 0; + smp_mb(); } - -exit_ok: - return NOTIFY_OK; + return 0; } -static struct notifier_block powerclamp_cpu_notifier = { - .notifier_call = powerclamp_cpu_callback, -}; +static int powerclamp_cpu_predown(unsigned int cpu) +{ + if (clamping == false) + return 0; + + stop_power_clamp_worker(cpu); + if (cpu != control_cpu) + return 0; + + control_cpu = cpumask_first(cpu_online_mask); + if (control_cpu == cpu) + control_cpu = cpumask_next(cpu, cpu_online_mask); + smp_mb(); + return 0; +} static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, unsigned long *state) @@ -741,6 +741,8 @@ file_error: debugfs_remove_recursive(debug_dir); } +static enum cpuhp_state hp_state; + static int __init powerclamp_init(void) { int retval; @@ -758,10 +760,17 @@ static int __init powerclamp_init(void) /* set default limit, maybe adjusted during runtime based on feedback */ window_size = 2; - register_hotcpu_notifier(&powerclamp_cpu_notifier); + retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "thermal/intel_powerclamp:online", + powerclamp_cpu_online, + powerclamp_cpu_predown); + if (retval < 0) + goto exit_free; + + hp_state = retval; - powerclamp_thread = alloc_percpu(struct task_struct *); - if (!powerclamp_thread) { + worker_data = alloc_percpu(struct powerclamp_worker_data); + if (!worker_data) { retval = -ENOMEM; goto exit_unregister; } @@ -781,9 +790,9 @@ static int __init powerclamp_init(void) return 0; exit_free_thread: - free_percpu(powerclamp_thread); + free_percpu(worker_data); exit_unregister: - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); + cpuhp_remove_state_nocalls(hp_state); exit_free: kfree(cpu_clamping_mask); return retval; @@ -792,9 +801,9 @@ module_init(powerclamp_init); static void __exit powerclamp_exit(void) { - unregister_hotcpu_notifier(&powerclamp_cpu_notifier); end_power_clamp(); - free_percpu(powerclamp_thread); + cpuhp_remove_state_nocalls(hp_state); + free_percpu(worker_data); thermal_cooling_device_unregister(cooling_dev); kfree(cpu_clamping_mask); diff --git a/include/acpi/processor.h b/include/acpi/processor.h index f3db11c24654..c1ba00fc4888 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -249,6 +249,7 @@ extern int acpi_processor_register_performance(struct acpi_processor_performance *performance, unsigned int cpu); extern void acpi_processor_unregister_performance(unsigned int cpu); +int acpi_processor_pstate_control(void); /* note: this locks both the calling module and the processor module if a _PPC object exists, rmmod is disallowed then */ int acpi_processor_notify_smm(struct module *calling_module); @@ -294,7 +295,7 @@ static inline void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx #ifdef CONFIG_CPU_FREQ void acpi_processor_ppc_init(void); void acpi_processor_ppc_exit(void); -int acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag); +void acpi_processor_ppc_has_changed(struct acpi_processor *pr, int event_flag); extern int acpi_processor_get_bios_limit(int cpu, unsigned int *limit); #else static inline void acpi_processor_ppc_init(void) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index e571128ad99a..09807c2ce328 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -238,6 +238,8 @@ void arch_cpu_idle_dead(void); int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); +void play_idle(unsigned long duration_ms); + #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 32dc0cbd51ca..7e05c5e4e45c 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -175,7 +175,7 @@ void disable_cpufreq(void); u64 get_cpu_idle_time(unsigned int cpu, u64 *wall, int io_busy); int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); -int cpufreq_update_policy(unsigned int cpu); +void cpufreq_update_policy(unsigned int cpu); bool have_governor_per_policy(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); void cpufreq_enable_fast_switch(struct cpufreq_policy *policy); @@ -234,6 +234,10 @@ __ATTR(_name, _perm, show_##_name, NULL) static struct freq_attr _name = \ __ATTR(_name, 0644, show_##_name, store_##_name) +#define cpufreq_freq_attr_wo(_name) \ +static struct freq_attr _name = \ +__ATTR(_name, 0200, NULL, store_##_name) + struct global_attr { struct attribute attr; ssize_t (*show)(struct kobject *kobj, diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index bb31373c3478..da346f2817a8 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -74,6 +74,7 @@ struct cpuidle_driver_kobj; struct cpuidle_device { unsigned int registered:1; unsigned int enabled:1; + unsigned int use_deepest_state:1; unsigned int cpu; int last_residency; @@ -192,11 +193,12 @@ static inline struct cpuidle_driver *cpuidle_get_cpu_driver( static inline struct cpuidle_device *cpuidle_get_device(void) {return NULL; } #endif -#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_SUSPEND) +#ifdef CONFIG_CPU_IDLE extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev); extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev); +extern void cpuidle_use_deepest_state(bool enable); #else static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, struct cpuidle_device *dev) @@ -204,6 +206,9 @@ static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } +static inline void cpuidle_use_deepest_state(bool enable) +{ +} #endif /* kernel/sched/idle.c */ @@ -235,8 +240,6 @@ struct cpuidle_governor { int (*select) (struct cpuidle_driver *drv, struct cpuidle_device *dev); void (*reflect) (struct cpuidle_device *dev, int index); - - struct module *owner; }; #ifdef CONFIG_CPU_IDLE diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index a09fe5c009c8..81ece61075df 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -15,11 +15,11 @@ #include <linux/err.h> #include <linux/of.h> #include <linux/notifier.h> +#include <linux/spinlock.h> /* Defines used for the flags field in the struct generic_pm_domain */ #define GENPD_FLAG_PM_CLK (1U << 0) /* PM domain uses PM clk */ - -#define GENPD_MAX_NUM_STATES 8 /* Number of possible low power states */ +#define GENPD_FLAG_IRQ_SAFE (1U << 1) /* PM domain operates in atomic */ enum gpd_status { GPD_STATE_ACTIVE = 0, /* PM domain is active */ @@ -40,15 +40,18 @@ struct gpd_dev_ops { struct genpd_power_state { s64 power_off_latency_ns; s64 power_on_latency_ns; + s64 residency_ns; + struct fwnode_handle *fwnode; }; +struct genpd_lock_ops; + struct generic_pm_domain { struct dev_pm_domain domain; /* PM domain operations */ struct list_head gpd_list_node; /* Node in the global PM domains list */ struct list_head master_links; /* Links with PM domain as a master */ struct list_head slave_links; /* Links with PM domain as a slave */ struct list_head dev_list; /* List of devices */ - struct mutex lock; struct dev_power_governor *gov; struct work_struct power_off_work; struct fwnode_handle *provider; /* Identity of the domain provider */ @@ -70,9 +73,18 @@ struct generic_pm_domain { void (*detach_dev)(struct generic_pm_domain *domain, struct device *dev); unsigned int flags; /* Bit field of configs for genpd */ - struct genpd_power_state states[GENPD_MAX_NUM_STATES]; + struct genpd_power_state *states; unsigned int state_count; /* number of states */ unsigned int state_idx; /* state that genpd will go to when off */ + void *free; /* Free the state that was allocated for default */ + const struct genpd_lock_ops *lock_ops; + union { + struct mutex mlock; + struct { + spinlock_t slock; + unsigned long lock_flags; + }; + }; }; @@ -205,6 +217,8 @@ extern int of_genpd_add_device(struct of_phandle_args *args, extern int of_genpd_add_subdomain(struct of_phandle_args *parent, struct of_phandle_args *new_subdomain); extern struct generic_pm_domain *of_genpd_remove_last(struct device_node *np); +extern int of_genpd_parse_idle_states(struct device_node *dn, + struct genpd_power_state **states, int *n); int genpd_dev_pm_attach(struct device *dev); #else /* !CONFIG_PM_GENERIC_DOMAINS_OF */ @@ -234,6 +248,12 @@ static inline int of_genpd_add_subdomain(struct of_phandle_args *parent, return -ENODEV; } +static inline int of_genpd_parse_idle_states(struct device_node *dn, + struct genpd_power_state **states, int *n) +{ + return -ENODEV; +} + static inline int genpd_dev_pm_attach(struct device *dev) { return -ENODEV; diff --git a/include/linux/pm_opp.h b/include/linux/pm_opp.h index bca26157f5b6..0edd88f93904 100644 --- a/include/linux/pm_opp.h +++ b/include/linux/pm_opp.h @@ -17,13 +17,65 @@ #include <linux/err.h> #include <linux/notifier.h> +struct clk; +struct regulator; struct dev_pm_opp; struct device; +struct opp_table; enum dev_pm_opp_event { OPP_EVENT_ADD, OPP_EVENT_REMOVE, OPP_EVENT_ENABLE, OPP_EVENT_DISABLE, }; +/** + * struct dev_pm_opp_supply - Power supply voltage/current values + * @u_volt: Target voltage in microvolts corresponding to this OPP + * @u_volt_min: Minimum voltage in microvolts corresponding to this OPP + * @u_volt_max: Maximum voltage in microvolts corresponding to this OPP + * @u_amp: Maximum current drawn by the device in microamperes + * + * This structure stores the voltage/current values for a single power supply. + */ +struct dev_pm_opp_supply { + unsigned long u_volt; + unsigned long u_volt_min; + unsigned long u_volt_max; + unsigned long u_amp; +}; + +/** + * struct dev_pm_opp_info - OPP freq/voltage/current values + * @rate: Target clk rate in hz + * @supplies: Array of voltage/current values for all power supplies + * + * This structure stores the freq/voltage/current values for a single OPP. + */ +struct dev_pm_opp_info { + unsigned long rate; + struct dev_pm_opp_supply *supplies; +}; + +/** + * struct dev_pm_set_opp_data - Set OPP data + * @old_opp: Old OPP info + * @new_opp: New OPP info + * @regulators: Array of regulator pointers + * @regulator_count: Number of regulators + * @clk: Pointer to clk + * @dev: Pointer to the struct device + * + * This structure contains all information required for setting an OPP. + */ +struct dev_pm_set_opp_data { + struct dev_pm_opp_info old_opp; + struct dev_pm_opp_info new_opp; + + struct regulator **regulators; + unsigned int regulator_count; + struct clk *clk; + struct device *dev; +}; + #if defined(CONFIG_PM_OPP) unsigned long dev_pm_opp_get_voltage(struct dev_pm_opp *opp); @@ -62,8 +114,10 @@ int dev_pm_opp_set_supported_hw(struct device *dev, const u32 *versions, void dev_pm_opp_put_supported_hw(struct device *dev); int dev_pm_opp_set_prop_name(struct device *dev, const char *name); void dev_pm_opp_put_prop_name(struct device *dev); -int dev_pm_opp_set_regulator(struct device *dev, const char *name); -void dev_pm_opp_put_regulator(struct device *dev); +struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count); +void dev_pm_opp_put_regulators(struct opp_table *opp_table); +int dev_pm_opp_register_set_opp_helper(struct device *dev, int (*set_opp)(struct dev_pm_set_opp_data *data)); +void dev_pm_opp_register_put_opp_helper(struct device *dev); int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq); int dev_pm_opp_set_sharing_cpus(struct device *cpu_dev, const struct cpumask *cpumask); int dev_pm_opp_get_sharing_cpus(struct device *cpu_dev, struct cpumask *cpumask); @@ -163,6 +217,14 @@ static inline int dev_pm_opp_set_supported_hw(struct device *dev, static inline void dev_pm_opp_put_supported_hw(struct device *dev) {} +static inline int dev_pm_opp_register_set_opp_helper(struct device *dev, + int (*set_opp)(struct dev_pm_set_opp_data *data)) +{ + return -ENOTSUPP; +} + +static inline void dev_pm_opp_register_put_opp_helper(struct device *dev) {} + static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name) { return -ENOTSUPP; @@ -170,12 +232,12 @@ static inline int dev_pm_opp_set_prop_name(struct device *dev, const char *name) static inline void dev_pm_opp_put_prop_name(struct device *dev) {} -static inline int dev_pm_opp_set_regulator(struct device *dev, const char *name) +static inline struct opp_table *dev_pm_opp_set_regulators(struct device *dev, const char * const names[], unsigned int count) { - return -ENOTSUPP; + return ERR_PTR(-ENOTSUPP); } -static inline void dev_pm_opp_put_regulator(struct device *dev) {} +static inline void dev_pm_opp_put_regulators(struct opp_table *opp_table) {} static inline int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) { diff --git a/include/linux/pm_runtime.h b/include/linux/pm_runtime.h index 2e14d2667b6c..4957fc185ea9 100644 --- a/include/linux/pm_runtime.h +++ b/include/linux/pm_runtime.h @@ -61,12 +61,6 @@ static inline void pm_suspend_ignore_children(struct device *dev, bool enable) dev->power.ignore_children = enable; } -static inline bool pm_children_suspended(struct device *dev) -{ - return dev->power.ignore_children - || !atomic_read(&dev->power.child_count); -} - static inline void pm_runtime_get_noresume(struct device *dev) { atomic_inc(&dev->power.usage_count); @@ -162,7 +156,6 @@ static inline void pm_runtime_allow(struct device *dev) {} static inline void pm_runtime_forbid(struct device *dev) {} static inline void pm_suspend_ignore_children(struct device *dev, bool enable) {} -static inline bool pm_children_suspended(struct device *dev) { return false; } static inline void pm_runtime_get_noresume(struct device *dev) {} static inline void pm_runtime_put_noidle(struct device *dev) {} static inline bool device_run_wake(struct device *dev) { return false; } @@ -265,9 +258,9 @@ static inline int pm_runtime_set_active(struct device *dev) return __pm_runtime_set_status(dev, RPM_ACTIVE); } -static inline void pm_runtime_set_suspended(struct device *dev) +static inline int pm_runtime_set_suspended(struct device *dev) { - __pm_runtime_set_status(dev, RPM_SUSPENDED); + return __pm_runtime_set_status(dev, RPM_SUSPENDED); } static inline void pm_runtime_disable(struct device *dev) diff --git a/include/linux/sched.h b/include/linux/sched.h index 0e90f2973719..5ccbbfe41345 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2287,6 +2287,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, /* * Per process flags */ +#define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ @@ -2648,7 +2649,7 @@ extern struct task_struct *idle_task(int cpu); */ static inline bool is_idle_task(const struct task_struct *p) { - return p->pid == 0; + return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d9718378a8be..0c729c3c8549 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -194,6 +194,8 @@ struct platform_freeze_ops { }; #ifdef CONFIG_SUSPEND +extern suspend_state_t mem_sleep_default; + /** * suspend_set_ops - set platform dependent suspend operations * @ops: The new suspend operations to set. diff --git a/kernel/fork.c b/kernel/fork.c index 7377f414f3ce..a439ac429669 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1544,7 +1544,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ - p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); + p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); diff --git a/kernel/power/main.c b/kernel/power/main.c index 281a697fd458..d401c21136d1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -78,6 +78,78 @@ static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(pm_async); +#ifdef CONFIG_SUSPEND +static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + char *s = buf; + suspend_state_t i; + + for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) + if (mem_sleep_states[i]) { + const char *label = mem_sleep_states[i]; + + if (mem_sleep_current == i) + s += sprintf(s, "[%s] ", label); + else + s += sprintf(s, "%s ", label); + } + + /* Convert the last space to a newline if needed. */ + if (s != buf) + *(s-1) = '\n'; + + return (s - buf); +} + +static suspend_state_t decode_suspend_state(const char *buf, size_t n) +{ + suspend_state_t state; + char *p; + int len; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) { + const char *label = mem_sleep_states[state]; + + if (label && len == strlen(label) && !strncmp(buf, label, len)) + return state; + } + + return PM_SUSPEND_ON; +} + +static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + state = decode_suspend_state(buf, n); + if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) + mem_sleep_current = state; + else + error = -EINVAL; + + out: + pm_autosleep_unlock(); + return error ? error : n; +} + +power_attr(mem_sleep); +#endif /* CONFIG_SUSPEND */ + #ifdef CONFIG_PM_DEBUG int pm_test_level = TEST_NONE; @@ -368,12 +440,16 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, } state = decode_state(buf, n); - if (state < PM_SUSPEND_MAX) + if (state < PM_SUSPEND_MAX) { + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + error = pm_suspend(state); - else if (state == PM_SUSPEND_MAX) + } else if (state == PM_SUSPEND_MAX) { error = hibernate(); - else + } else { error = -EINVAL; + } out: pm_autosleep_unlock(); @@ -485,6 +561,9 @@ static ssize_t autosleep_store(struct kobject *kobj, && strcmp(buf, "off") && strcmp(buf, "off\n")) return -EINVAL; + if (state == PM_SUSPEND_MEM) + state = mem_sleep_current; + error = pm_autosleep_set_state(state); return error ? error : n; } @@ -602,6 +681,9 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, +#ifdef CONFIG_SUSPEND + &mem_sleep_attr.attr, +#endif #ifdef CONFIG_PM_AUTOSLEEP &autosleep_attr.attr, #endif diff --git a/kernel/power/power.h b/kernel/power/power.h index 56d1d0dedf76..1dfa0da827d3 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -189,11 +189,15 @@ extern void swsusp_show_speed(ktime_t, ktime_t, unsigned int, char *); #ifdef CONFIG_SUSPEND /* kernel/power/suspend.c */ -extern const char *pm_labels[]; +extern const char * const pm_labels[]; extern const char *pm_states[]; +extern const char *mem_sleep_states[]; +extern suspend_state_t mem_sleep_current; extern int suspend_devices_and_enter(suspend_state_t state); #else /* !CONFIG_SUSPEND */ +#define mem_sleep_current PM_SUSPEND_ON + static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6ccb08f57fcb..f67ceb7768b8 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -32,8 +32,21 @@ #include "power.h" -const char *pm_labels[] = { "mem", "standby", "freeze", NULL }; +const char * const pm_labels[] = { + [PM_SUSPEND_FREEZE] = "freeze", + [PM_SUSPEND_STANDBY] = "standby", + [PM_SUSPEND_MEM] = "mem", +}; const char *pm_states[PM_SUSPEND_MAX]; +static const char * const mem_sleep_labels[] = { + [PM_SUSPEND_FREEZE] = "s2idle", + [PM_SUSPEND_STANDBY] = "shallow", + [PM_SUSPEND_MEM] = "deep", +}; +const char *mem_sleep_states[PM_SUSPEND_MAX]; + +suspend_state_t mem_sleep_current = PM_SUSPEND_FREEZE; +suspend_state_t mem_sleep_default = PM_SUSPEND_MAX; unsigned int pm_suspend_global_flags; EXPORT_SYMBOL_GPL(pm_suspend_global_flags); @@ -110,30 +123,32 @@ static bool valid_state(suspend_state_t state) return suspend_ops && suspend_ops->valid && suspend_ops->valid(state); } -/* - * If this is set, the "mem" label always corresponds to the deepest sleep state - * available, the "standby" label corresponds to the second deepest sleep state - * available (if any), and the "freeze" label corresponds to the remaining - * available sleep state (if there is one). - */ -static bool relative_states; - void __init pm_states_init(void) { + /* "mem" and "freeze" are always present in /sys/power/state. */ + pm_states[PM_SUSPEND_MEM] = pm_labels[PM_SUSPEND_MEM]; + pm_states[PM_SUSPEND_FREEZE] = pm_labels[PM_SUSPEND_FREEZE]; /* - * freeze state should be supported even without any suspend_ops, - * initialize pm_states accordingly here + * Suspend-to-idle should be supported even without any suspend_ops, + * initialize mem_sleep_states[] accordingly here. */ - pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2]; + mem_sleep_states[PM_SUSPEND_FREEZE] = mem_sleep_labels[PM_SUSPEND_FREEZE]; } -static int __init sleep_states_setup(char *str) +static int __init mem_sleep_default_setup(char *str) { - relative_states = !strncmp(str, "1", 1); + suspend_state_t state; + + for (state = PM_SUSPEND_FREEZE; state <= PM_SUSPEND_MEM; state++) + if (mem_sleep_labels[state] && + !strcmp(str, mem_sleep_labels[state])) { + mem_sleep_default = state; + break; + } + return 1; } - -__setup("relative_sleep_states=", sleep_states_setup); +__setup("mem_sleep_default=", mem_sleep_default_setup); /** * suspend_set_ops - Set the global suspend method table. @@ -141,21 +156,21 @@ __setup("relative_sleep_states=", sleep_states_setup); */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - suspend_state_t i; - int j = 0; - lock_system_sleep(); suspend_ops = ops; - for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--) - if (valid_state(i)) { - pm_states[i] = pm_labels[j++]; - } else if (!relative_states) { - pm_states[i] = NULL; - j++; - } - pm_states[PM_SUSPEND_FREEZE] = pm_labels[j]; + if (valid_state(PM_SUSPEND_STANDBY)) { + mem_sleep_states[PM_SUSPEND_STANDBY] = mem_sleep_labels[PM_SUSPEND_STANDBY]; + pm_states[PM_SUSPEND_STANDBY] = pm_labels[PM_SUSPEND_STANDBY]; + if (mem_sleep_default == PM_SUSPEND_STANDBY) + mem_sleep_current = PM_SUSPEND_STANDBY; + } + if (valid_state(PM_SUSPEND_MEM)) { + mem_sleep_states[PM_SUSPEND_MEM] = mem_sleep_labels[PM_SUSPEND_MEM]; + if (mem_sleep_default >= PM_SUSPEND_MEM) + mem_sleep_current = PM_SUSPEND_MEM; + } unlock_system_sleep(); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d18804491d9f..966556ebdbb3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5280,6 +5280,7 @@ void init_idle(struct task_struct *idle, int cpu) __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + idle->flags |= PF_IDLE; kasan_unpoison_task_stack(idle); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 69e06898997d..fd4659313640 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -12,11 +12,14 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpufreq.h> +#include <linux/kthread.h> #include <linux/slab.h> #include <trace/events/power.h> #include "sched.h" +#define SUGOV_KTHREAD_PRIORITY 50 + struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -35,8 +38,10 @@ struct sugov_policy { /* The next fields are only needed if fast switch cannot be used. */ struct irq_work irq_work; - struct work_struct work; + struct kthread_work work; struct mutex work_lock; + struct kthread_worker worker; + struct task_struct *thread; bool work_in_progress; bool need_freq_update; @@ -291,7 +296,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, raw_spin_unlock(&sg_policy->update_lock); } -static void sugov_work(struct work_struct *work) +static void sugov_work(struct kthread_work *work) { struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work); @@ -308,7 +313,21 @@ static void sugov_irq_work(struct irq_work *irq_work) struct sugov_policy *sg_policy; sg_policy = container_of(irq_work, struct sugov_policy, irq_work); - schedule_work_on(smp_processor_id(), &sg_policy->work); + + /* + * For RT and deadline tasks, the schedutil governor shoots the + * frequency to maximum. Special care must be taken to ensure that this + * kthread doesn't result in the same behavior. + * + * This is (mostly) guaranteed by the work_in_progress flag. The flag is + * updated only at the end of the sugov_work() function and before that + * the schedutil governor rejects all other frequency scaling requests. + * + * There is a very rare case though, where the RT thread yields right + * after the work_in_progress flag is cleared. The effects of that are + * neglected for now. + */ + kthread_queue_work(&sg_policy->worker, &sg_policy->work); } /************************** sysfs interface ************************/ @@ -371,19 +390,64 @@ static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy) return NULL; sg_policy->policy = policy; - init_irq_work(&sg_policy->irq_work, sugov_irq_work); - INIT_WORK(&sg_policy->work, sugov_work); - mutex_init(&sg_policy->work_lock); raw_spin_lock_init(&sg_policy->update_lock); return sg_policy; } static void sugov_policy_free(struct sugov_policy *sg_policy) { - mutex_destroy(&sg_policy->work_lock); kfree(sg_policy); } +static int sugov_kthread_create(struct sugov_policy *sg_policy) +{ + struct task_struct *thread; + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; + + /* kthread only required for slow path */ + if (policy->fast_switch_enabled) + return 0; + + kthread_init_work(&sg_policy->work, sugov_work); + kthread_init_worker(&sg_policy->worker); + thread = kthread_create(kthread_worker_fn, &sg_policy->worker, + "sugov:%d", + cpumask_first(policy->related_cpus)); + if (IS_ERR(thread)) { + pr_err("failed to create sugov thread: %ld\n", PTR_ERR(thread)); + return PTR_ERR(thread); + } + + ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + return ret; + } + + sg_policy->thread = thread; + kthread_bind_mask(thread, policy->related_cpus); + init_irq_work(&sg_policy->irq_work, sugov_irq_work); + mutex_init(&sg_policy->work_lock); + + wake_up_process(thread); + + return 0; +} + +static void sugov_kthread_stop(struct sugov_policy *sg_policy) +{ + /* kthread only required for slow path */ + if (sg_policy->policy->fast_switch_enabled) + return; + + kthread_flush_worker(&sg_policy->worker); + kthread_stop(sg_policy->thread); + mutex_destroy(&sg_policy->work_lock); +} + static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy) { struct sugov_tunables *tunables; @@ -416,16 +480,24 @@ static int sugov_init(struct cpufreq_policy *policy) if (policy->governor_data) return -EBUSY; + cpufreq_enable_fast_switch(policy); + sg_policy = sugov_policy_alloc(policy); - if (!sg_policy) - return -ENOMEM; + if (!sg_policy) { + ret = -ENOMEM; + goto disable_fast_switch; + } + + ret = sugov_kthread_create(sg_policy); + if (ret) + goto free_sg_policy; mutex_lock(&global_tunables_lock); if (global_tunables) { if (WARN_ON(have_governor_per_policy())) { ret = -EINVAL; - goto free_sg_policy; + goto stop_kthread; } policy->governor_data = sg_policy; sg_policy->tunables = global_tunables; @@ -437,7 +509,7 @@ static int sugov_init(struct cpufreq_policy *policy) tunables = sugov_tunables_alloc(sg_policy); if (!tunables) { ret = -ENOMEM; - goto free_sg_policy; + goto stop_kthread; } tunables->rate_limit_us = LATENCY_MULTIPLIER; @@ -454,20 +526,25 @@ static int sugov_init(struct cpufreq_policy *policy) if (ret) goto fail; - out: +out: mutex_unlock(&global_tunables_lock); - - cpufreq_enable_fast_switch(policy); return 0; - fail: +fail: policy->governor_data = NULL; sugov_tunables_free(tunables); - free_sg_policy: +stop_kthread: + sugov_kthread_stop(sg_policy); + +free_sg_policy: mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); + +disable_fast_switch: + cpufreq_disable_fast_switch(policy); + pr_err("initialization failed (error %d)\n", ret); return ret; } @@ -478,8 +555,6 @@ static void sugov_exit(struct cpufreq_policy *policy) struct sugov_tunables *tunables = sg_policy->tunables; unsigned int count; - cpufreq_disable_fast_switch(policy); - mutex_lock(&global_tunables_lock); count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook); @@ -489,7 +564,9 @@ static void sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); + sugov_kthread_stop(sg_policy); sugov_policy_free(sg_policy); + cpufreq_disable_fast_switch(policy); } static int sugov_start(struct cpufreq_policy *policy) @@ -535,8 +612,10 @@ static void sugov_stop(struct cpufreq_policy *policy) synchronize_sched(); - irq_work_sync(&sg_policy->irq_work); - cancel_work_sync(&sg_policy->work); + if (!policy->fast_switch_enabled) { + irq_work_sync(&sg_policy->irq_work); + kthread_cancel_work_sync(&sg_policy->work); + } } static void sugov_limits(struct cpufreq_policy *policy) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1d8718d5300d..6a4bae0a649d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -164,11 +164,14 @@ static void cpuidle_idle_call(void) * timekeeping to prevent timer interrupts from kicking us out of idle * until a proper wakeup interrupt happens. */ - if (idle_should_freeze()) { - entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state > 0) { - local_irq_enable(); - goto exit_idle; + + if (idle_should_freeze() || dev->use_deepest_state) { + if (idle_should_freeze()) { + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state > 0) { + local_irq_enable(); + goto exit_idle; + } } next_state = cpuidle_find_deepest_state(drv, dev); @@ -202,76 +205,65 @@ exit_idle: * * Called with polling cleared. */ -static void cpu_idle_loop(void) +static void do_idle(void) { - int cpu = smp_processor_id(); - - while (1) { - /* - * If the arch has a polling bit, we maintain an invariant: - * - * Our polling bit is clear if we're not scheduled (i.e. if - * rq->curr != rq->idle). This means that, if rq->idle has - * the polling bit set, then setting need_resched is - * guaranteed to cause the cpu to reschedule. - */ - - __current_set_polling(); - quiet_vmstat(); - tick_nohz_idle_enter(); + /* + * If the arch has a polling bit, we maintain an invariant: + * + * Our polling bit is clear if we're not scheduled (i.e. if rq->curr != + * rq->idle). This means that, if rq->idle has the polling bit set, + * then setting need_resched is guaranteed to cause the CPU to + * reschedule. + */ - while (!need_resched()) { - check_pgt_cache(); - rmb(); + __current_set_polling(); + tick_nohz_idle_enter(); - if (cpu_is_offline(cpu)) { - cpuhp_report_idle_dead(); - arch_cpu_idle_dead(); - } + while (!need_resched()) { + check_pgt_cache(); + rmb(); - local_irq_disable(); - arch_cpu_idle_enter(); - - /* - * In poll mode we reenable interrupts and spin. - * - * Also if we detected in the wakeup from idle - * path that the tick broadcast device expired - * for us, we don't want to go deep idle as we - * know that the IPI is going to arrive right - * away - */ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) - cpu_idle_poll(); - else - cpuidle_idle_call(); - - arch_cpu_idle_exit(); + if (cpu_is_offline(smp_processor_id())) { + cpuhp_report_idle_dead(); + arch_cpu_idle_dead(); } - /* - * Since we fell out of the loop above, we know - * TIF_NEED_RESCHED must be set, propagate it into - * PREEMPT_NEED_RESCHED. - * - * This is required because for polling idle loops we will - * not have had an IPI to fold the state for us. - */ - preempt_set_need_resched(); - tick_nohz_idle_exit(); - __current_clr_polling(); + local_irq_disable(); + arch_cpu_idle_enter(); /* - * We promise to call sched_ttwu_pending and reschedule - * if need_resched is set while polling is set. That - * means that clearing polling needs to be visible - * before doing these things. + * In poll mode we reenable interrupts and spin. Also if we + * detected in the wakeup from idle path that the tick + * broadcast device expired for us, we don't want to go deep + * idle as we know that the IPI is going to arrive right away. */ - smp_mb__after_atomic(); - - sched_ttwu_pending(); - schedule_preempt_disabled(); + if (cpu_idle_force_poll || tick_check_broadcast_expired()) + cpu_idle_poll(); + else + cpuidle_idle_call(); + arch_cpu_idle_exit(); } + + /* + * Since we fell out of the loop above, we know TIF_NEED_RESCHED must + * be set, propagate it into PREEMPT_NEED_RESCHED. + * + * This is required because for polling idle loops we will not have had + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); + tick_nohz_idle_exit(); + __current_clr_polling(); + + /* + * We promise to call sched_ttwu_pending() and reschedule if + * need_resched() is set while polling is set. That means that clearing + * polling needs to be visible before doing these things. + */ + smp_mb__after_atomic(); + + sched_ttwu_pending(); + schedule_preempt_disabled(); } bool cpu_in_idle(unsigned long pc) @@ -280,6 +272,56 @@ bool cpu_in_idle(unsigned long pc) pc < (unsigned long)__cpuidle_text_end; } +struct idle_timer { + struct hrtimer timer; + int done; +}; + +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer) +{ + struct idle_timer *it = container_of(timer, struct idle_timer, timer); + + WRITE_ONCE(it->done, 1); + set_tsk_need_resched(current); + + return HRTIMER_NORESTART; +} + +void play_idle(unsigned long duration_ms) +{ + struct idle_timer it; + + /* + * Only FIFO tasks can disable the tick since they don't need the forced + * preemption. + */ + WARN_ON_ONCE(current->policy != SCHED_FIFO); + WARN_ON_ONCE(current->nr_cpus_allowed != 1); + WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); + WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); + WARN_ON_ONCE(!duration_ms); + + rcu_sleep_check(); + preempt_disable(); + current->flags |= PF_IDLE; + cpuidle_use_deepest_state(true); + + it.done = 0; + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + it.timer.function = idle_inject_timer_fn; + hrtimer_start(&it.timer, ms_to_ktime(duration_ms), HRTIMER_MODE_REL_PINNED); + + while (!READ_ONCE(it.done)) + do_idle(); + + cpuidle_use_deepest_state(false); + current->flags &= ~PF_IDLE; + + preempt_fold_need_resched(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(play_idle); + void cpu_startup_entry(enum cpuhp_state state) { /* @@ -299,5 +341,6 @@ void cpu_startup_entry(enum cpuhp_state state) #endif arch_cpu_idle_prepare(); cpuhp_online_idle(state); - cpu_idle_loop(); + while (1) + do_idle(); } diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 0e9505f66ec1..b2a0cff2bb35 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -80,7 +80,14 @@ void kasan_unpoison_task_stack(struct task_struct *task) /* Unpoison the stack for the current task beyond a watermark sp value. */ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) { - __kasan_unpoison_stack(current, watermark); + /* + * Calculate the task stack base address. Avoid using 'current' + * because this function is called by early resume code which hasn't + * yet set up the percpu register (%gs). + */ + void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); + + kasan_unpoison_shadow(base, watermark - base); } /* |