From 1af89dedc8a58006d8e385b1e0d2cd24df8a3b69 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Jun 2024 20:27:19 +0200 Subject: thermal: core: Do not fail cdev registration because of invalid initial state It is reported that commit 31a0fa0019b0 ("thermal/debugfs: Pass cooling device state to thermal_debug_cdev_add()") causes the ACPI fan driver to fail probing on some systems which turns out to be due to the _FST control method returning an invalid value until _FSL is first evaluated for the given fan. If this happens, the .get_cur_state() cooling device callback returns an error and __thermal_cooling_device_register() fails as uses that callback after commit 31a0fa0019b0. Arguably, _FST should not return an invalid value even if it is evaluated before _FSL, so this may be regarded as a platform firmware issue, but at the same time it is not a good enough reason for failing the cooling device registration where the initial cooling device state is only needed to initialize a thermal debug facility. Accordingly, modify __thermal_cooling_device_register() to avoid calling thermal_debug_cdev_add() instead of returning an error if the initial .get_cur_state() callback invocation fails. Fixes: 31a0fa0019b0 ("thermal/debugfs: Pass cooling device state to thermal_debug_cdev_add()") Closes: https://lore.kernel.org/linux-acpi/20240530153727.843378-1-laura.nao@collabora.com Reported-by: Laura Nao Signed-off-by: Rafael J. Wysocki Acked-by: Daniel Lezcano Tested-by: Laura Nao --- drivers/thermal/thermal_core.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 30567b499455..d70e76dd3c94 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -999,9 +999,17 @@ __thermal_cooling_device_register(struct device_node *np, if (ret) goto out_cdev_type; + /* + * The cooling device's current state is only needed for debug + * initialization below, so a failure to get it does not cause + * the entire cooling device initialization to fail. However, + * the debug will not work for the device if its initial state + * cannot be determined and drivers are responsible for ensuring + * that this will not happen. + */ ret = cdev->ops->get_cur_state(cdev, ¤t_state); if (ret) - goto out_cdev_type; + current_state = ULONG_MAX; thermal_cooling_device_setup_sysfs(cdev); @@ -1016,7 +1024,8 @@ __thermal_cooling_device_register(struct device_node *np, return ERR_PTR(ret); } - thermal_debug_cdev_add(cdev, current_state); + if (current_state <= cdev->max_state) + thermal_debug_cdev_add(cdev, current_state); /* Add 'this' new cdev to the global cdev list */ mutex_lock(&thermal_list_lock); -- cgit v1.2.3 From 7f18bd49cb6b6a3ab6d860fefccdc94f2a247db0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Jun 2024 20:27:30 +0200 Subject: thermal: ACPI: Invalidate trip points with temperature of 0 or below It is reported that commit 950210887670 ("thermal: core: Drop trips_disabled bitmask") causes the maximum frequency of CPUs to drop further down with every system sleep-wake cycle on Intel Core i7-4710HQ. This turns out to be due to a trip point whose temperature is equal to 0 degrees Celsius which is acted on every time the system wakes from sleep. Before commit 950210887670 this trip point would be disabled wia the trips_disabled bitmask, but now it is treated as a valid one. Since ACPI thermal control is generally about protection against overheating, trip points with temperature of 0 centigrade or below are not particularly useful there, so initialize them all as invalid which fixes the problem at hand. Fixes: 950210887670 ("thermal: core: Drop trips_disabled bitmask") Closes: https://lore.kernel.org/linux-pm/3f71747b-f852-4ee0-b384-cf46b2aefa3f@gmx.com Reported-by: Tibor Billes Tested-by: Tibor Billes Cc: 6.7+ # 6.7+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/thermal.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index d67881b50bca..a0cfc857fb55 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -168,11 +168,17 @@ static int acpi_thermal_get_polling_frequency(struct acpi_thermal *tz) static int acpi_thermal_temp(struct acpi_thermal *tz, int temp_deci_k) { + int temp; + if (temp_deci_k == THERMAL_TEMP_INVALID) return THERMAL_TEMP_INVALID; - return deci_kelvin_to_millicelsius_with_offset(temp_deci_k, + temp = deci_kelvin_to_millicelsius_with_offset(temp_deci_k, tz->kelvin_offset); + if (temp <= 0) + return THERMAL_TEMP_INVALID; + + return temp; } static bool acpi_thermal_trip_valid(struct acpi_thermal_trip *acpi_trip) -- cgit v1.2.3 From b6846826982b9f2f2ad0e79540521b517469ee92 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Jun 2024 16:51:23 +0200 Subject: thermal: gov_step_wise: Restore passive polling management Consider a thermal zone with one passive trip point, a cooling device with 3 states (0, 1, 2) bound to it, passive polling enabled (nonzero passive_delay_jiffies) and no regular polling (polling_delay_jiffies equal to 0) that is managed by the Step-Wise governor. Suppose that the initial state of the cooling device is 0 and the zone temperature is below the trip point to start with. When the trip point is crossed, tz->passive is incremented by the thermal core and the governor's .manage() callback is invoked. It sets 'throttle' to 'true' for the trip in question and get_target_state() returns 1 for the instance corresponding to the cooling device (say that 'upper' and 'lower' are set to 2 and 0 for it, respectively), so its state changes to 1. Passive polling is still active for the zone, so next time the temperature is updated, the governor's .manage() callback will be invoked again. If the temperature is still rising, it will change the state of the cooling device to 2. Now suppose that next time the zone temperature is updated, it falls below the trip point, so tz->passive is decremented for the zone (say it becomes 0 then) and the governor's .manage() callbacks runs. It finds that the temperature trend for the zone is 'falling' and 'throttle' will be set to 'false' for the trip in question, so the cooling device's state will be changed to 1. However, because tz->polling is 0 for the zone, the governor's .manage() callback may not be invoked again for a long time and the cooling device's state will not be reset back to 0. This can happen because commit 042a3d80f118 ("thermal: core: Move passive polling management to the core") removed passive polling management from the Step-Wise governor. Before that change, thermal_zone_trip_update() would bump up tz->passive when changing the target state for a thermal instance from "no target" to a specific value and it would drop tz->passive when changing it back to "no target" which would cause passive polling to be active for the zone until the governor has reset the states of all cooling devices. In particular, in the example above tz->passive would be incremented when changing the state of the cooling device from 0 to 1 and then it would be still nonzero when the state of the cooling device was changed from 2 to 1. To prevent this problem from occurring, restore the passive polling management in the Step-Wise governor by partially reverting the commit in question and update the comment in the restored code to explain its role more clearly. Fixes: 042a3d80f118 ("thermal: core: Move passive polling management to the core") Closes: https://lore.kernel.org/linux-pm/ZmVfcEOxmjUHZTSX@hovoldconsulting.com Reported-by: Johan Hovold Tested-by: Johan Hovold Signed-off-by: Rafael J. Wysocki --- drivers/thermal/gov_step_wise.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c index e0fdc497bfcc..65974fe8be0d 100644 --- a/drivers/thermal/gov_step_wise.c +++ b/drivers/thermal/gov_step_wise.c @@ -93,6 +93,23 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, if (instance->initialized && old_target == instance->target) continue; + if (trip->type == THERMAL_TRIP_PASSIVE) { + /* + * If the target state for this thermal instance + * changes from THERMAL_NO_TARGET to something else, + * ensure that the zone temperature will be updated + * (assuming enabled passive cooling) until it becomes + * THERMAL_NO_TARGET again, or the cooling device may + * not be reset to its initial state. + */ + if (old_target == THERMAL_NO_TARGET && + instance->target != THERMAL_NO_TARGET) + tz->passive++; + else if (old_target != THERMAL_NO_TARGET && + instance->target == THERMAL_NO_TARGET) + tz->passive--; + } + instance->initialized = true; mutex_lock(&instance->cdev->lock); -- cgit v1.2.3