summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
author Bjorn Helgaas <bhelgaas@google.com> 2024-11-25 13:40:43 -0600
committer Bjorn Helgaas <bhelgaas@google.com> 2024-11-25 13:40:43 -0600
commit d957ff7acaf27674e73db716a2dc0ae8170144cd (patch)
tree f90c056ad3e7176feab39c6aa33e80100593ba48 /drivers
parent 018247100d90e6f4a219150bc89792d9ed6a5ac0 (diff)
parent ba58eee1c57b2ad45c36f782861c18faef170a55 (diff)
download lwn-d957ff7acaf27674e73db716a2dc0ae8170144cd.tar.gz
lwn-d957ff7acaf27674e73db716a2dc0ae8170144cd.zip
Merge branch 'pci/bwctrl'
- Add read/modify/write locking for Link Control 2, which is used to manage Link speed (Ilpo Järvinen)
- Cache all supported Link speeds for use by the PCIe bandwidth controller (Ilpo Järvinen)
- Extract the Link Bandwidth Management Status check into pcie_lbms_seen(), where it can be shared between the bandwidth controller and quirks that use it to help retrain failed links (Ilpo Järvinen)
- Re-add Link Bandwidth notification support with updates to address the reasons it was previously reverted (Alexandru Gagniuc, Ilpo Järvinen)
- Add pcie_set_target_speed() and related functionality to manage PCIe Link speed based on thermal constraints (Ilpo Järvinen)
- Add a thermal cooling driver to throttle PCIe Links via the existing thermal management framework (Ilpo Järvinen)
- Add a userspace selftest for the PCIe bandwidth controller (Ilpo Järvinen)
- Drop duplicate pcie_get_speed_cap(), pcie_get_width_cap() declarations (Bjorn Helgaas)

* pci/bwctrl:
  PCI: Drop duplicate pcie_get_speed_cap(), pcie_get_width_cap() declarations
  selftests/pcie_bwctrl: Create selftests
  thermal: Add PCIe cooling driver
  PCI/bwctrl: Add pcie_set_target_speed() to set PCIe Link Speed
  PCI/bwctrl: Re-add BW notification portdrv as PCIe BW controller
  PCI: Abstract LBMS seen check into pcie_lbms_seen()
  PCI: Refactor pcie_update_link_speed()
  PCI: Store all PCIe Supported Link Speeds
  PCI: Protect Link Control 2 Register with RMW locking
  Documentation PCI: Reformat RMW ops documentation
Diffstat (limited to 'drivers')
-rw-r--r--drivers/pci/hotplug/pciehp_ctrl.c5
-rw-r--r--drivers/pci/hotplug/pciehp_hpc.c2
-rw-r--r--drivers/pci/pci.c60
-rw-r--r--drivers/pci/pci.h41
-rw-r--r--drivers/pci/pcie/Makefile2
-rw-r--r--drivers/pci/pcie/bwctrl.c366
-rw-r--r--drivers/pci/pcie/portdrv.c9
-rw-r--r--drivers/pci/pcie/portdrv.h6
-rw-r--r--drivers/pci/probe.c15
-rw-r--r--drivers/pci/quirks.c32
-rw-r--r--drivers/thermal/Kconfig9
-rw-r--r--drivers/thermal/Makefile2
-rw-r--r--drivers/thermal/pcie_cooling.c80
13 files changed, 581 insertions, 48 deletions
diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c
index dcdbfcf404dd..d603a7aa7483 100644
--- a/drivers/pci/hotplug/pciehp_ctrl.c
+++ b/drivers/pci/hotplug/pciehp_ctrl.c
@@ -19,6 +19,8 @@
#include <linux/types.h>
#include <linux/pm_runtime.h>
#include <linux/pci.h>
+
+#include "../pci.h"
#include "pciehp.h"
/* The following routines constitute the bulk of the
@@ -127,6 +129,9 @@ static void remove_board(struct controller *ctrl, bool safe_removal)
pciehp_set_indicators(ctrl, PCI_EXP_SLTCTL_PWR_IND_OFF,
INDICATOR_NOOP);
+
+ /* Don't carry LBMS indications across */
+ pcie_reset_lbms_count(ctrl->pcie->port);
}
static int pciehp_enable_slot(struct controller *ctrl);
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 736ad8baa2a5..bb5a8d9f03ad 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -319,7 +319,7 @@ int pciehp_check_link_status(struct controller *ctrl)
return -1;
}
- pcie_update_link_speed(ctrl->pcie->port->subordinate, lnk_status);
+ __pcie_update_link_speed(ctrl->pcie->port->subordinate, lnk_status);
if (!found) {
ctrl_info(ctrl, "Slot(%s): No device found\n",
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 7d85c04fbba2..f85f380cdb9b 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -4740,7 +4740,7 @@ int pcie_retrain_link(struct pci_dev *pdev, bool use_lt)
* to track link speed or width changes made by hardware itself
* in attempt to correct unreliable link operation.
*/
- pcie_capability_write_word(pdev, PCI_EXP_LNKSTA, PCI_EXP_LNKSTA_LBMS);
+ pcie_reset_lbms_count(pdev);
return rc;
}
@@ -6189,38 +6189,64 @@ u32 pcie_bandwidth_available(struct pci_dev *dev, struct pci_dev **limiting_dev,
EXPORT_SYMBOL(pcie_bandwidth_available);
/**
- * pcie_get_speed_cap - query for the PCI device's link speed capability
+ * pcie_get_supported_speeds - query Supported Link Speed Vector
* @dev: PCI device to query
*
- * Query the PCI device speed capability. Return the maximum link speed
- * supported by the device.
+ * Query @dev supported link speeds.
+ *
+ * Implementation Note in PCIe r6.0 sec 7.5.3.18 recommends determining
+ * supported link speeds using the Supported Link Speeds Vector in the Link
+ * Capabilities 2 Register (when available).
+ *
+ * Link Capabilities 2 was added in PCIe r3.0, sec 7.8.18.
+ *
+ * Without Link Capabilities 2, i.e., prior to PCIe r3.0, Supported Link
+ * Speeds field in Link Capabilities is used and only 2.5 GT/s and 5.0 GT/s
+ * speeds were defined.
+ *
+ * For @dev without Supported Link Speed Vector, the field is synthesized
+ * from the Max Link Speed field in the Link Capabilities Register.
+ *
+ * Return: Supported Link Speeds Vector (+ reserved 0 at LSB).
*/
-enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
+u8 pcie_get_supported_speeds(struct pci_dev *dev)
{
u32 lnkcap2, lnkcap;
+ u8 speeds;
/*
- * Link Capabilities 2 was added in PCIe r3.0, sec 7.8.18. The
- * implementation note there recommends using the Supported Link
- * Speeds Vector in Link Capabilities 2 when supported.
- *
- * Without Link Capabilities 2, i.e., prior to PCIe r3.0, software
- * should use the Supported Link Speeds field in Link Capabilities,
- * where only 2.5 GT/s and 5.0 GT/s speeds were defined.
+ * Speeds retain the reserved 0 at LSB before PCIe Supported Link
+ * Speeds Vector to allow using SLS Vector bit defines directly.
*/
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP2, &lnkcap2);
+ speeds = lnkcap2 & PCI_EXP_LNKCAP2_SLS;
/* PCIe r3.0-compliant */
- if (lnkcap2)
- return PCIE_LNKCAP2_SLS2SPEED(lnkcap2);
+ if (speeds)
+ return speeds;
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
+
+ /* Synthesize from the Max Link Speed field */
if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_5_0GB)
- return PCIE_SPEED_5_0GT;
+ speeds = PCI_EXP_LNKCAP2_SLS_5_0GB | PCI_EXP_LNKCAP2_SLS_2_5GB;
else if ((lnkcap & PCI_EXP_LNKCAP_SLS) == PCI_EXP_LNKCAP_SLS_2_5GB)
- return PCIE_SPEED_2_5GT;
+ speeds = PCI_EXP_LNKCAP2_SLS_2_5GB;
- return PCI_SPEED_UNKNOWN;
+ return speeds;
+}
+
+/**
+ * pcie_get_speed_cap - query for the PCI device's link speed capability
+ * @dev: PCI device to query
+ *
+ * Query the PCI device speed capability.
+ *
+ * Return: the maximum link speed supported by the device.
+ */
+enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev)
+{
+ return PCIE_LNKCAP2_SLS2SPEED(dev->supported_speeds);
}
EXPORT_SYMBOL(pcie_get_speed_cap);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 14d00ce45bfa..1d5c519e19b1 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -331,6 +331,17 @@ void pci_disable_bridge_window(struct pci_dev *dev);
struct pci_bus *pci_bus_get(struct pci_bus *bus);
void pci_bus_put(struct pci_bus *bus);
+#define PCIE_LNKCAP_SLS2SPEED(lnkcap) \
+({ \
+ ((lnkcap) == PCI_EXP_LNKCAP_SLS_64_0GB ? PCIE_SPEED_64_0GT : \
+ (lnkcap) == PCI_EXP_LNKCAP_SLS_32_0GB ? PCIE_SPEED_32_0GT : \
+ (lnkcap) == PCI_EXP_LNKCAP_SLS_16_0GB ? PCIE_SPEED_16_0GT : \
+ (lnkcap) == PCI_EXP_LNKCAP_SLS_8_0GB ? PCIE_SPEED_8_0GT : \
+ (lnkcap) == PCI_EXP_LNKCAP_SLS_5_0GB ? PCIE_SPEED_5_0GT : \
+ (lnkcap) == PCI_EXP_LNKCAP_SLS_2_5GB ? PCIE_SPEED_2_5GT : \
+ PCI_SPEED_UNKNOWN); \
+})
+
/* PCIe link information from Link Capabilities 2 */
#define PCIE_LNKCAP2_SLS2SPEED(lnkcap2) \
((lnkcap2) & PCI_EXP_LNKCAP2_SLS_64_0GB ? PCIE_SPEED_64_0GT : \
@@ -341,6 +352,15 @@ void pci_bus_put(struct pci_bus *bus);
(lnkcap2) & PCI_EXP_LNKCAP2_SLS_2_5GB ? PCIE_SPEED_2_5GT : \
PCI_SPEED_UNKNOWN)
+#define PCIE_LNKCTL2_TLS2SPEED(lnkctl2) \
+ ((lnkctl2) == PCI_EXP_LNKCTL2_TLS_64_0GT ? PCIE_SPEED_64_0GT : \
+ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_32_0GT ? PCIE_SPEED_32_0GT : \
+ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_16_0GT ? PCIE_SPEED_16_0GT : \
+ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_8_0GT ? PCIE_SPEED_8_0GT : \
+ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_5_0GT ? PCIE_SPEED_5_0GT : \
+ (lnkctl2) == PCI_EXP_LNKCTL2_TLS_2_5GT ? PCIE_SPEED_2_5GT : \
+ PCI_SPEED_UNKNOWN)
+
/* PCIe speed to Mb/s reduced by encoding overhead */
#define PCIE_SPEED2MBS_ENC(speed) \
((speed) == PCIE_SPEED_64_0GT ? 64000*1/1 : \
@@ -373,12 +393,16 @@ static inline int pcie_dev_speed_mbps(enum pci_bus_speed speed)
return -EINVAL;
}
+u8 pcie_get_supported_speeds(struct pci_dev *dev);
const char *pci_speed_string(enum pci_bus_speed speed);
-enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev);
-enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev);
void __pcie_print_link_status(struct pci_dev *dev, bool verbose);
void pcie_report_downtraining(struct pci_dev *dev);
-void pcie_update_link_speed(struct pci_bus *bus, u16 link_status);
+
+static inline void __pcie_update_link_speed(struct pci_bus *bus, u16 linksta)
+{
+ bus->cur_bus_speed = pcie_link_speed[linksta & PCI_EXP_LNKSTA_CLS];
+}
+void pcie_update_link_speed(struct pci_bus *bus);
/* Single Root I/O Virtualization */
struct pci_sriov {
@@ -692,6 +716,17 @@ static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { }
static inline void pcie_ecrc_get_policy(char *str) { }
#endif
+#ifdef CONFIG_PCIEPORTBUS
+void pcie_reset_lbms_count(struct pci_dev *port);
+int pcie_lbms_count(struct pci_dev *port, unsigned long *val);
+#else
+static inline void pcie_reset_lbms_count(struct pci_dev *port) {}
+static inline int pcie_lbms_count(struct pci_dev *port, unsigned long *val)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
struct pci_dev_reset_methods {
u16 vendor;
u16 device;
diff --git a/drivers/pci/pcie/Makefile b/drivers/pci/pcie/Makefile
index 6461aa93fe76..53ccab62314d 100644
--- a/drivers/pci/pcie/Makefile
+++ b/drivers/pci/pcie/Makefile
@@ -4,7 +4,7 @@
pcieportdrv-y := portdrv.o rcec.o
-obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o
+obj-$(CONFIG_PCIEPORTBUS) += pcieportdrv.o bwctrl.o
obj-y += aspm.o
obj-$(CONFIG_PCIEAER) += aer.o err.o
diff --git a/drivers/pci/pcie/bwctrl.c b/drivers/pci/pcie/bwctrl.c
new file mode 100644
index 000000000000..b59cacc740fa
--- /dev/null
+++ b/drivers/pci/pcie/bwctrl.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * PCIe bandwidth controller
+ *
+ * Author: Alexandru Gagniuc <mr.nuke.me@gmail.com>
+ *
+ * Copyright (C) 2019 Dell Inc
+ * Copyright (C) 2023-2024 Intel Corporation
+ *
+ * The PCIe bandwidth controller provides a way to alter PCIe Link Speeds
+ * and notify the operating system when the Link Width or Speed changes. The
+ * notification capability is required for all Root Ports and Downstream
+ * Ports supporting Link Width wider than x1 and/or multiple Link Speeds.
+ *
+ * This service port driver hooks into the Bandwidth Notification interrupt
+ * watching for changes or links becoming degraded in operation. It updates
+ * the cached Current Link Speed that is exposed to user space through sysfs.
+ */
+
+#define dev_fmt(fmt) "bwctrl: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/bits.h>
+#include <linux/cleanup.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/pci-bwctrl.h>
+#include <linux/rwsem.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+
+#include "../pci.h"
+#include "portdrv.h"
+
+/**
+ * struct pcie_bwctrl_data - PCIe bandwidth controller
+ * @set_speed_mutex: Serializes link speed changes
+ * @lbms_count: Count for LBMS (since last reset)
+ * @cdev: Thermal cooling device associated with the port
+ */
+struct pcie_bwctrl_data {
+ struct mutex set_speed_mutex;
+ atomic_t lbms_count;
+ struct thermal_cooling_device *cdev;
+};
+
+/*
+ * Prevent port removal during LBMS count accessors and Link Speed changes.
+ *
+ * These have to be differentiated because pcie_bwctrl_change_speed() calls
+ * pcie_retrain_link() which uses LBMS count reset accessor on success
+ * (using just one rwsem triggers "possible recursive locking detected"
+ * warning).
+ */
+static DECLARE_RWSEM(pcie_bwctrl_lbms_rwsem);
+static DECLARE_RWSEM(pcie_bwctrl_setspeed_rwsem);
+
+static bool pcie_valid_speed(enum pci_bus_speed speed)
+{
+ return (speed >= PCIE_SPEED_2_5GT) && (speed <= PCIE_SPEED_64_0GT);
+}
+
+static u16 pci_bus_speed2lnkctl2(enum pci_bus_speed speed)
+{
+ static const u8 speed_conv[] = {
+ [PCIE_SPEED_2_5GT] = PCI_EXP_LNKCTL2_TLS_2_5GT,
+ [PCIE_SPEED_5_0GT] = PCI_EXP_LNKCTL2_TLS_5_0GT,
+ [PCIE_SPEED_8_0GT] = PCI_EXP_LNKCTL2_TLS_8_0GT,
+ [PCIE_SPEED_16_0GT] = PCI_EXP_LNKCTL2_TLS_16_0GT,
+ [PCIE_SPEED_32_0GT] = PCI_EXP_LNKCTL2_TLS_32_0GT,
+ [PCIE_SPEED_64_0GT] = PCI_EXP_LNKCTL2_TLS_64_0GT,
+ };
+
+ if (WARN_ON_ONCE(!pcie_valid_speed(speed)))
+ return 0;
+
+ return speed_conv[speed];
+}
+
+static inline u16 pcie_supported_speeds2target_speed(u8 supported_speeds)
+{
+ return __fls(supported_speeds);
+}
+
+/**
+ * pcie_bwctrl_select_speed - Select Target Link Speed
+ * @port: PCIe Port
+ * @speed_req: Requested PCIe Link Speed
+ *
+ * Select Target Link Speed by taking into account Supported Link Speeds of
+ * both the Root Port and the Endpoint.
+ *
+ * Return: Target Link Speed (1=2.5GT/s, 2=5GT/s, 3=8GT/s, etc.)
+ */
+static u16 pcie_bwctrl_select_speed(struct pci_dev *port, enum pci_bus_speed speed_req)
+{
+ struct pci_bus *bus = port->subordinate;
+ u8 desired_speeds, supported_speeds;
+ struct pci_dev *dev;
+
+ desired_speeds = GENMASK(pci_bus_speed2lnkctl2(speed_req),
+ __fls(PCI_EXP_LNKCAP2_SLS_2_5GB));
+
+ supported_speeds = port->supported_speeds;
+ if (bus) {
+ down_read(&pci_bus_sem);
+ dev = list_first_entry_or_null(&bus->devices, struct pci_dev, bus_list);
+ if (dev)
+ supported_speeds &= dev->supported_speeds;
+ up_read(&pci_bus_sem);
+ }
+ if (!supported_speeds)
+ return PCI_EXP_LNKCAP2_SLS_2_5GB;
+
+ return pcie_supported_speeds2target_speed(supported_speeds & desired_speeds);
+}
+
+static int pcie_bwctrl_change_speed(struct pci_dev *port, u16 target_speed, bool use_lt)
+{
+ int ret;
+
+ ret = pcie_capability_clear_and_set_word(port, PCI_EXP_LNKCTL2,
+ PCI_EXP_LNKCTL2_TLS, target_speed);
+ if (ret != PCIBIOS_SUCCESSFUL)
+ return pcibios_err_to_errno(ret);
+
+ ret = pcie_retrain_link(port, use_lt);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Ensure the link speed is also updated on platforms that have
+ * problems with notifications.
+ */
+ if (port->subordinate)
+ pcie_update_link_speed(port->subordinate);
+
+ return 0;
+}
+
+/**
+ * pcie_set_target_speed - Set downstream Link Speed for PCIe Port
+ * @port: PCIe Port
+ * @speed_req: Requested PCIe Link Speed
+ * @use_lt: Wait for the LT or DLLLA bit to detect the end of link training
+ *
+ * Attempt to set PCIe Port Link Speed to @speed_req. @speed_req may be
+ * adjusted downwards to the best speed supported by both the Port and PCIe
+ * Device underneath it.
+ *
+ * Return:
+ * * 0 - on success
+ * * -EINVAL - @speed_req is not a PCIe Link Speed
+ * * -ENODEV - @port is not controllable
+ * * -ETIMEDOUT - changing Link Speed took too long
+ * * -EAGAIN - Link Speed was changed but @speed_req was not achieved
+ */
+int pcie_set_target_speed(struct pci_dev *port, enum pci_bus_speed speed_req,
+ bool use_lt)
+{
+ struct pci_bus *bus = port->subordinate;
+ u16 target_speed;
+ int ret;
+
+ if (WARN_ON_ONCE(!pcie_valid_speed(speed_req)))
+ return -EINVAL;
+
+ if (bus && bus->cur_bus_speed == speed_req)
+ return 0;
+
+ target_speed = pcie_bwctrl_select_speed(port, speed_req);
+
+ scoped_guard(rwsem_read, &pcie_bwctrl_setspeed_rwsem) {
+ struct pcie_bwctrl_data *data = port->link_bwctrl;
+
+ /*
+ * port->link_bwctrl is NULL during initial scan when called
+ * e.g. from the Target Speed quirk.
+ */
+ if (data)
+ mutex_lock(&data->set_speed_mutex);
+
+ ret = pcie_bwctrl_change_speed(port, target_speed, use_lt);
+
+ if (data)
+ mutex_unlock(&data->set_speed_mutex);
+ }
+
+ /*
+ * Despite setting a higher speed into the Target Link Speed, an empty
+ * bus won't train to 5GT+ speeds.
+ */
+ if (!ret && bus && bus->cur_bus_speed != speed_req &&
+ !list_empty(&bus->devices))
+ ret = -EAGAIN;
+
+ return ret;
+}
+
+static void pcie_bwnotif_enable(struct pcie_device *srv)
+{
+ struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
+ struct pci_dev *port = srv->port;
+ u16 link_status;
+ int ret;
+
+ /* Count LBMS seen so far as one */
+ ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
+ if (ret == PCIBIOS_SUCCESSFUL && link_status & PCI_EXP_LNKSTA_LBMS)
+ atomic_inc(&data->lbms_count);
+
+ pcie_capability_set_word(port, PCI_EXP_LNKCTL,
+ PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
+ pcie_capability_write_word(port, PCI_EXP_LNKSTA,
+ PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS);
+
+ /*
+ * Update after enabling notifications & clearing status bits ensures
+ * link speed is up to date.
+ */
+ pcie_update_link_speed(port->subordinate);
+}
+
+static void pcie_bwnotif_disable(struct pci_dev *port)
+{
+ pcie_capability_clear_word(port, PCI_EXP_LNKCTL,
+ PCI_EXP_LNKCTL_LBMIE | PCI_EXP_LNKCTL_LABIE);
+}
+
+static irqreturn_t pcie_bwnotif_irq(int irq, void *context)
+{
+ struct pcie_device *srv = context;
+ struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
+ struct pci_dev *port = srv->port;
+ u16 link_status, events;
+ int ret;
+
+ ret = pcie_capability_read_word(port, PCI_EXP_LNKSTA, &link_status);
+ if (ret != PCIBIOS_SUCCESSFUL)
+ return IRQ_NONE;
+
+ events = link_status & (PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_LABS);
+ if (!events)
+ return IRQ_NONE;
+
+ if (events & PCI_EXP_LNKSTA_LBMS)
+ atomic_inc(&data->lbms_count);
+
+ pcie_capability_write_word(port, PCI_EXP_LNKSTA, events);
+
+ /*
+ * Interrupts will not be triggered from any further Link Speed
+ * change until LBMS is cleared by the write. Therefore, re-read the
+ * speed (inside pcie_update_link_speed()) after LBMS has been
+ * cleared to avoid missing link speed changes.
+ */
+ pcie_update_link_speed(port->subordinate);
+
+ return IRQ_HANDLED;
+}
+
+void pcie_reset_lbms_count(struct pci_dev *port)
+{
+ struct pcie_bwctrl_data *data;
+
+ guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);
+ data = port->link_bwctrl;
+ if (data)
+ atomic_set(&data->lbms_count, 0);
+ else
+ pcie_capability_write_word(port, PCI_EXP_LNKSTA,
+ PCI_EXP_LNKSTA_LBMS);
+}
+
+int pcie_lbms_count(struct pci_dev *port, unsigned long *val)
+{
+ struct pcie_bwctrl_data *data;
+
+ guard(rwsem_read)(&pcie_bwctrl_lbms_rwsem);
+ data = port->link_bwctrl;
+ if (!data)
+ return -ENOTTY;
+
+ *val = atomic_read(&data->lbms_count);
+
+ return 0;
+}
+
+static int pcie_bwnotif_probe(struct pcie_device *srv)
+{
+ struct pci_dev *port = srv->port;
+ int ret;
+
+ struct pcie_bwctrl_data *data = devm_kzalloc(&srv->device,
+ sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ ret = devm_mutex_init(&srv->device, &data->set_speed_mutex);
+ if (ret)
+ return ret;
+
+ ret = devm_request_irq(&srv->device, srv->irq, pcie_bwnotif_irq,
+ IRQF_SHARED, "PCIe bwctrl", srv);
+ if (ret)
+ return ret;
+
+ scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem) {
+ scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem) {
+ port->link_bwctrl = no_free_ptr(data);
+ pcie_bwnotif_enable(srv);
+ }
+ }
+
+ pci_dbg(port, "enabled with IRQ %d\n", srv->irq);
+
+ /* Don't fail on errors. Don't leave an IS_ERR() "pointer" in ->cdev */
+ port->link_bwctrl->cdev = pcie_cooling_device_register(port);
+ if (IS_ERR(port->link_bwctrl->cdev))
+ port->link_bwctrl->cdev = NULL;
+
+ return 0;
+}
+
+static void pcie_bwnotif_remove(struct pcie_device *srv)
+{
+ struct pcie_bwctrl_data *data = srv->port->link_bwctrl;
+
+ pcie_cooling_device_unregister(data->cdev);
+
+ pcie_bwnotif_disable(srv->port);
+
+ scoped_guard(rwsem_write, &pcie_bwctrl_setspeed_rwsem)
+ scoped_guard(rwsem_write, &pcie_bwctrl_lbms_rwsem)
+ srv->port->link_bwctrl = NULL;
+}
+
+static int pcie_bwnotif_suspend(struct pcie_device *srv)
+{
+ pcie_bwnotif_disable(srv->port);
+ return 0;
+}
+
+static int pcie_bwnotif_resume(struct pcie_device *srv)
+{
+ pcie_bwnotif_enable(srv);
+ return 0;
+}
+
+static struct pcie_port_service_driver pcie_bwctrl_driver = {
+ .name = "pcie_bwctrl",
+ .port_type = PCIE_ANY_PORT,
+ .service = PCIE_PORT_SERVICE_BWCTRL,
+ .probe = pcie_bwnotif_probe,
+ .suspend = pcie_bwnotif_suspend,
+ .resume = pcie_bwnotif_resume,
+ .remove = pcie_bwnotif_remove,
+};
+
+int __init pcie_bwctrl_init(void)
+{
+ return pcie_port_service_register(&pcie_bwctrl_driver);
+}
diff --git a/drivers/pci/pcie/portdrv.c b/drivers/pci/pcie/portdrv.c
index 6af5e0425872..5e10306b6308 100644
--- a/drivers/pci/pcie/portdrv.c
+++ b/drivers/pci/pcie/portdrv.c
@@ -68,7 +68,7 @@ static int pcie_message_numbers(struct pci_dev *dev, int mask,
*/
if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
- PCIE_PORT_SERVICE_BWNOTIF)) {
+ PCIE_PORT_SERVICE_BWCTRL)) {
pcie_capability_read_word(dev, PCI_EXP_FLAGS, &reg16);
*pme = FIELD_GET(PCI_EXP_FLAGS_IRQ, reg16);
nvec = *pme + 1;
@@ -150,11 +150,11 @@ static int pcie_port_enable_irq_vec(struct pci_dev *dev, int *irqs, int mask)
/* PME, hotplug and bandwidth notification share an MSI/MSI-X vector */
if (mask & (PCIE_PORT_SERVICE_PME | PCIE_PORT_SERVICE_HP |
- PCIE_PORT_SERVICE_BWNOTIF)) {
+ PCIE_PORT_SERVICE_BWCTRL)) {
pcie_irq = pci_irq_vector(dev, pme);
irqs[PCIE_PORT_SERVICE_PME_SHIFT] = pcie_irq;
irqs[PCIE_PORT_SERVICE_HP_SHIFT] = pcie_irq;
- irqs[PCIE_PORT_SERVICE_BWNOTIF_SHIFT] = pcie_irq;
+ irqs[PCIE_PORT_SERVICE_BWCTRL_SHIFT] = pcie_irq;
}
if (mask & PCIE_PORT_SERVICE_AER)
@@ -271,7 +271,7 @@ static int get_port_device_capability(struct pci_dev *dev)
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &linkcap);
if (linkcap & PCI_EXP_LNKCAP_LBNC)
- services |= PCIE_PORT_SERVICE_BWNOTIF;
+ services |= PCIE_PORT_SERVICE_BWCTRL;
}
return services;
@@ -828,6 +828,7 @@ static void __init pcie_init_services(void)
pcie_aer_init();
pcie_pme_init();
pcie_dpc_init();
+ pcie_bwctrl_init();
pcie_hp_init();
}
diff --git a/drivers/pci/pcie/portdrv.h b/drivers/pci/pcie/portdrv.h
index 12c89ea0313b..bd29d1cc7b8b 100644
--- a/drivers/pci/pcie/portdrv.h
+++ b/drivers/pci/pcie/portdrv.h
@@ -20,8 +20,8 @@
#define PCIE_PORT_SERVICE_HP (1 << PCIE_PORT_SERVICE_HP_SHIFT)
#define PCIE_PORT_SERVICE_DPC_SHIFT 3 /* Downstream Port Containment */
#define PCIE_PORT_SERVICE_DPC (1 << PCIE_PORT_SERVICE_DPC_SHIFT)
-#define PCIE_PORT_SERVICE_BWNOTIF_SHIFT 4 /* Bandwidth notification */
-#define PCIE_PORT_SERVICE_BWNOTIF (1 << PCIE_PORT_SERVICE_BWNOTIF_SHIFT)
+#define PCIE_PORT_SERVICE_BWCTRL_SHIFT 4 /* Bandwidth Controller (notifications) */
+#define PCIE_PORT_SERVICE_BWCTRL (1 << PCIE_PORT_SERVICE_BWCTRL_SHIFT)
#define PCIE_PORT_DEVICE_MAXSERVICES 5
@@ -51,6 +51,8 @@ int pcie_dpc_init(void);
static inline int pcie_dpc_init(void) { return 0; }
#endif
+int pcie_bwctrl_init(void);
+
/* Port Type */
#define PCIE_ANY_PORT (~0)
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 4f68414c3086..c138daf78961 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -742,9 +742,13 @@ const char *pci_speed_string(enum pci_bus_speed speed)
}
EXPORT_SYMBOL_GPL(pci_speed_string);
-void pcie_update_link_speed(struct pci_bus *bus, u16 linksta)
+void pcie_update_link_speed(struct pci_bus *bus)
{
- bus->cur_bus_speed = pcie_link_speed[linksta & PCI_EXP_LNKSTA_CLS];
+ struct pci_dev *bridge = bus->self;
+ u16 linksta;
+
+ pcie_capability_read_word(bridge, PCI_EXP_LNKSTA, &linksta);
+ __pcie_update_link_speed(bus, linksta);
}
EXPORT_SYMBOL_GPL(pcie_update_link_speed);
@@ -827,13 +831,11 @@ static void pci_set_bus_speed(struct pci_bus *bus)
if (pci_is_pcie(bridge)) {
u32 linkcap;
- u16 linksta;
pcie_capability_read_dword(bridge, PCI_EXP_LNKCAP, &linkcap);
bus->max_bus_speed = pcie_link_speed[linkcap & PCI_EXP_LNKCAP_SLS];
- pcie_capability_read_word(bridge, PCI_EXP_LNKSTA, &linksta);
- pcie_update_link_speed(bus, linksta);
+ pcie_update_link_speed(bus);
}
}
@@ -1947,6 +1949,9 @@ int pci_setup_device(struct pci_dev *dev)
set_pcie_untrusted(dev);
+ if (pci_is_pcie(dev))
+ dev->supported_speeds = pcie_get_supported_speeds(dev);
+
/* "Unknown power state" */
dev->current_state = PCI_UNKNOWN;
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index dccb60c1d9cc..dcf1c86a5488 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -33,6 +33,18 @@
#include <linux/switchtec.h>
#include "pci.h"
+static bool pcie_lbms_seen(struct pci_dev *dev, u16 lnksta)
+{
+ unsigned long count;
+ int ret;
+
+ ret = pcie_lbms_count(dev, &count);
+ if (ret < 0)
+ return lnksta & PCI_EXP_LNKSTA_LBMS;
+
+ return count > 0;
+}
+
/*
* Retrain the link of a downstream PCIe port by hand if necessary.
*
@@ -96,22 +108,16 @@ int pcie_failed_link_retrain(struct pci_dev *dev)
pcie_capability_read_word(dev, PCI_EXP_LNKCTL2, &lnkctl2);
pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &lnksta);
- if ((lnksta & (PCI_EXP_LNKSTA_LBMS | PCI_EXP_LNKSTA_DLLLA)) ==
- PCI_EXP_LNKSTA_LBMS) {
+ if (!(lnksta & PCI_EXP_LNKSTA_DLLLA) && pcie_lbms_seen(dev, lnksta)) {
u16 oldlnkctl2 = lnkctl2;
pci_info(dev, "broken device, retraining non-functional downstream link at 2.5GT/s\n");
- lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS;
- lnkctl2 |= PCI_EXP_LNKCTL2_TLS_2_5GT;
- pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2);
-
- ret = pcie_retrain_link(dev, false);
+ ret = pcie_set_target_speed(dev, PCIE_SPEED_2_5GT, false);
if (ret) {
pci_info(dev, "retraining failed\n");
- pcie_capability_write_word(dev, PCI_EXP_LNKCTL2,
- oldlnkctl2);
- pcie_retrain_link(dev, true);
+ pcie_set_target_speed(dev, PCIE_LNKCTL2_TLS2SPEED(oldlnkctl2),
+ true);
return ret;
}
@@ -125,11 +131,7 @@ int pcie_failed_link_retrain(struct pci_dev *dev)
pci_info(dev, "removing 2.5GT/s downstream link speed restriction\n");
pcie_capability_read_dword(dev, PCI_EXP_LNKCAP, &lnkcap);
- lnkctl2 &= ~PCI_EXP_LNKCTL2_TLS;
- lnkctl2 |= lnkcap & PCI_EXP_LNKCAP_SLS;
- pcie_capability_write_word(dev, PCI_EXP_LNKCTL2, lnkctl2);
-
- ret = pcie_retrain_link(dev, false);
+ ret = pcie_set_target_speed(dev, PCIE_LNKCAP_SLS2SPEED(lnkcap), false);
if (ret) {
pci_info(dev, "retraining failed\n");
return ret;
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 61e7ae524b1f..d3f9686e26e7 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -220,6 +220,15 @@ config DEVFREQ_THERMAL
If you want this support, you should say Y here.
+config PCIE_THERMAL
+ bool "PCIe cooling support"
+ depends on PCIEPORTBUS
+ help
+ This implements PCIe cooling mechanism through bandwidth reduction
+ for PCIe devices.
+
+ If you want this support, you should say Y here.
+
config THERMAL_EMULATION
bool "Thermal emulation mode support"
help
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 41c4d56beb40..210c16c91461 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -31,6 +31,8 @@ thermal_sys-$(CONFIG_CPU_IDLE_THERMAL) += cpuidle_cooling.o
# devfreq cooling
thermal_sys-$(CONFIG_DEVFREQ_THERMAL) += devfreq_cooling.o
+thermal_sys-$(CONFIG_PCIE_THERMAL) += pcie_cooling.o
+
obj-$(CONFIG_K3_THERMAL) += k3_bandgap.o k3_j72xx_bandgap.o
# platform thermal drivers
obj-y += broadcom/
diff --git a/drivers/thermal/pcie_cooling.c b/drivers/thermal/pcie_cooling.c
new file mode 100644
index 000000000000..a876d64f1582
--- /dev/null
+++ b/drivers/thermal/pcie_cooling.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PCIe cooling device
+ *
+ * Copyright (C) 2023-2024 Intel Corporation
+ */
+
+#include <linux/build_bug.h>
+#include <linux/cleanup.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/pci-bwctrl.h>
+#include <linux/slab.h>
+#include <linux/sprintf.h>
+#include <linux/thermal.h>
+
+#define COOLING_DEV_TYPE_PREFIX "PCIe_Port_Link_Speed_"
+
+static int pcie_cooling_get_max_level(struct thermal_cooling_device *cdev, unsigned long *state)
+{
+ struct pci_dev *port = cdev->devdata;
+
+ /* cooling state 0 is same as the maximum PCIe speed */
+ *state = port->subordinate->max_bus_speed - PCIE_SPEED_2_5GT;
+
+ return 0;
+}
+
+static int pcie_cooling_get_cur_level(struct thermal_cooling_device *cdev, unsigned long *state)
+{
+ struct pci_dev *port = cdev->devdata;
+
+ /* cooling state 0 is same as the maximum PCIe speed */
+ *state = cdev->max_state - (port->subordinate->cur_bus_speed - PCIE_SPEED_2_5GT);
+
+ return 0;
+}
+
+static int pcie_cooling_set_cur_level(struct thermal_cooling_device *cdev, unsigned long state)
+{
+ struct pci_dev *port = cdev->devdata;
+ enum pci_bus_speed speed;
+
+ /* cooling state 0 is same as the maximum PCIe speed */
+ speed = (cdev->max_state - state) + PCIE_SPEED_2_5GT;
+
+ return pcie_set_target_speed(port, speed, true);
+}
+
+static struct thermal_cooling_device_ops pcie_cooling_ops = {
+ .get_max_state = pcie_cooling_get_max_level,
+ .get_cur_state = pcie_cooling_get_cur_level,
+ .set_cur_state = pcie_cooling_set_cur_level,
+};
+
+struct thermal_cooling_device *pcie_cooling_device_register(struct pci_dev *port)
+{
+ char *name __free(kfree) =
+ kasprintf(GFP_KERNEL, COOLING_DEV_TYPE_PREFIX "%s", pci_name(port));
+ if (!name)
+ return ERR_PTR(-ENOMEM);
+
+ return thermal_cooling_device_register(name, port, &pcie_cooling_ops);
+}
+
+void pcie_cooling_device_unregister(struct thermal_cooling_device *cdev)
+{
+ thermal_cooling_device_unregister(cdev);
+}
+
+/* For bus_speed <-> state arithmetic */
+static_assert(PCIE_SPEED_2_5GT + 1 == PCIE_SPEED_5_0GT);
+static_assert(PCIE_SPEED_5_0GT + 1 == PCIE_SPEED_8_0GT);
+static_assert(PCIE_SPEED_8_0GT + 1 == PCIE_SPEED_16_0GT);
+static_assert(PCIE_SPEED_16_0GT + 1 == PCIE_SPEED_32_0GT);
+static_assert(PCIE_SPEED_32_0GT + 1 == PCIE_SPEED_64_0GT);
+
+MODULE_AUTHOR("Ilpo Järvinen <ilpo.jarvinen@linux.intel.com>");
+MODULE_DESCRIPTION("PCIe cooling driver");