summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2026-04-14 16:48:56 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2026-04-14 16:48:56 -0700
commitc43267e6794a36013fd495a4d81bf7f748fe4615 (patch)
tree52de9204f6c6c94f1b419de234834ec0f4b454d2 /drivers
parent508fed6795411f5ab277fd1edc0d7adca4946f23 (diff)
parent480a9e57cceaf42db6ff874dbfe91de201935035 (diff)
downloadlwn-c43267e6794a36013fd495a4d81bf7f748fe4615.tar.gz
lwn-c43267e6794a36013fd495a4d81bf7f748fe4615.zip
Merge tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux
Pull arm64 updates from Catalin Marinas: "The biggest changes are MPAM enablement in drivers/resctrl and new PMU support under drivers/perf. On the core side, FEAT_LSUI lets futex atomic operations with EL0 permissions, avoiding PAN toggling. The rest is mostly TLB invalidation refactoring, further generic entry work, sysreg updates and a few fixes. Core features: - Add support for FEAT_LSUI, allowing futex atomic operations without toggling Privileged Access Never (PAN) - Further refactor the arm64 exception handling code towards the generic entry infrastructure - Optimise __READ_ONCE() with CONFIG_LTO=y and allow alias analysis through it Memory management: - Refactor the arm64 TLB invalidation API and implementation for better control over barrier placement and level-hinted invalidation - Enable batched TLB flushes during memory hot-unplug - Fix rodata=full block mapping support for realm guests (when BBML2_NOABORT is available) Perf and PMU: - Add support for a whole bunch of system PMUs featured in NVIDIA's Tegra410 SoC (cspmu extensions for the fabric and PCIe, new drivers for CPU/C2C memory latency PMUs) - Clean up iomem resource handling in the Arm CMN driver - Fix signedness handling of AA64DFR0.{PMUVer,PerfMon} MPAM (Memory Partitioning And Monitoring): - Add architecture context-switch and hiding of the feature from KVM - Add interface to allow MPAM to be exposed to user-space using resctrl - Add errata workaround for some existing platforms - Add documentation for using MPAM and what shape of platforms can use resctrl Miscellaneous: - Check DAIF (and PMR, where relevant) at task-switch time - Skip TFSR_EL1 checks and barriers in synchronous MTE tag check mode (only relevant to asynchronous or asymmetric tag check modes) - Remove a duplicate allocation in the kexec code - Remove redundant save/restore of SCS SP on entry to/from EL0 - Generate the KERNEL_HWCAP_ definitions from the arm64 hwcap descriptions - Add kselftest coverage for cmpbr_sigill() - 
Update sysreg definitions" * tag 'arm64-upstream' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux: (109 commits) arm64: rsi: use linear-map alias for realm config buffer arm64: Kconfig: fix duplicate word in CMDLINE help text arm64: mte: Skip TFSR_EL1 checks and barriers in synchronous tag check mode arm64/sysreg: Update ID_AA64SMFR0_EL1 description to DDI0601 2025-12 arm64/sysreg: Update ID_AA64ZFR0_EL1 description to DDI0601 2025-12 arm64/sysreg: Update ID_AA64FPFR0_EL1 description to DDI0601 2025-12 arm64/sysreg: Update ID_AA64ISAR2_EL1 description to DDI0601 2025-12 arm64/sysreg: Update ID_AA64ISAR0_EL1 description to DDI0601 2025-12 arm64/hwcap: Generate the KERNEL_HWCAP_ definitions for the hwcaps arm64: kexec: Remove duplicate allocation for trans_pgd ACPI: AGDI: fix missing newline in error message arm64: Check DAIF (and PMR) at task-switch time arm64: entry: Use split preemption logic arm64: entry: Use irqentry_{enter_from,exit_to}_kernel_mode() arm64: entry: Consistently prefix arm64-specific wrappers arm64: entry: Don't preempt with SError or Debug masked entry: Split preemption from irqentry_exit_to_kernel_mode() entry: Split kernel mode logic from irqentry_{enter,exit}() entry: Move irqentry_enter() prototype later entry: Remove local_irq_{enable,disable}_exit_to_user() ...
Diffstat (limited to 'drivers')
-rw-r--r--drivers/acpi/arm64/agdi.c2
-rw-r--r--drivers/perf/Kconfig14
-rw-r--r--drivers/perf/Makefile2
-rw-r--r--drivers/perf/arm-cmn.c68
-rw-r--r--drivers/perf/arm_cspmu/arm_cspmu.c19
-rw-r--r--drivers/perf/arm_cspmu/arm_cspmu.h17
-rw-r--r--drivers/perf/arm_cspmu/nvidia_cspmu.c618
-rw-r--r--drivers/perf/nvidia_t410_c2c_pmu.c1051
-rw-r--r--drivers/perf/nvidia_t410_cmem_latency_pmu.c736
-rw-r--r--drivers/resctrl/Kconfig9
-rw-r--r--drivers/resctrl/Makefile1
-rw-r--r--drivers/resctrl/mpam_devices.c303
-rw-r--r--drivers/resctrl/mpam_internal.h108
-rw-r--r--drivers/resctrl/mpam_resctrl.c1704
-rw-r--r--drivers/resctrl/test_mpam_resctrl.c315
15 files changed, 4875 insertions, 92 deletions
diff --git a/drivers/acpi/arm64/agdi.c b/drivers/acpi/arm64/agdi.c
index feb4b2cb4618..0c2d9d6c160b 100644
--- a/drivers/acpi/arm64/agdi.c
+++ b/drivers/acpi/arm64/agdi.c
@@ -36,7 +36,7 @@ static int agdi_sdei_probe(struct platform_device *pdev,
err = sdei_event_register(adata->sdei_event, agdi_sdei_handler, pdev);
if (err) {
- dev_err(&pdev->dev, "Failed to register for SDEI event %d",
+ dev_err(&pdev->dev, "Failed to register for SDEI event %d\n",
adata->sdei_event);
return err;
}
diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 638321fc9800..ab90932fc2d0 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -311,4 +311,18 @@ config MARVELL_PEM_PMU
Enable support for PCIe Interface performance monitoring
on Marvell platform.
+config NVIDIA_TEGRA410_CMEM_LATENCY_PMU
+ tristate "NVIDIA Tegra410 CPU Memory Latency PMU"
+ depends on ARM64 && ACPI
+ help
+ Enable perf support for CPU memory latency counters monitoring on
+ NVIDIA Tegra410 SoC.
+
+config NVIDIA_TEGRA410_C2C_PMU
+ tristate "NVIDIA Tegra410 C2C PMU"
+ depends on ARM64 && ACPI
+ help
+ Enable perf support for counters in NVIDIA C2C interface of NVIDIA
+ Tegra410 SoC.
+
endmenu
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index ea52711a87e3..eb8a022dad9a 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -35,3 +35,5 @@ obj-$(CONFIG_DWC_PCIE_PMU) += dwc_pcie_pmu.o
obj-$(CONFIG_ARM_CORESIGHT_PMU_ARCH_SYSTEM_PMU) += arm_cspmu/
obj-$(CONFIG_MESON_DDR_PMU) += amlogic/
obj-$(CONFIG_CXL_PMU) += cxl_pmu.o
+obj-$(CONFIG_NVIDIA_TEGRA410_CMEM_LATENCY_PMU) += nvidia_t410_cmem_latency_pmu.o
+obj-$(CONFIG_NVIDIA_TEGRA410_C2C_PMU) += nvidia_t410_c2c_pmu.o
diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 40c05c519a1d..f5305c8fdca4 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -2132,6 +2132,8 @@ static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp, i
static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int idx)
{
struct arm_cmn_dtc *dtc = cmn->dtc + idx;
+ const struct resource *cfg;
+ resource_size_t base, size;
dtc->pmu_base = dn->pmu_base;
dtc->base = dtc->pmu_base - arm_cmn_pmu_offset(cmn, dn);
@@ -2139,6 +2141,13 @@ static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int id
if (dtc->irq < 0)
return dtc->irq;
+ cfg = platform_get_resource(to_platform_device(cmn->dev), IORESOURCE_MEM, 0);
+ base = dtc->base - cmn->base + cfg->start;
+ size = cmn->part == PART_CMN600 ? SZ_16K : SZ_64K;
+ if (!devm_request_mem_region(cmn->dev, base, size, dev_name(cmn->dev)))
+ return dev_err_probe(cmn->dev, -EBUSY,
+ "Failed to request DTC region 0x%pa\n", &base);
+
writel_relaxed(CMN_DT_DTC_CTL_DT_EN, dtc->base + CMN_DT_DTC_CTL);
writel_relaxed(CMN_DT_PMCR_PMU_EN | CMN_DT_PMCR_OVFL_INTR_EN, CMN_DT_PMCR(dtc));
writeq_relaxed(0, CMN_DT_PMCCNTR(dtc));
@@ -2525,43 +2534,26 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
return 0;
}
-static int arm_cmn600_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn)
+static int arm_cmn_get_root(struct arm_cmn *cmn, const struct resource *cfg)
{
- struct resource *cfg, *root;
-
- cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- if (!cfg)
- return -EINVAL;
-
- root = platform_get_resource(pdev, IORESOURCE_MEM, 1);
- if (!root)
- return -EINVAL;
-
- if (!resource_contains(cfg, root))
- swap(cfg, root);
- /*
- * Note that devm_ioremap_resource() is dumb and won't let the platform
- * device claim cfg when the ACPI companion device has already claimed
- * root within it. But since they *are* already both claimed in the
- * appropriate name, we don't really need to do it again here anyway.
- */
- cmn->base = devm_ioremap(cmn->dev, cfg->start, resource_size(cfg));
- if (!cmn->base)
- return -ENOMEM;
+ const struct device_node *np = cmn->dev->of_node;
+ const struct resource *root;
+ u32 rootnode;
- return root->start - cfg->start;
-}
+ if (cmn->part != PART_CMN600)
+ return 0;
-static int arm_cmn600_of_probe(struct device_node *np)
-{
- u32 rootnode;
+ if (np)
+ return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode;
- return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode;
+ root = platform_get_resource(to_platform_device(cmn->dev), IORESOURCE_MEM, 1);
+ return root ? root->start - cfg->start : -EINVAL;
}
static int arm_cmn_probe(struct platform_device *pdev)
{
struct arm_cmn *cmn;
+ const struct resource *cfg;
const char *name;
static atomic_t id;
int err, rootnode, this_id;
@@ -2575,16 +2567,16 @@ static int arm_cmn_probe(struct platform_device *pdev)
cmn->cpu = cpumask_local_spread(0, dev_to_node(cmn->dev));
platform_set_drvdata(pdev, cmn);
- if (cmn->part == PART_CMN600 && has_acpi_companion(cmn->dev)) {
- rootnode = arm_cmn600_acpi_probe(pdev, cmn);
- } else {
- rootnode = 0;
- cmn->base = devm_platform_ioremap_resource(pdev, 0);
- if (IS_ERR(cmn->base))
- return PTR_ERR(cmn->base);
- if (cmn->part == PART_CMN600)
- rootnode = arm_cmn600_of_probe(pdev->dev.of_node);
- }
+ cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!cfg)
+ return -EINVAL;
+
+ /* Map the whole region now, claim the DTCs once we've found them */
+ cmn->base = devm_ioremap(cmn->dev, cfg->start, resource_size(cfg));
+ if (!cmn->base)
+ return -ENOMEM;
+
+ rootnode = arm_cmn_get_root(cmn, cfg);
if (rootnode < 0)
return rootnode;
diff --git a/drivers/perf/arm_cspmu/arm_cspmu.c b/drivers/perf/arm_cspmu/arm_cspmu.c
index ed72c3d1f796..80fb314d5135 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.c
+++ b/drivers/perf/arm_cspmu/arm_cspmu.c
@@ -16,7 +16,7 @@
* The user should refer to the vendor technical documentation to get details
* about the supported events.
*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
*/
@@ -1134,6 +1134,23 @@ static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu)
return 0;
}
+
+struct acpi_device *arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu)
+{
+ char hid[16] = {};
+ char uid[16] = {};
+ const struct acpi_apmt_node *apmt_node;
+
+ apmt_node = arm_cspmu_apmt_node(cspmu->dev);
+ if (!apmt_node || apmt_node->type != ACPI_APMT_NODE_TYPE_ACPI)
+ return NULL;
+
+ memcpy(hid, &apmt_node->inst_primary, sizeof(apmt_node->inst_primary));
+ snprintf(uid, sizeof(uid), "%u", apmt_node->inst_secondary);
+
+ return acpi_dev_get_first_match_dev(hid, uid, -1);
+}
+EXPORT_SYMBOL_GPL(arm_cspmu_acpi_dev_get);
#else
static int arm_cspmu_acpi_get_cpus(struct arm_cspmu *cspmu)
{
diff --git a/drivers/perf/arm_cspmu/arm_cspmu.h b/drivers/perf/arm_cspmu/arm_cspmu.h
index cd65a58dbd88..3fc5c8d77266 100644
--- a/drivers/perf/arm_cspmu/arm_cspmu.h
+++ b/drivers/perf/arm_cspmu/arm_cspmu.h
@@ -1,13 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0
*
* ARM CoreSight Architecture PMU driver.
- * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
*/
#ifndef __ARM_CSPMU_H__
#define __ARM_CSPMU_H__
+#include <linux/acpi.h>
#include <linux/bitfield.h>
#include <linux/cpumask.h>
#include <linux/device.h>
@@ -255,4 +256,18 @@ int arm_cspmu_impl_register(const struct arm_cspmu_impl_match *impl_match);
/* Unregister vendor backend. */
void arm_cspmu_impl_unregister(const struct arm_cspmu_impl_match *impl_match);
+#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64)
+/**
+ * Get ACPI device associated with the PMU.
+ * The caller is responsible for calling acpi_dev_put() on the returned device.
+ */
+struct acpi_device *arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu);
+#else
+static inline struct acpi_device *
+arm_cspmu_acpi_dev_get(const struct arm_cspmu *cspmu)
+{
+ return NULL;
+}
+#endif
+
#endif /* __ARM_CSPMU_H__ */
diff --git a/drivers/perf/arm_cspmu/nvidia_cspmu.c b/drivers/perf/arm_cspmu/nvidia_cspmu.c
index e06a06d3407b..bac83e424d6d 100644
--- a/drivers/perf/arm_cspmu/nvidia_cspmu.c
+++ b/drivers/perf/arm_cspmu/nvidia_cspmu.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
*/
@@ -8,6 +8,7 @@
#include <linux/io.h>
#include <linux/module.h>
+#include <linux/property.h>
#include <linux/topology.h>
#include "arm_cspmu.h"
@@ -21,6 +22,44 @@
#define NV_CNVL_PORT_COUNT 4ULL
#define NV_CNVL_FILTER_ID_MASK GENMASK_ULL(NV_CNVL_PORT_COUNT - 1, 0)
+#define NV_UCF_SRC_COUNT 3ULL
+#define NV_UCF_DST_COUNT 4ULL
+#define NV_UCF_FILTER_ID_MASK GENMASK_ULL(11, 0)
+#define NV_UCF_FILTER_SRC GENMASK_ULL(2, 0)
+#define NV_UCF_FILTER_DST GENMASK_ULL(11, 8)
+#define NV_UCF_FILTER_DEFAULT (NV_UCF_FILTER_SRC | NV_UCF_FILTER_DST)
+
+#define NV_PCIE_V2_PORT_COUNT 8ULL
+#define NV_PCIE_V2_FILTER_ID_MASK GENMASK_ULL(24, 0)
+#define NV_PCIE_V2_FILTER_PORT GENMASK_ULL(NV_PCIE_V2_PORT_COUNT - 1, 0)
+#define NV_PCIE_V2_FILTER_BDF_VAL GENMASK_ULL(23, NV_PCIE_V2_PORT_COUNT)
+#define NV_PCIE_V2_FILTER_BDF_EN BIT(24)
+#define NV_PCIE_V2_FILTER_BDF_VAL_EN GENMASK_ULL(24, NV_PCIE_V2_PORT_COUNT)
+#define NV_PCIE_V2_FILTER_DEFAULT NV_PCIE_V2_FILTER_PORT
+
+#define NV_PCIE_V2_DST_COUNT 5ULL
+#define NV_PCIE_V2_FILTER2_ID_MASK GENMASK_ULL(4, 0)
+#define NV_PCIE_V2_FILTER2_DST GENMASK_ULL(NV_PCIE_V2_DST_COUNT - 1, 0)
+#define NV_PCIE_V2_FILTER2_DEFAULT NV_PCIE_V2_FILTER2_DST
+
+#define NV_PCIE_TGT_PORT_COUNT 8ULL
+#define NV_PCIE_TGT_EV_TYPE_CC 0x4
+#define NV_PCIE_TGT_EV_TYPE_COUNT 3ULL
+#define NV_PCIE_TGT_EV_TYPE_MASK GENMASK_ULL(NV_PCIE_TGT_EV_TYPE_COUNT - 1, 0)
+#define NV_PCIE_TGT_FILTER2_MASK GENMASK_ULL(NV_PCIE_TGT_PORT_COUNT, 0)
+#define NV_PCIE_TGT_FILTER2_PORT GENMASK_ULL(NV_PCIE_TGT_PORT_COUNT - 1, 0)
+#define NV_PCIE_TGT_FILTER2_ADDR_EN BIT(NV_PCIE_TGT_PORT_COUNT)
+#define NV_PCIE_TGT_FILTER2_ADDR GENMASK_ULL(15, NV_PCIE_TGT_PORT_COUNT)
+#define NV_PCIE_TGT_FILTER2_DEFAULT NV_PCIE_TGT_FILTER2_PORT
+
+#define NV_PCIE_TGT_ADDR_COUNT 8ULL
+#define NV_PCIE_TGT_ADDR_STRIDE 20
+#define NV_PCIE_TGT_ADDR_CTRL 0xD38
+#define NV_PCIE_TGT_ADDR_BASE_LO 0xD3C
+#define NV_PCIE_TGT_ADDR_BASE_HI 0xD40
+#define NV_PCIE_TGT_ADDR_MASK_LO 0xD44
+#define NV_PCIE_TGT_ADDR_MASK_HI 0xD48
+
#define NV_GENERIC_FILTER_ID_MASK GENMASK_ULL(31, 0)
#define NV_PRODID_MASK (PMIIDR_PRODUCTID | PMIIDR_VARIANT | PMIIDR_REVISION)
@@ -124,6 +163,55 @@ static struct attribute *mcf_pmu_event_attrs[] = {
NULL,
};
+static struct attribute *ucf_pmu_event_attrs[] = {
+ ARM_CSPMU_EVENT_ATTR(bus_cycles, 0x1D),
+
+ ARM_CSPMU_EVENT_ATTR(slc_allocate, 0xF0),
+ ARM_CSPMU_EVENT_ATTR(slc_wb, 0xF3),
+ ARM_CSPMU_EVENT_ATTR(slc_refill_rd, 0x109),
+ ARM_CSPMU_EVENT_ATTR(slc_refill_wr, 0x10A),
+ ARM_CSPMU_EVENT_ATTR(slc_hit_rd, 0x119),
+
+ ARM_CSPMU_EVENT_ATTR(slc_access_dataless, 0x183),
+ ARM_CSPMU_EVENT_ATTR(slc_access_atomic, 0x184),
+
+ ARM_CSPMU_EVENT_ATTR(slc_access_rd, 0x111),
+ ARM_CSPMU_EVENT_ATTR(slc_access_wr, 0x112),
+ ARM_CSPMU_EVENT_ATTR(slc_bytes_rd, 0x113),
+ ARM_CSPMU_EVENT_ATTR(slc_bytes_wr, 0x114),
+
+ ARM_CSPMU_EVENT_ATTR(mem_access_rd, 0x121),
+ ARM_CSPMU_EVENT_ATTR(mem_access_wr, 0x122),
+ ARM_CSPMU_EVENT_ATTR(mem_bytes_rd, 0x123),
+ ARM_CSPMU_EVENT_ATTR(mem_bytes_wr, 0x124),
+
+ ARM_CSPMU_EVENT_ATTR(local_snoop, 0x180),
+ ARM_CSPMU_EVENT_ATTR(ext_snp_access, 0x181),
+ ARM_CSPMU_EVENT_ATTR(ext_snp_evict, 0x182),
+
+ ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT),
+ NULL
+};
+
+static struct attribute *pcie_v2_pmu_event_attrs[] = {
+ ARM_CSPMU_EVENT_ATTR(rd_bytes, 0x0),
+ ARM_CSPMU_EVENT_ATTR(wr_bytes, 0x1),
+ ARM_CSPMU_EVENT_ATTR(rd_req, 0x2),
+ ARM_CSPMU_EVENT_ATTR(wr_req, 0x3),
+ ARM_CSPMU_EVENT_ATTR(rd_cum_outs, 0x4),
+ ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT),
+ NULL
+};
+
+static struct attribute *pcie_tgt_pmu_event_attrs[] = {
+ ARM_CSPMU_EVENT_ATTR(rd_bytes, 0x0),
+ ARM_CSPMU_EVENT_ATTR(wr_bytes, 0x1),
+ ARM_CSPMU_EVENT_ATTR(rd_req, 0x2),
+ ARM_CSPMU_EVENT_ATTR(wr_req, 0x3),
+ ARM_CSPMU_EVENT_ATTR(cycles, NV_PCIE_TGT_EV_TYPE_CC),
+ NULL
+};
+
static struct attribute *generic_pmu_event_attrs[] = {
ARM_CSPMU_EVENT_ATTR(cycles, ARM_CSPMU_EVT_CYCLES_DEFAULT),
NULL,
@@ -152,6 +240,40 @@ static struct attribute *cnvlink_pmu_format_attrs[] = {
NULL,
};
+static struct attribute *ucf_pmu_format_attrs[] = {
+ ARM_CSPMU_FORMAT_EVENT_ATTR,
+ ARM_CSPMU_FORMAT_ATTR(src_loc_noncpu, "config1:0"),
+ ARM_CSPMU_FORMAT_ATTR(src_loc_cpu, "config1:1"),
+ ARM_CSPMU_FORMAT_ATTR(src_rem, "config1:2"),
+ ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config1:8"),
+ ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config1:9"),
+ ARM_CSPMU_FORMAT_ATTR(dst_loc_other, "config1:10"),
+ ARM_CSPMU_FORMAT_ATTR(dst_rem, "config1:11"),
+ NULL
+};
+
+static struct attribute *pcie_v2_pmu_format_attrs[] = {
+ ARM_CSPMU_FORMAT_EVENT_ATTR,
+ ARM_CSPMU_FORMAT_ATTR(src_rp_mask, "config1:0-7"),
+ ARM_CSPMU_FORMAT_ATTR(src_bdf, "config1:8-23"),
+ ARM_CSPMU_FORMAT_ATTR(src_bdf_en, "config1:24"),
+ ARM_CSPMU_FORMAT_ATTR(dst_loc_cmem, "config2:0"),
+ ARM_CSPMU_FORMAT_ATTR(dst_loc_gmem, "config2:1"),
+ ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_p2p, "config2:2"),
+ ARM_CSPMU_FORMAT_ATTR(dst_loc_pcie_cxl, "config2:3"),
+ ARM_CSPMU_FORMAT_ATTR(dst_rem, "config2:4"),
+ NULL
+};
+
+static struct attribute *pcie_tgt_pmu_format_attrs[] = {
+ ARM_CSPMU_FORMAT_ATTR(event, "config:0-2"),
+ ARM_CSPMU_FORMAT_ATTR(dst_rp_mask, "config:3-10"),
+ ARM_CSPMU_FORMAT_ATTR(dst_addr_en, "config:11"),
+ ARM_CSPMU_FORMAT_ATTR(dst_addr_base, "config1:0-63"),
+ ARM_CSPMU_FORMAT_ATTR(dst_addr_mask, "config2:0-63"),
+ NULL
+};
+
static struct attribute *generic_pmu_format_attrs[] = {
ARM_CSPMU_FORMAT_EVENT_ATTR,
ARM_CSPMU_FORMAT_FILTER_ATTR,
@@ -183,6 +305,32 @@ nv_cspmu_get_name(const struct arm_cspmu *cspmu)
return ctx->name;
}
+#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64)
+static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id)
+{
+ struct fwnode_handle *fwnode;
+ struct acpi_device *adev;
+ int ret;
+
+ adev = arm_cspmu_acpi_dev_get(cspmu);
+ if (!adev)
+ return -ENODEV;
+
+ fwnode = acpi_fwnode_handle(adev);
+ ret = fwnode_property_read_u32(fwnode, "instance_id", id);
+ if (ret)
+ dev_err(cspmu->dev, "Failed to get instance ID\n");
+
+ acpi_dev_put(adev);
+ return ret;
+}
+#else
+static int nv_cspmu_get_inst_id(const struct arm_cspmu *cspmu, u32 *id)
+{
+ return -EINVAL;
+}
+#endif
+
static u32 nv_cspmu_event_filter(const struct perf_event *event)
{
const struct nv_cspmu_ctx *ctx =
@@ -228,6 +376,20 @@ static void nv_cspmu_set_ev_filter(struct arm_cspmu *cspmu,
}
}
+static void nv_cspmu_reset_ev_filter(struct arm_cspmu *cspmu,
+ const struct perf_event *event)
+{
+ const struct nv_cspmu_ctx *ctx =
+ to_nv_cspmu_ctx(to_arm_cspmu(event->pmu));
+ const u32 offset = 4 * event->hw.idx;
+
+ if (ctx->get_filter)
+ writel(0, cspmu->base0 + PMEVFILTR + offset);
+
+ if (ctx->get_filter2)
+ writel(0, cspmu->base0 + PMEVFILT2R + offset);
+}
+
static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu,
const struct perf_event *event)
{
@@ -236,10 +398,386 @@ static void nv_cspmu_set_cc_filter(struct arm_cspmu *cspmu,
writel(filter, cspmu->base0 + PMCCFILTR);
}
+static u32 ucf_pmu_event_filter(const struct perf_event *event)
+{
+ u32 ret, filter, src, dst;
+
+ filter = nv_cspmu_event_filter(event);
+
+ /* Monitor all sources if none is selected. */
+ src = FIELD_GET(NV_UCF_FILTER_SRC, filter);
+ if (src == 0)
+ src = GENMASK_ULL(NV_UCF_SRC_COUNT - 1, 0);
+
+ /* Monitor all destinations if none is selected. */
+ dst = FIELD_GET(NV_UCF_FILTER_DST, filter);
+ if (dst == 0)
+ dst = GENMASK_ULL(NV_UCF_DST_COUNT - 1, 0);
+
+ ret = FIELD_PREP(NV_UCF_FILTER_SRC, src);
+ ret |= FIELD_PREP(NV_UCF_FILTER_DST, dst);
+
+ return ret;
+}
+
+static u32 pcie_v2_pmu_bdf_val_en(u32 filter)
+{
+ const u32 bdf_en = FIELD_GET(NV_PCIE_V2_FILTER_BDF_EN, filter);
+
+ /* Returns both BDF value and enable bit if BDF filtering is enabled. */
+ if (bdf_en)
+ return FIELD_GET(NV_PCIE_V2_FILTER_BDF_VAL_EN, filter);
+
+ /* Ignore the BDF value if BDF filter is not enabled. */
+ return 0;
+}
+
+static u32 pcie_v2_pmu_event_filter(const struct perf_event *event)
+{
+ u32 filter, lead_filter, lead_bdf;
+ struct perf_event *leader;
+ const struct nv_cspmu_ctx *ctx =
+ to_nv_cspmu_ctx(to_arm_cspmu(event->pmu));
+
+ filter = event->attr.config1 & ctx->filter_mask;
+ if (filter != 0)
+ return filter;
+
+ leader = event->group_leader;
+
+ /* Use leader's filter value if its BDF filtering is enabled. */
+ if (event != leader) {
+ lead_filter = pcie_v2_pmu_event_filter(leader);
+ lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter);
+ if (lead_bdf != 0)
+ return lead_filter;
+ }
+
+ /* Otherwise, return default filter value. */
+ return ctx->filter_default_val;
+}
+
+static int pcie_v2_pmu_validate_event(struct arm_cspmu *cspmu,
+ struct perf_event *new_ev)
+{
+ /*
+ * Make sure the events are using same BDF filter since the PCIE-SRC PMU
+ * only supports one common BDF filter setting for all of the counters.
+ */
+
+ int idx;
+ u32 new_filter, new_rp, new_bdf, new_lead_filter, new_lead_bdf;
+ struct perf_event *new_leader;
+
+ if (cspmu->impl.ops.is_cycle_counter_event(new_ev))
+ return 0;
+
+ new_leader = new_ev->group_leader;
+
+ new_filter = pcie_v2_pmu_event_filter(new_ev);
+ new_lead_filter = pcie_v2_pmu_event_filter(new_leader);
+
+ new_bdf = pcie_v2_pmu_bdf_val_en(new_filter);
+ new_lead_bdf = pcie_v2_pmu_bdf_val_en(new_lead_filter);
+
+ new_rp = FIELD_GET(NV_PCIE_V2_FILTER_PORT, new_filter);
+
+ if (new_rp != 0 && new_bdf != 0) {
+ dev_err(cspmu->dev,
+ "RP and BDF filtering are mutually exclusive\n");
+ return -EINVAL;
+ }
+
+ if (new_bdf != new_lead_bdf) {
+ dev_err(cspmu->dev,
+ "sibling and leader BDF value should be equal\n");
+ return -EINVAL;
+ }
+
+ /* Compare BDF filter on existing events. */
+ idx = find_first_bit(cspmu->hw_events.used_ctrs,
+ cspmu->cycle_counter_logical_idx);
+
+ if (idx != cspmu->cycle_counter_logical_idx) {
+ struct perf_event *leader = cspmu->hw_events.events[idx]->group_leader;
+
+ const u32 lead_filter = pcie_v2_pmu_event_filter(leader);
+ const u32 lead_bdf = pcie_v2_pmu_bdf_val_en(lead_filter);
+
+ if (new_lead_bdf != lead_bdf) {
+ dev_err(cspmu->dev, "only one BDF value is supported\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+struct pcie_tgt_addr_filter {
+ u32 refcount;
+ u64 base;
+ u64 mask;
+};
+
+struct pcie_tgt_data {
+ struct pcie_tgt_addr_filter addr_filter[NV_PCIE_TGT_ADDR_COUNT];
+ void __iomem *addr_filter_reg;
+};
+
+#if defined(CONFIG_ACPI) && defined(CONFIG_ARM64)
+static int pcie_tgt_init_data(struct arm_cspmu *cspmu)
+{
+ int ret;
+ struct acpi_device *adev;
+ struct pcie_tgt_data *data;
+ struct list_head resource_list;
+ struct resource_entry *rentry;
+ struct nv_cspmu_ctx *ctx = to_nv_cspmu_ctx(cspmu);
+ struct device *dev = cspmu->dev;
+
+ data = devm_kzalloc(dev, sizeof(struct pcie_tgt_data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ adev = arm_cspmu_acpi_dev_get(cspmu);
+ if (!adev) {
+ dev_err(dev, "failed to get associated PCIE-TGT device\n");
+ return -ENODEV;
+ }
+
+ INIT_LIST_HEAD(&resource_list);
+ ret = acpi_dev_get_memory_resources(adev, &resource_list);
+ if (ret < 0) {
+ dev_err(dev, "failed to get PCIE-TGT device memory resources\n");
+ acpi_dev_put(adev);
+ return ret;
+ }
+
+ rentry = list_first_entry_or_null(
+ &resource_list, struct resource_entry, node);
+ if (rentry) {
+ data->addr_filter_reg = devm_ioremap_resource(dev, rentry->res);
+ ret = 0;
+ }
+
+ if (IS_ERR(data->addr_filter_reg)) {
+ dev_err(dev, "failed to get address filter resource\n");
+ ret = PTR_ERR(data->addr_filter_reg);
+ }
+
+ acpi_dev_free_resource_list(&resource_list);
+ acpi_dev_put(adev);
+
+ ctx->data = data;
+
+ return ret;
+}
+#else
+static int pcie_tgt_init_data(struct arm_cspmu *cspmu)
+{
+ return -ENODEV;
+}
+#endif
+
+static struct pcie_tgt_data *pcie_tgt_get_data(struct arm_cspmu *cspmu)
+{
+ struct nv_cspmu_ctx *ctx = to_nv_cspmu_ctx(cspmu);
+
+ return ctx->data;
+}
+
+/* Find the first available address filter slot. */
+static int pcie_tgt_find_addr_idx(struct arm_cspmu *cspmu, u64 base, u64 mask,
+ bool is_reset)
+{
+ int i;
+ struct pcie_tgt_data *data = pcie_tgt_get_data(cspmu);
+
+ for (i = 0; i < NV_PCIE_TGT_ADDR_COUNT; i++) {
+ if (!is_reset && data->addr_filter[i].refcount == 0)
+ return i;
+
+ if (data->addr_filter[i].base == base &&
+ data->addr_filter[i].mask == mask)
+ return i;
+ }
+
+ return -ENODEV;
+}
+
+static u32 pcie_tgt_pmu_event_filter(const struct perf_event *event)
+{
+ u32 filter;
+
+ filter = (event->attr.config >> NV_PCIE_TGT_EV_TYPE_COUNT) &
+ NV_PCIE_TGT_FILTER2_MASK;
+
+ return filter;
+}
+
+static bool pcie_tgt_pmu_addr_en(const struct perf_event *event)
+{
+ u32 filter = pcie_tgt_pmu_event_filter(event);
+
+ return FIELD_GET(NV_PCIE_TGT_FILTER2_ADDR_EN, filter) != 0;
+}
+
+static u32 pcie_tgt_pmu_port_filter(const struct perf_event *event)
+{
+ u32 filter = pcie_tgt_pmu_event_filter(event);
+
+ return FIELD_GET(NV_PCIE_TGT_FILTER2_PORT, filter);
+}
+
+static u64 pcie_tgt_pmu_dst_addr_base(const struct perf_event *event)
+{
+ return event->attr.config1;
+}
+
+static u64 pcie_tgt_pmu_dst_addr_mask(const struct perf_event *event)
+{
+ return event->attr.config2;
+}
+
+static int pcie_tgt_pmu_validate_event(struct arm_cspmu *cspmu,
+ struct perf_event *new_ev)
+{
+ u64 base, mask;
+ int idx;
+
+ if (!pcie_tgt_pmu_addr_en(new_ev))
+ return 0;
+
+ /* Make sure there is a slot available for the address filter. */
+ base = pcie_tgt_pmu_dst_addr_base(new_ev);
+ mask = pcie_tgt_pmu_dst_addr_mask(new_ev);
+ idx = pcie_tgt_find_addr_idx(cspmu, base, mask, false);
+ if (idx < 0)
+ return -EINVAL;
+
+ return 0;
+}
+
+static void pcie_tgt_pmu_config_addr_filter(struct arm_cspmu *cspmu,
+ bool en, u64 base, u64 mask, int idx)
+{
+ struct pcie_tgt_data *data;
+ struct pcie_tgt_addr_filter *filter;
+ void __iomem *filter_reg;
+
+ data = pcie_tgt_get_data(cspmu);
+ filter = &data->addr_filter[idx];
+ filter_reg = data->addr_filter_reg + (idx * NV_PCIE_TGT_ADDR_STRIDE);
+
+ if (en) {
+ filter->refcount++;
+ if (filter->refcount == 1) {
+ filter->base = base;
+ filter->mask = mask;
+
+ writel(lower_32_bits(base), filter_reg + NV_PCIE_TGT_ADDR_BASE_LO);
+ writel(upper_32_bits(base), filter_reg + NV_PCIE_TGT_ADDR_BASE_HI);
+ writel(lower_32_bits(mask), filter_reg + NV_PCIE_TGT_ADDR_MASK_LO);
+ writel(upper_32_bits(mask), filter_reg + NV_PCIE_TGT_ADDR_MASK_HI);
+ writel(1, filter_reg + NV_PCIE_TGT_ADDR_CTRL);
+ }
+ } else {
+ filter->refcount--;
+ if (filter->refcount == 0) {
+ writel(0, filter_reg + NV_PCIE_TGT_ADDR_CTRL);
+ writel(0, filter_reg + NV_PCIE_TGT_ADDR_BASE_LO);
+ writel(0, filter_reg + NV_PCIE_TGT_ADDR_BASE_HI);
+ writel(0, filter_reg + NV_PCIE_TGT_ADDR_MASK_LO);
+ writel(0, filter_reg + NV_PCIE_TGT_ADDR_MASK_HI);
+
+ filter->base = 0;
+ filter->mask = 0;
+ }
+ }
+}
+
+static void pcie_tgt_pmu_set_ev_filter(struct arm_cspmu *cspmu,
+ const struct perf_event *event)
+{
+ bool addr_filter_en;
+ int idx;
+ u32 filter2_val, filter2_offset, port_filter;
+ u64 base, mask;
+
+ filter2_val = 0;
+ filter2_offset = PMEVFILT2R + (4 * event->hw.idx);
+
+ addr_filter_en = pcie_tgt_pmu_addr_en(event);
+ if (addr_filter_en) {
+ base = pcie_tgt_pmu_dst_addr_base(event);
+ mask = pcie_tgt_pmu_dst_addr_mask(event);
+ idx = pcie_tgt_find_addr_idx(cspmu, base, mask, false);
+
+ if (idx < 0) {
+ dev_err(cspmu->dev,
+ "Unable to find a slot for address filtering\n");
+ writel(0, cspmu->base0 + filter2_offset);
+ return;
+ }
+
+ /* Configure address range filter registers.*/
+ pcie_tgt_pmu_config_addr_filter(cspmu, true, base, mask, idx);
+
+ /* Config the counter to use the selected address filter slot. */
+ filter2_val |= FIELD_PREP(NV_PCIE_TGT_FILTER2_ADDR, 1U << idx);
+ }
+
+ port_filter = pcie_tgt_pmu_port_filter(event);
+
+ /* Monitor all ports if no filter is selected. */
+ if (!addr_filter_en && port_filter == 0)
+ port_filter = NV_PCIE_TGT_FILTER2_PORT;
+
+ filter2_val |= FIELD_PREP(NV_PCIE_TGT_FILTER2_PORT, port_filter);
+
+ writel(filter2_val, cspmu->base0 + filter2_offset);
+}
+
+static void pcie_tgt_pmu_reset_ev_filter(struct arm_cspmu *cspmu,
+ const struct perf_event *event)
+{
+ bool addr_filter_en;
+ u64 base, mask;
+ int idx;
+
+ addr_filter_en = pcie_tgt_pmu_addr_en(event);
+ if (!addr_filter_en)
+ return;
+
+ base = pcie_tgt_pmu_dst_addr_base(event);
+ mask = pcie_tgt_pmu_dst_addr_mask(event);
+ idx = pcie_tgt_find_addr_idx(cspmu, base, mask, true);
+
+ if (idx < 0) {
+ dev_err(cspmu->dev,
+ "Unable to find the address filter slot to reset\n");
+ return;
+ }
+
+ pcie_tgt_pmu_config_addr_filter(cspmu, false, base, mask, idx);
+}
+
+static u32 pcie_tgt_pmu_event_type(const struct perf_event *event)
+{
+ return event->attr.config & NV_PCIE_TGT_EV_TYPE_MASK;
+}
+
+static bool pcie_tgt_pmu_is_cycle_counter_event(const struct perf_event *event)
+{
+ u32 event_type = pcie_tgt_pmu_event_type(event);
+
+ return event_type == NV_PCIE_TGT_EV_TYPE_CC;
+}
enum nv_cspmu_name_fmt {
NAME_FMT_GENERIC,
- NAME_FMT_SOCKET
+ NAME_FMT_SOCKET,
+ NAME_FMT_SOCKET_INST,
};
struct nv_cspmu_match {
@@ -343,6 +881,63 @@ static const struct nv_cspmu_match nv_cspmu_match[] = {
},
},
{
+ .prodid = 0x2CF20000,
+ .prodid_mask = NV_PRODID_MASK,
+ .name_pattern = "nvidia_ucf_pmu_%u",
+ .name_fmt = NAME_FMT_SOCKET,
+ .template_ctx = {
+ .event_attr = ucf_pmu_event_attrs,
+ .format_attr = ucf_pmu_format_attrs,
+ .filter_mask = NV_UCF_FILTER_ID_MASK,
+ .filter_default_val = NV_UCF_FILTER_DEFAULT,
+ .filter2_mask = 0x0,
+ .filter2_default_val = 0x0,
+ .get_filter = ucf_pmu_event_filter,
+ },
+ },
+ {
+ .prodid = 0x10301000,
+ .prodid_mask = NV_PRODID_MASK,
+ .name_pattern = "nvidia_pcie_pmu_%u_rc_%u",
+ .name_fmt = NAME_FMT_SOCKET_INST,
+ .template_ctx = {
+ .event_attr = pcie_v2_pmu_event_attrs,
+ .format_attr = pcie_v2_pmu_format_attrs,
+ .filter_mask = NV_PCIE_V2_FILTER_ID_MASK,
+ .filter_default_val = NV_PCIE_V2_FILTER_DEFAULT,
+ .filter2_mask = NV_PCIE_V2_FILTER2_ID_MASK,
+ .filter2_default_val = NV_PCIE_V2_FILTER2_DEFAULT,
+ .get_filter = pcie_v2_pmu_event_filter,
+ .get_filter2 = nv_cspmu_event_filter2,
+ },
+ .ops = {
+ .validate_event = pcie_v2_pmu_validate_event,
+ .reset_ev_filter = nv_cspmu_reset_ev_filter,
+ }
+ },
+ {
+ .prodid = 0x10700000,
+ .prodid_mask = NV_PRODID_MASK,
+ .name_pattern = "nvidia_pcie_tgt_pmu_%u_rc_%u",
+ .name_fmt = NAME_FMT_SOCKET_INST,
+ .template_ctx = {
+ .event_attr = pcie_tgt_pmu_event_attrs,
+ .format_attr = pcie_tgt_pmu_format_attrs,
+ .filter_mask = 0x0,
+ .filter_default_val = 0x0,
+ .filter2_mask = NV_PCIE_TGT_FILTER2_MASK,
+ .filter2_default_val = NV_PCIE_TGT_FILTER2_DEFAULT,
+ .init_data = pcie_tgt_init_data
+ },
+ .ops = {
+ .is_cycle_counter_event = pcie_tgt_pmu_is_cycle_counter_event,
+ .event_type = pcie_tgt_pmu_event_type,
+ .validate_event = pcie_tgt_pmu_validate_event,
+ .set_ev_filter = pcie_tgt_pmu_set_ev_filter,
+ .reset_ev_filter = pcie_tgt_pmu_reset_ev_filter,
+ }
+ },
+ {
.prodid = 0,
.prodid_mask = 0,
.name_pattern = "nvidia_uncore_pmu_%u",
@@ -365,7 +960,7 @@ static const struct nv_cspmu_match nv_cspmu_match[] = {
static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu,
const struct nv_cspmu_match *match)
{
- char *name;
+ char *name = NULL;
struct device *dev = cspmu->dev;
static atomic_t pmu_generic_idx = {0};
@@ -379,13 +974,20 @@ static char *nv_cspmu_format_name(const struct arm_cspmu *cspmu,
socket);
break;
}
+ case NAME_FMT_SOCKET_INST: {
+ const int cpu = cpumask_first(&cspmu->associated_cpus);
+ const int socket = cpu_to_node(cpu);
+ u32 inst_id;
+
+ if (!nv_cspmu_get_inst_id(cspmu, &inst_id))
+ name = devm_kasprintf(dev, GFP_KERNEL,
+ match->name_pattern, socket, inst_id);
+ break;
+ }
case NAME_FMT_GENERIC:
name = devm_kasprintf(dev, GFP_KERNEL, match->name_pattern,
atomic_fetch_inc(&pmu_generic_idx));
break;
- default:
- name = NULL;
- break;
}
return name;
@@ -426,8 +1028,12 @@ static int nv_cspmu_init_ops(struct arm_cspmu *cspmu)
cspmu->impl.ctx = ctx;
/* NVIDIA specific callbacks. */
+ SET_OP(validate_event, impl_ops, match, NULL);
+ SET_OP(event_type, impl_ops, match, NULL);
+ SET_OP(is_cycle_counter_event, impl_ops, match, NULL);
SET_OP(set_cc_filter, impl_ops, match, nv_cspmu_set_cc_filter);
SET_OP(set_ev_filter, impl_ops, match, nv_cspmu_set_ev_filter);
+ SET_OP(reset_ev_filter, impl_ops, match, NULL);
SET_OP(get_event_attrs, impl_ops, match, nv_cspmu_get_event_attrs);
SET_OP(get_format_attrs, impl_ops, match, nv_cspmu_get_format_attrs);
SET_OP(get_name, impl_ops, match, nv_cspmu_get_name);
diff --git a/drivers/perf/nvidia_t410_c2c_pmu.c b/drivers/perf/nvidia_t410_c2c_pmu.c
new file mode 100644
index 000000000000..411987153ff3
--- /dev/null
+++ b/drivers/perf/nvidia_t410_c2c_pmu.c
@@ -0,0 +1,1051 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVIDIA Tegra410 C2C PMU driver.
+ *
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#include <linux/acpi.h>
+#include <linux/bitops.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+#include <linux/property.h>
+
+/* The C2C interface types in Tegra410. */
+#define C2C_TYPE_NVLINK 0x0
+#define C2C_TYPE_NVCLINK 0x1
+#define C2C_TYPE_NVDLINK 0x2
+#define C2C_TYPE_COUNT 0x3
+
+/* The type of the peer device connected to the C2C interface. */
+#define C2C_PEER_TYPE_CPU 0x0
+#define C2C_PEER_TYPE_GPU 0x1
+#define C2C_PEER_TYPE_CXLMEM 0x2
+#define C2C_PEER_TYPE_COUNT 0x3
+
+/* The number of peer devices can be connected to the C2C interface. */
+#define C2C_NR_PEER_CPU 0x1
+#define C2C_NR_PEER_GPU 0x2
+#define C2C_NR_PEER_CXLMEM 0x1
+#define C2C_NR_PEER_MAX 0x2
+
+/* Number of instances on each interface. */
+#define C2C_NR_INST_NVLINK 14
+#define C2C_NR_INST_NVCLINK 12
+#define C2C_NR_INST_NVDLINK 16
+#define C2C_NR_INST_MAX 16
+
+/* Register offsets. */
+#define C2C_CTRL 0x864
+#define C2C_IN_STATUS 0x868
+#define C2C_CYCLE_CNTR 0x86c
+#define C2C_IN_RD_CUM_OUTS_CNTR 0x874
+#define C2C_IN_RD_REQ_CNTR 0x87c
+#define C2C_IN_WR_CUM_OUTS_CNTR 0x884
+#define C2C_IN_WR_REQ_CNTR 0x88c
+#define C2C_OUT_STATUS 0x890
+#define C2C_OUT_RD_CUM_OUTS_CNTR 0x898
+#define C2C_OUT_RD_REQ_CNTR 0x8a0
+#define C2C_OUT_WR_CUM_OUTS_CNTR 0x8a8
+#define C2C_OUT_WR_REQ_CNTR 0x8b0
+
+/* C2C_IN_STATUS register field. */
+#define C2C_IN_STATUS_CYCLE_OVF BIT(0)
+#define C2C_IN_STATUS_IN_RD_CUM_OUTS_OVF BIT(1)
+#define C2C_IN_STATUS_IN_RD_REQ_OVF BIT(2)
+#define C2C_IN_STATUS_IN_WR_CUM_OUTS_OVF BIT(3)
+#define C2C_IN_STATUS_IN_WR_REQ_OVF BIT(4)
+
+/* C2C_OUT_STATUS register field. */
+#define C2C_OUT_STATUS_OUT_RD_CUM_OUTS_OVF BIT(0)
+#define C2C_OUT_STATUS_OUT_RD_REQ_OVF BIT(1)
+#define C2C_OUT_STATUS_OUT_WR_CUM_OUTS_OVF BIT(2)
+#define C2C_OUT_STATUS_OUT_WR_REQ_OVF BIT(3)
+
+/* Events. */
+#define C2C_EVENT_CYCLES 0x0
+#define C2C_EVENT_IN_RD_CUM_OUTS 0x1
+#define C2C_EVENT_IN_RD_REQ 0x2
+#define C2C_EVENT_IN_WR_CUM_OUTS 0x3
+#define C2C_EVENT_IN_WR_REQ 0x4
+#define C2C_EVENT_OUT_RD_CUM_OUTS 0x5
+#define C2C_EVENT_OUT_RD_REQ 0x6
+#define C2C_EVENT_OUT_WR_CUM_OUTS 0x7
+#define C2C_EVENT_OUT_WR_REQ 0x8
+
+#define C2C_NUM_EVENTS 0x9
+#define C2C_MASK_EVENT 0xFF
+#define C2C_MAX_ACTIVE_EVENTS 32
+
+#define C2C_ACTIVE_CPU_MASK 0x0
+#define C2C_ASSOCIATED_CPU_MASK 0x1
+
+/*
+ * Maximum poll count for reading counter value using high-low-high sequence.
+ */
+#define HILOHI_MAX_POLL 1000
+
+static unsigned long nv_c2c_pmu_cpuhp_state;
+
+/* PMU descriptor. */
+
+/* C2C type information. */
+struct nv_c2c_pmu_data {
+	unsigned int c2c_type;		/* C2C_TYPE_* interface flavour */
+	unsigned int nr_inst;		/* number of HW instances of this type */
+	const char *name_fmt;		/* PMU name template, takes socket id */
+};
+
+/* Per-interface-type static description, indexed by C2C_TYPE_*. */
+static const struct nv_c2c_pmu_data nv_c2c_pmu_data[] = {
+	[C2C_TYPE_NVLINK] = {
+		.c2c_type = C2C_TYPE_NVLINK,
+		.nr_inst = C2C_NR_INST_NVLINK,
+		.name_fmt = "nvidia_nvlink_c2c_pmu_%u",
+	},
+	[C2C_TYPE_NVCLINK] = {
+		.c2c_type = C2C_TYPE_NVCLINK,
+		.nr_inst = C2C_NR_INST_NVCLINK,
+		.name_fmt = "nvidia_nvclink_pmu_%u",
+	},
+	[C2C_TYPE_NVDLINK] = {
+		.c2c_type = C2C_TYPE_NVDLINK,
+		.nr_inst = C2C_NR_INST_NVDLINK,
+		.name_fmt = "nvidia_nvdlink_pmu_%u",
+	},
+};
+
+/* Tracks the events assigned to the PMU for a given logical index. */
+struct nv_c2c_pmu_hw_events {
+	/* The events that are active. */
+	struct perf_event *events[C2C_MAX_ACTIVE_EVENTS];
+
+	/*
+	 * Each bit indicates a logical counter is being used (or not) for an
+	 * event.
+	 */
+	DECLARE_BITMAP(used_ctrs, C2C_MAX_ACTIVE_EVENTS);
+};
+
+/* Per-device driver state; one instance per probed C2C PMU device. */
+struct nv_c2c_pmu {
+	struct pmu pmu;
+	struct device *dev;
+	struct acpi_device *acpi_dev;
+
+	const char *name;		/* registered perf PMU name */
+	const char *identifier;		/* ACPI HID, exported via sysfs */
+
+	const struct nv_c2c_pmu_data *data;	/* interface-type description */
+	unsigned int peer_type;		/* C2C_PEER_TYPE_* on the far end */
+	unsigned int socket;		/* socket id parsed from ACPI UID */
+	unsigned int nr_peer;		/* number of connected peer devices */
+	/* Per-peer bitmap of the HW instances wired to that peer. */
+	unsigned long peer_insts[C2C_NR_PEER_MAX][BITS_TO_LONGS(C2C_NR_INST_MAX)];
+	u32 filter_default;		/* all-peers filter mask (config1 default) */
+
+	struct nv_c2c_pmu_hw_events hw_events;
+
+	cpumask_t associated_cpus;	/* CPUs on this PMU's socket */
+	cpumask_t active_cpu;		/* single CPU that owns event handling */
+
+	struct hlist_node cpuhp_node;	/* CPU hotplug instance linkage */
+
+	const struct attribute_group **attr_groups;
+
+	void __iomem *base_broadcast;	/* writes fan out to all instances */
+	void __iomem *base[C2C_NR_INST_MAX];	/* per-instance MMIO bases */
+};
+
+#define to_c2c_pmu(p) (container_of(p, struct nv_c2c_pmu, pmu))
+
+/* Get event type from perf_event. */
+static inline u32 get_event_type(struct perf_event *event)
+{
+	return (event->attr.config) & C2C_MASK_EVENT;
+}
+
+/*
+ * Extract the peer-filter bits from attr.config1. Bits outside
+ * filter_default are discarded; an empty filter means "all peers".
+ */
+static inline u32 get_filter_mask(struct perf_event *event)
+{
+	u32 filter;
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu);
+
+	filter = ((u32)event->attr.config1) & c2c_pmu->filter_default;
+	if (filter == 0)
+		filter = c2c_pmu->filter_default;
+
+	return filter;
+}
+
+/* PMU operations. */
+
+/*
+ * Claim the lowest free logical counter slot for @event.
+ * Returns the slot index, or -EAGAIN when all slots are in use.
+ */
+static int nv_c2c_pmu_get_event_idx(struct nv_c2c_pmu_hw_events *hw_events,
+				    struct perf_event *event)
+{
+	u32 ctr = find_first_zero_bit(hw_events->used_ctrs,
+				      C2C_MAX_ACTIVE_EVENTS);
+
+	if (ctr >= C2C_MAX_ACTIVE_EVENTS)
+		return -EAGAIN;
+
+	set_bit(ctr, hw_events->used_ctrs);
+	return ctr;
+}
+
+/*
+ * Check whether @event can be scheduled on @pmu together with the events
+ * already accounted in @hw_events (a scratch counter map).
+ */
+static bool
+nv_c2c_pmu_validate_event(struct pmu *pmu,
+			  struct nv_c2c_pmu_hw_events *hw_events,
+			  struct perf_event *event)
+{
+	if (!is_software_event(event)) {
+		/* Reject groups spanning multiple HW PMUs. */
+		if (event->pmu != pmu)
+			return false;
+
+		/* Schedulable iff a counter slot is still free. */
+		return nv_c2c_pmu_get_event_idx(hw_events, event) >= 0;
+	}
+
+	/* Software events never compete for HW counters. */
+	return true;
+}
+
+/*
+ * Make sure the group of events can be scheduled at once
+ * on the PMU.
+ */
+static bool nv_c2c_pmu_validate_group(struct perf_event *event)
+{
+	struct perf_event *sibling, *leader = event->group_leader;
+	struct nv_c2c_pmu_hw_events fake_hw_events;
+
+	/* A lone event (its own leader) always fits. */
+	if (event->group_leader == event)
+		return true;
+
+	/* Dry-run allocation on a zeroed scratch counter map. */
+	memset(&fake_hw_events, 0, sizeof(fake_hw_events));
+
+	if (!nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, leader))
+		return false;
+
+	for_each_sibling_event(sibling, leader) {
+		if (!nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events,
+					       sibling))
+			return false;
+	}
+
+	/* Finally, the new event itself must still find a free slot. */
+	return nv_c2c_pmu_validate_event(event->pmu, &fake_hw_events, event);
+}
+
+/*
+ * perf core callback: validate and set up a newly opened event.
+ * Returns -ENOENT for events not owned by this PMU (so the core can try
+ * other PMUs); other errors are final.
+ */
+static int nv_c2c_pmu_event_init(struct perf_event *event)
+{
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u32 event_type = get_event_type(event);
+
+	if (event->attr.type != event->pmu->type ||
+	    event_type >= C2C_NUM_EVENTS)
+		return -ENOENT;
+
+	/*
+	 * Following other "uncore" PMUs, we do not support sampling mode or
+	 * attach to a task (per-process mode).
+	 */
+	if (is_sampling_event(event)) {
+		dev_dbg(c2c_pmu->pmu.dev, "Can't support sampling events\n");
+		return -EOPNOTSUPP;
+	}
+
+	if (event->cpu < 0 || event->attach_state & PERF_ATTACH_TASK) {
+		dev_dbg(c2c_pmu->pmu.dev, "Can't support per-task counters\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Make sure the CPU assignment is on one of the CPUs associated with
+	 * this PMU.
+	 */
+	if (!cpumask_test_cpu(event->cpu, &c2c_pmu->associated_cpus)) {
+		dev_dbg(c2c_pmu->pmu.dev,
+			"Requested cpu is not associated with the PMU\n");
+		return -EINVAL;
+	}
+
+	/* Enforce the current active CPU to handle the events in this PMU. */
+	event->cpu = cpumask_first(&c2c_pmu->active_cpu);
+	if (event->cpu >= nr_cpu_ids)
+		return -EINVAL;
+
+	if (!nv_c2c_pmu_validate_group(event))
+		return -EINVAL;
+
+	/* No counter assigned yet; "add" picks a logical slot later. */
+	hwc->idx = -1;
+	hwc->config = event_type;
+
+	return 0;
+}
+
+/*
+ * Read 64-bit register as a pair of 32-bit registers using hi-lo-hi sequence.
+ */
+/*
+ * Read a 64-bit counter exposed as two 32-bit registers, using a
+ * high-low-high sequence so a carry between the two halves cannot
+ * produce a torn value. Returns 0 on poll timeout.
+ */
+static u64 read_reg64_hilohi(const void __iomem *addr, u32 max_poll_count)
+{
+	u32 hi, lo;
+
+	for (;;) {
+		if (max_poll_count-- == 0) {
+			pr_err("NV C2C PMU: timeout hi-low-high sequence\n");
+			return 0;
+		}
+
+		hi = readl(addr + 4);
+		lo = readl(addr);
+
+		/* Stable only if the high half did not change meanwhile. */
+		if (readl(addr + 4) == hi)
+			break;
+	}
+
+	return ((u64)hi << 32) | lo;
+}
+
+/* Warn (once per read) if an instance reports counter overflow. */
+static void nv_c2c_pmu_check_status(struct nv_c2c_pmu *c2c_pmu, u32 instance)
+{
+	void __iomem *base = c2c_pmu->base[instance];
+	u32 in_status = readl(base + C2C_IN_STATUS);
+	u32 out_status = readl(base + C2C_OUT_STATUS);
+
+	if (!in_status && !out_status)
+		return;
+
+	dev_warn(c2c_pmu->dev,
+		 "C2C PMU overflow in: 0x%x, out: 0x%x\n",
+		 in_status, out_status);
+}
+
+/*
+ * MMIO offset of the 64-bit counter register for each event id.
+ * Pure lookup table, never written at runtime, hence const (.rodata).
+ */
+static const u32 nv_c2c_ctr_offset[C2C_NUM_EVENTS] = {
+	[C2C_EVENT_CYCLES] = C2C_CYCLE_CNTR,
+	[C2C_EVENT_IN_RD_CUM_OUTS] = C2C_IN_RD_CUM_OUTS_CNTR,
+	[C2C_EVENT_IN_RD_REQ] = C2C_IN_RD_REQ_CNTR,
+	[C2C_EVENT_IN_WR_CUM_OUTS] = C2C_IN_WR_CUM_OUTS_CNTR,
+	[C2C_EVENT_IN_WR_REQ] = C2C_IN_WR_REQ_CNTR,
+	[C2C_EVENT_OUT_RD_CUM_OUTS] = C2C_OUT_RD_CUM_OUTS_CNTR,
+	[C2C_EVENT_OUT_RD_REQ] = C2C_OUT_RD_REQ_CNTR,
+	[C2C_EVENT_OUT_WR_CUM_OUTS] = C2C_OUT_WR_CUM_OUTS_CNTR,
+	[C2C_EVENT_OUT_WR_REQ] = C2C_OUT_WR_REQ_CNTR,
+};
+
+/*
+ * Read the current value of @event's counter: for cycles, one instance
+ * suffices (shared clock); for other events, sum the matching counter
+ * across every instance wired to the peers selected by the event filter.
+ */
+static u64 nv_c2c_pmu_read_counter(struct perf_event *event)
+{
+	u32 ctr_id, ctr_offset, filter_mask, filter_idx, inst_idx;
+	unsigned long *inst_mask;
+	DECLARE_BITMAP(filter_bitmap, C2C_NR_PEER_MAX);
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu);
+	u64 val = 0;
+
+	/* One filter bit per peer device (see get_filter_mask()). */
+	filter_mask = get_filter_mask(event);
+	bitmap_from_arr32(filter_bitmap, &filter_mask, c2c_pmu->nr_peer);
+
+	ctr_id = event->hw.config;
+	ctr_offset = nv_c2c_ctr_offset[ctr_id];
+
+	for_each_set_bit(filter_idx, filter_bitmap, c2c_pmu->nr_peer) {
+		inst_mask = c2c_pmu->peer_insts[filter_idx];
+		for_each_set_bit(inst_idx, inst_mask, c2c_pmu->data->nr_inst) {
+			/* Log (but don't fail on) counter overflow. */
+			nv_c2c_pmu_check_status(c2c_pmu, inst_idx);
+
+			/*
+			 * Each instance share same clock and the driver always
+			 * enables all instances. So we can use the counts from
+			 * one instance for cycle counter.
+			 */
+			if (ctr_id == C2C_EVENT_CYCLES)
+				return read_reg64_hilohi(
+					c2c_pmu->base[inst_idx] + ctr_offset,
+					HILOHI_MAX_POLL);
+
+			/*
+			 * For other events, sum up the counts from all instances.
+			 */
+			val += read_reg64_hilohi(
+				c2c_pmu->base[inst_idx] + ctr_offset,
+				HILOHI_MAX_POLL);
+		}
+	}
+
+	return val;
+}
+
+/*
+ * Fold the HW counter delta since the last read into event->count.
+ * The cmpxchg loop makes concurrent updaters agree on a single
+ * prev_count -> now transition so no delta is counted twice.
+ */
+static void nv_c2c_pmu_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 prev, now;
+
+	do {
+		prev = local64_read(&hwc->prev_count);
+		now = nv_c2c_pmu_read_counter(event);
+	} while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
+
+	local64_add(now - prev, &event->count);
+}
+
+/*
+ * start/stop only track the perf state flags: the counters free-run and
+ * are globally gated via pmu_enable()/pmu_disable().
+ */
+static void nv_c2c_pmu_start(struct perf_event *event, int pmu_flags)
+{
+	event->hw.state = 0;
+}
+
+static void nv_c2c_pmu_stop(struct perf_event *event, int pmu_flags)
+{
+	event->hw.state |= PERF_HES_STOPPED;
+}
+
+/*
+ * perf core callback: bind @event to a free logical counter slot.
+ * Must run on a CPU associated with this PMU's socket.
+ */
+static int nv_c2c_pmu_add(struct perf_event *event, int flags)
+{
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu);
+	struct nv_c2c_pmu_hw_events *hw_events = &c2c_pmu->hw_events;
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+
+	if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(),
+					   &c2c_pmu->associated_cpus)))
+		return -ENOENT;
+
+	idx = nv_c2c_pmu_get_event_idx(hw_events, event);
+	if (idx < 0)
+		return idx;
+
+	hw_events->events[idx] = event;
+	hwc->idx = idx;
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	if (flags & PERF_EF_START)
+		nv_c2c_pmu_start(event, PERF_EF_RELOAD);
+
+	/* Propagate changes to the userspace mapping. */
+	perf_event_update_userpage(event);
+
+	return 0;
+}
+
+/* perf core callback: release @event's logical counter slot. */
+static void nv_c2c_pmu_del(struct perf_event *event, int flags)
+{
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(event->pmu);
+	struct nv_c2c_pmu_hw_events *hw_events = &c2c_pmu->hw_events;
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	nv_c2c_pmu_stop(event, PERF_EF_UPDATE);
+
+	hw_events->events[idx] = NULL;
+
+	clear_bit(idx, hw_events->used_ctrs);
+
+	perf_event_update_userpage(event);
+}
+
+static void nv_c2c_pmu_read(struct perf_event *event)
+{
+	nv_c2c_pmu_event_update(event);
+}
+
+/*
+ * perf core callback: (re-)enable counting. A single broadcast write
+ * starts the counters of every instance at once.
+ */
+static void nv_c2c_pmu_enable(struct pmu *pmu)
+{
+	void __iomem *bcast;
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu);
+
+	/* Nothing to do unless at least one counter slot is in use. */
+	if (bitmap_empty(c2c_pmu->hw_events.used_ctrs, C2C_MAX_ACTIVE_EVENTS))
+		return;
+
+	/* Enable all the counters. */
+	bcast = c2c_pmu->base_broadcast;
+	writel(0x1UL, bcast + C2C_CTRL);
+}
+
+/*
+ * perf core callback: stop counting on all instances and fold the
+ * accumulated counts into the events before the HW counters reset.
+ */
+static void nv_c2c_pmu_disable(struct pmu *pmu)
+{
+	unsigned int idx;
+	void __iomem *bcast;
+	struct perf_event *event;
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu);
+
+	/* Disable all the counters. */
+	bcast = c2c_pmu->base_broadcast;
+	writel(0x0UL, bcast + C2C_CTRL);
+
+	/*
+	 * The counters will start from 0 again on restart.
+	 * Update the events immediately to avoid losing the counts.
+	 */
+	for_each_set_bit(idx, c2c_pmu->hw_events.used_ctrs,
+			 C2C_MAX_ACTIVE_EVENTS) {
+		event = c2c_pmu->hw_events.events[idx];
+
+		if (!event)
+			continue;
+
+		nv_c2c_pmu_event_update(event);
+
+		/* HW restarts from zero, so the baseline must too. */
+		local64_set(&event->hw.prev_count, 0ULL);
+	}
+}
+
+/* PMU identifier attribute. */
+
+/* sysfs "identifier" attribute: expose the ACPI HID of the device. */
+static ssize_t nv_c2c_pmu_identifier_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *page)
+{
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(page, "%s\n", c2c_pmu->identifier);
+}
+
+static struct device_attribute nv_c2c_pmu_identifier_attr =
+	__ATTR(identifier, 0444, nv_c2c_pmu_identifier_show, NULL);
+
+static struct attribute *nv_c2c_pmu_identifier_attrs[] = {
+	&nv_c2c_pmu_identifier_attr.attr,
+	NULL,
+};
+
+static struct attribute_group nv_c2c_pmu_identifier_attr_group = {
+	.attrs = nv_c2c_pmu_identifier_attrs,
+};
+
+/* Peer attribute. */
+
+/*
+ * sysfs "peer" attribute: report the peer device type and count, e.g.
+ * "nr_gpu=2". The name table is static const so it lives in .rodata
+ * instead of being rebuilt on the stack for every read.
+ */
+static ssize_t nv_c2c_pmu_peer_show(struct device *dev,
+				    struct device_attribute *attr,
+				    char *page)
+{
+	static const char * const peer_type[C2C_PEER_TYPE_COUNT] = {
+		[C2C_PEER_TYPE_CPU] = "cpu",
+		[C2C_PEER_TYPE_GPU] = "gpu",
+		[C2C_PEER_TYPE_CXLMEM] = "cxlmem",
+	};
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(page, "nr_%s=%u\n", peer_type[c2c_pmu->peer_type],
+			  c2c_pmu->nr_peer);
+}
+
+/* Wire the "peer" attribute into a sysfs group. */
+static struct device_attribute nv_c2c_pmu_peer_attr =
+	__ATTR(peer, 0444, nv_c2c_pmu_peer_show, NULL);
+
+static struct attribute *nv_c2c_pmu_peer_attrs[] = {
+	&nv_c2c_pmu_peer_attr.attr,
+	NULL,
+};
+
+static struct attribute_group nv_c2c_pmu_peer_attr_group = {
+	.attrs = nv_c2c_pmu_peer_attrs,
+};
+
+/* Format attributes. */
+
+/* Build an anonymous dev_ext_attribute and return its plain attribute. */
+#define NV_C2C_PMU_EXT_ATTR(_name, _func, _config)			\
+	(&((struct dev_ext_attribute[]){				\
+		{							\
+			.attr = __ATTR(_name, 0444, _func, NULL),	\
+			.var = (void *)_config				\
+		}							\
+	})[0].attr.attr)
+
+#define NV_C2C_PMU_FORMAT_ATTR(_name, _config)				\
+	NV_C2C_PMU_EXT_ATTR(_name, device_show_string, _config)
+
+/* Event id lives in config bits [3:0] for every flavour. */
+#define NV_C2C_PMU_FORMAT_EVENT_ATTR					\
+	NV_C2C_PMU_FORMAT_ATTR(event, "config:0-3")
+
+/* GPU-connected PMUs additionally take a per-GPU filter in config1. */
+static struct attribute *nv_c2c_pmu_gpu_formats[] = {
+	NV_C2C_PMU_FORMAT_EVENT_ATTR,
+	NV_C2C_PMU_FORMAT_ATTR(gpu_mask, "config1:0-1"),
+	NULL,
+};
+
+static const struct attribute_group nv_c2c_pmu_gpu_format_group = {
+	.name = "format",
+	.attrs = nv_c2c_pmu_gpu_formats,
+};
+
+static struct attribute *nv_c2c_pmu_formats[] = {
+	NV_C2C_PMU_FORMAT_EVENT_ATTR,
+	NULL,
+};
+
+static const struct attribute_group nv_c2c_pmu_format_group = {
+	.name = "format",
+	.attrs = nv_c2c_pmu_formats,
+};
+
+/* Event attributes. */
+
+/* sysfs event attribute: print the event encoding, e.g. "event=0x1". */
+static ssize_t nv_c2c_pmu_sysfs_event_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct perf_pmu_events_attr *pmu_attr =
+		container_of(attr, typeof(*pmu_attr), attr);
+
+	return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id);
+}
+
+#define NV_C2C_PMU_EVENT_ATTR(_name, _config)				\
+	PMU_EVENT_ATTR_ID(_name, nv_c2c_pmu_sysfs_event_show, _config)
+
+/* GPU peers expose the full in/out read/write event set. */
+static struct attribute *nv_c2c_pmu_gpu_events[] = {
+	NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES),
+	NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ),
+	NV_C2C_PMU_EVENT_ATTR(in_wr_cum_outs, C2C_EVENT_IN_WR_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(in_wr_req, C2C_EVENT_IN_WR_REQ),
+	NV_C2C_PMU_EVENT_ATTR(out_rd_cum_outs, C2C_EVENT_OUT_RD_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(out_rd_req, C2C_EVENT_OUT_RD_REQ),
+	NV_C2C_PMU_EVENT_ATTR(out_wr_cum_outs, C2C_EVENT_OUT_WR_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(out_wr_req, C2C_EVENT_OUT_WR_REQ),
+	NULL
+};
+
+static const struct attribute_group nv_c2c_pmu_gpu_events_group = {
+	.name = "events",
+	.attrs = nv_c2c_pmu_gpu_events,
+};
+
+/* CPU peers expose a read-only subset of the events. */
+static struct attribute *nv_c2c_pmu_cpu_events[] = {
+	NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES),
+	NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ),
+	NV_C2C_PMU_EVENT_ATTR(out_rd_cum_outs, C2C_EVENT_OUT_RD_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(out_rd_req, C2C_EVENT_OUT_RD_REQ),
+	NULL
+};
+
+static const struct attribute_group nv_c2c_pmu_cpu_events_group = {
+	.name = "events",
+	.attrs = nv_c2c_pmu_cpu_events,
+};
+
+/* CXL memory peers only count inbound reads (plus cycles). */
+static struct attribute *nv_c2c_pmu_cxlmem_events[] = {
+	NV_C2C_PMU_EVENT_ATTR(cycles, C2C_EVENT_CYCLES),
+	NV_C2C_PMU_EVENT_ATTR(in_rd_cum_outs, C2C_EVENT_IN_RD_CUM_OUTS),
+	NV_C2C_PMU_EVENT_ATTR(in_rd_req, C2C_EVENT_IN_RD_REQ),
+	NULL
+};
+
+static const struct attribute_group nv_c2c_pmu_cxlmem_events_group = {
+	.name = "events",
+	.attrs = nv_c2c_pmu_cxlmem_events,
+};
+
+/* Cpumask attributes. */
+
+/*
+ * Shared show routine for the "cpumask" (active CPU) and
+ * "associated_cpus" (socket CPUs) sysfs attributes; the attribute's
+ * ext var selects which mask to print.
+ */
+static ssize_t nv_c2c_pmu_cpumask_show(struct device *dev,
+				       struct device_attribute *attr, char *buf)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	struct nv_c2c_pmu *c2c_pmu = to_c2c_pmu(pmu);
+	struct dev_ext_attribute *eattr =
+		container_of(attr, struct dev_ext_attribute, attr);
+	unsigned long mask_id = (unsigned long)eattr->var;
+	const cpumask_t *cpumask;
+
+	switch (mask_id) {
+	case C2C_ACTIVE_CPU_MASK:
+		cpumask = &c2c_pmu->active_cpu;
+		break;
+	case C2C_ASSOCIATED_CPU_MASK:
+		cpumask = &c2c_pmu->associated_cpus;
+		break;
+	default:
+		return 0;
+	}
+	return cpumap_print_to_pagebuf(true, buf, cpumask);
+}
+
+#define NV_C2C_PMU_CPUMASK_ATTR(_name, _config)				\
+	NV_C2C_PMU_EXT_ATTR(_name, nv_c2c_pmu_cpumask_show,		\
+			    (unsigned long)_config)
+
+static struct attribute *nv_c2c_pmu_cpumask_attrs[] = {
+	NV_C2C_PMU_CPUMASK_ATTR(cpumask, C2C_ACTIVE_CPU_MASK),
+	NV_C2C_PMU_CPUMASK_ATTR(associated_cpus, C2C_ASSOCIATED_CPU_MASK),
+	NULL,
+};
+
+static const struct attribute_group nv_c2c_pmu_cpumask_attr_group = {
+	.attrs = nv_c2c_pmu_cpumask_attrs,
+};
+
+/* Attribute groups for C2C PMU connecting SoC and GPU */
+static const struct attribute_group *nv_c2c_pmu_gpu_attr_groups[] = {
+	&nv_c2c_pmu_gpu_format_group,
+	&nv_c2c_pmu_gpu_events_group,
+	&nv_c2c_pmu_cpumask_attr_group,
+	&nv_c2c_pmu_identifier_attr_group,
+	&nv_c2c_pmu_peer_attr_group,
+	NULL
+};
+
+/* Attribute groups for C2C PMU connecting multiple SoCs */
+static const struct attribute_group *nv_c2c_pmu_cpu_attr_groups[] = {
+	&nv_c2c_pmu_format_group,
+	&nv_c2c_pmu_cpu_events_group,
+	&nv_c2c_pmu_cpumask_attr_group,
+	&nv_c2c_pmu_identifier_attr_group,
+	&nv_c2c_pmu_peer_attr_group,
+	NULL
+};
+
+/* Attribute groups for C2C PMU connecting SoC and CXLMEM */
+static const struct attribute_group *nv_c2c_pmu_cxlmem_attr_groups[] = {
+	&nv_c2c_pmu_format_group,
+	&nv_c2c_pmu_cxlmem_events_group,
+	&nv_c2c_pmu_cpumask_attr_group,
+	&nv_c2c_pmu_identifier_attr_group,
+	&nv_c2c_pmu_peer_attr_group,
+	NULL
+};
+
+/*
+ * CPU hotplug online callback: if the PMU has no owning CPU yet and the
+ * incoming CPU is on the PMU's socket, make it the owner.
+ */
+static int nv_c2c_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct nv_c2c_pmu *c2c_pmu =
+		hlist_entry_safe(node, struct nv_c2c_pmu, cpuhp_node);
+
+	if (cpumask_test_cpu(cpu, &c2c_pmu->associated_cpus) &&
+	    cpumask_empty(&c2c_pmu->active_cpu))
+		cpumask_set_cpu(cpu, &c2c_pmu->active_cpu);
+
+	return 0;
+}
+
+/*
+ * CPU hotplug offline callback: if the departing CPU owned this PMU,
+ * migrate perf context and ownership to another online socket CPU.
+ */
+static int nv_c2c_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node)
+{
+	unsigned int dst;
+
+	struct nv_c2c_pmu *c2c_pmu =
+		hlist_entry_safe(node, struct nv_c2c_pmu, cpuhp_node);
+
+	/* Nothing to do if this CPU doesn't own the PMU */
+	if (!cpumask_test_and_clear_cpu(cpu, &c2c_pmu->active_cpu))
+		return 0;
+
+	/* Choose a new CPU to migrate ownership of the PMU to */
+	dst = cpumask_any_and_but(&c2c_pmu->associated_cpus,
+				  cpu_online_mask, cpu);
+	if (dst >= nr_cpu_ids)
+		return 0;
+
+	/* Use this CPU for event counting */
+	perf_pmu_migrate_context(&c2c_pmu->pmu, cpu, dst);
+	cpumask_set_cpu(dst, &c2c_pmu->active_cpu);
+
+	return 0;
+}
+
+/*
+ * Populate associated_cpus with every possible CPU on the PMU's socket
+ * (NUMA node). Fails if the socket has no CPUs at all.
+ */
+static int nv_c2c_pmu_get_cpus(struct nv_c2c_pmu *c2c_pmu)
+{
+	int socket = c2c_pmu->socket, cpu;
+
+	for_each_possible_cpu(cpu) {
+		if (cpu_to_node(cpu) == socket)
+			cpumask_set_cpu(cpu, &c2c_pmu->associated_cpus);
+	}
+
+	if (cpumask_empty(&c2c_pmu->associated_cpus)) {
+		dev_dbg(c2c_pmu->dev,
+			"No cpu associated with C2C PMU socket-%u\n", socket);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+
+/*
+ * Parse the socket id from the ACPI device UID string.
+ *
+ * The temporary must be a u32: kstrtou32() takes a u32 *, so passing the
+ * address of a signed int is an incompatible-pointer-signedness bug (and
+ * would misrepresent UID values above INT_MAX).
+ */
+static int nv_c2c_pmu_init_socket(struct nv_c2c_pmu *c2c_pmu)
+{
+	const char *uid_str;
+	u32 socket;
+	int ret;
+
+	uid_str = acpi_device_uid(c2c_pmu->acpi_dev);
+	if (!uid_str) {
+		dev_err(c2c_pmu->dev, "No ACPI device UID\n");
+		return -ENODEV;
+	}
+
+	ret = kstrtou32(uid_str, 0, &socket);
+	if (ret) {
+		dev_err(c2c_pmu->dev, "Failed to parse ACPI device UID\n");
+		return ret;
+	}
+
+	c2c_pmu->socket = socket;
+	return 0;
+}
+
+/*
+ * Build the PMU name from the interface-type template and socket id,
+ * and record the ACPI HID as the sysfs identifier.
+ */
+static int nv_c2c_pmu_init_id(struct nv_c2c_pmu *c2c_pmu)
+{
+	char *name;
+
+	name = devm_kasprintf(c2c_pmu->dev, GFP_KERNEL, c2c_pmu->data->name_fmt,
+			      c2c_pmu->socket);
+	if (!name)
+		return -ENOMEM;
+
+	c2c_pmu->name = name;
+
+	c2c_pmu->identifier = acpi_device_hid(c2c_pmu->acpi_dev);
+
+	return 0;
+}
+
+/*
+ * Determine the peer topology (CXL memory, peer CPU, or GPUs) from the
+ * C2C type and firmware properties, and set up the per-peer instance
+ * bitmaps, default filter mask and matching sysfs attribute groups.
+ */
+static int nv_c2c_pmu_init_filter(struct nv_c2c_pmu *c2c_pmu)
+{
+	u32 cpu_en = 0;
+	struct device *dev = c2c_pmu->dev;
+	const struct nv_c2c_pmu_data *data = c2c_pmu->data;
+
+	/* NVDLINK always faces CXL memory; all instances belong to peer 0. */
+	if (data->c2c_type == C2C_TYPE_NVDLINK) {
+		c2c_pmu->peer_type = C2C_PEER_TYPE_CXLMEM;
+
+		c2c_pmu->peer_insts[0][0] = (1UL << data->nr_inst) - 1;
+
+		c2c_pmu->nr_peer = C2C_NR_PEER_CXLMEM;
+		c2c_pmu->filter_default = (1 << c2c_pmu->nr_peer) - 1;
+
+		c2c_pmu->attr_groups = nv_c2c_pmu_cxlmem_attr_groups;
+
+		return 0;
+	}
+
+	/*
+	 * "cpu_en_mask" presumably is a firmware-provided bitmask of the
+	 * instances wired to a peer CPU socket; absence means GPU peers.
+	 */
+	if (device_property_read_u32(dev, "cpu_en_mask", &cpu_en))
+		dev_dbg(dev, "no cpu_en_mask property\n");
+
+	if (cpu_en) {
+		c2c_pmu->peer_type = C2C_PEER_TYPE_CPU;
+
+		/* Fill peer_insts bitmap with instances connected to peer CPU. */
+		bitmap_from_arr32(c2c_pmu->peer_insts[0], &cpu_en, data->nr_inst);
+
+		c2c_pmu->nr_peer = 1;
+		c2c_pmu->attr_groups = nv_c2c_pmu_cpu_attr_groups;
+	} else {
+		u32 i;
+		const char *props[C2C_NR_PEER_MAX] = {
+			"gpu0_en_mask", "gpu1_en_mask"
+		};
+
+		for (i = 0; i < C2C_NR_PEER_MAX; i++) {
+			u32 gpu_en = 0;
+
+			if (device_property_read_u32(dev, props[i], &gpu_en))
+				dev_dbg(dev, "no %s property\n", props[i]);
+
+			if (gpu_en) {
+				/* Fill peer_insts bitmap with instances connected to peer GPU. */
+				bitmap_from_arr32(c2c_pmu->peer_insts[i], &gpu_en,
+						  data->nr_inst);
+
+				c2c_pmu->nr_peer++;
+			}
+		}
+
+		if (c2c_pmu->nr_peer == 0) {
+			dev_err(dev, "No GPU is enabled\n");
+			return -EINVAL;
+		}
+
+		c2c_pmu->peer_type = C2C_PEER_TYPE_GPU;
+		c2c_pmu->attr_groups = nv_c2c_pmu_gpu_attr_groups;
+	}
+
+	/* Default filter selects every detected peer. */
+	c2c_pmu->filter_default = (1 << c2c_pmu->nr_peer) - 1;
+
+	return 0;
+}
+
+/*
+ * Allocate and initialise the driver state for @pdev: resolve the ACPI
+ * companion and match data, then derive socket id, names and peer
+ * topology. Returns the new nv_c2c_pmu or an ERR_PTR.
+ */
+static void *nv_c2c_pmu_init_pmu(struct platform_device *pdev)
+{
+	int ret;
+	struct nv_c2c_pmu *c2c_pmu;
+	struct acpi_device *acpi_dev;
+	struct device *dev = &pdev->dev;
+
+	acpi_dev = ACPI_COMPANION(dev);
+	if (!acpi_dev)
+		return ERR_PTR(-ENODEV);
+
+	c2c_pmu = devm_kzalloc(dev, sizeof(*c2c_pmu), GFP_KERNEL);
+	if (!c2c_pmu)
+		return ERR_PTR(-ENOMEM);
+
+	c2c_pmu->dev = dev;
+	c2c_pmu->acpi_dev = acpi_dev;
+	c2c_pmu->data = (const struct nv_c2c_pmu_data *)device_get_match_data(dev);
+	if (!c2c_pmu->data)
+		return ERR_PTR(-EINVAL);
+
+	platform_set_drvdata(pdev, c2c_pmu);
+
+	ret = nv_c2c_pmu_init_socket(c2c_pmu);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = nv_c2c_pmu_init_id(c2c_pmu);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = nv_c2c_pmu_init_filter(c2c_pmu);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return c2c_pmu;
+}
+
+/*
+ * Map MMIO: resources 0..nr_inst-1 are the per-instance register blocks;
+ * resource nr_inst is the broadcast region that mirrors writes to all
+ * instances.
+ */
+static int nv_c2c_pmu_init_mmio(struct nv_c2c_pmu *c2c_pmu)
+{
+	int i;
+	struct device *dev = c2c_pmu->dev;
+	struct platform_device *pdev = to_platform_device(dev);
+	const struct nv_c2c_pmu_data *data = c2c_pmu->data;
+
+	/* Map the address of all the instances. */
+	for (i = 0; i < data->nr_inst; i++) {
+		c2c_pmu->base[i] = devm_platform_ioremap_resource(pdev, i);
+		if (IS_ERR(c2c_pmu->base[i])) {
+			dev_err(dev, "Failed map address for instance %d\n", i);
+			return PTR_ERR(c2c_pmu->base[i]);
+		}
+	}
+
+	/* Map broadcast address. */
+	c2c_pmu->base_broadcast = devm_platform_ioremap_resource(pdev,
+								 data->nr_inst);
+	if (IS_ERR(c2c_pmu->base_broadcast)) {
+		dev_err(dev, "Failed map broadcast address\n");
+		return PTR_ERR(c2c_pmu->base_broadcast);
+	}
+
+	return 0;
+}
+
+/*
+ * Hook into CPU hotplug and register with the perf core. On perf
+ * registration failure the hotplug instance is removed again.
+ */
+static int nv_c2c_pmu_register_pmu(struct nv_c2c_pmu *c2c_pmu)
+{
+	int ret;
+
+	ret = cpuhp_state_add_instance(nv_c2c_pmu_cpuhp_state,
+				       &c2c_pmu->cpuhp_node);
+	if (ret) {
+		dev_err(c2c_pmu->dev, "Error %d registering hotplug\n", ret);
+		return ret;
+	}
+
+	c2c_pmu->pmu = (struct pmu) {
+		.parent		= c2c_pmu->dev,
+		.task_ctx_nr	= perf_invalid_context,
+		.pmu_enable	= nv_c2c_pmu_enable,
+		.pmu_disable	= nv_c2c_pmu_disable,
+		.event_init	= nv_c2c_pmu_event_init,
+		.add		= nv_c2c_pmu_add,
+		.del		= nv_c2c_pmu_del,
+		.start		= nv_c2c_pmu_start,
+		.stop		= nv_c2c_pmu_stop,
+		.read		= nv_c2c_pmu_read,
+		.attr_groups	= c2c_pmu->attr_groups,
+		/* Uncore PMU: no exclusion modes, no overflow interrupt. */
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE |
+				  PERF_PMU_CAP_NO_INTERRUPT,
+	};
+
+	ret = perf_pmu_register(&c2c_pmu->pmu, c2c_pmu->name, -1);
+	if (ret) {
+		dev_err(c2c_pmu->dev, "Failed to register C2C PMU: %d\n", ret);
+		cpuhp_state_remove_instance(nv_c2c_pmu_cpuhp_state,
+					    &c2c_pmu->cpuhp_node);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Platform driver probe: allocate state, map MMIO, find socket CPUs and
+ * register the PMU. All allocations are devm-managed, so error paths
+ * need no manual cleanup.
+ */
+static int nv_c2c_pmu_probe(struct platform_device *pdev)
+{
+	int ret;
+	struct nv_c2c_pmu *c2c_pmu;
+
+	c2c_pmu = nv_c2c_pmu_init_pmu(pdev);
+	if (IS_ERR(c2c_pmu))
+		return PTR_ERR(c2c_pmu);
+
+	ret = nv_c2c_pmu_init_mmio(c2c_pmu);
+	if (ret)
+		return ret;
+
+	ret = nv_c2c_pmu_get_cpus(c2c_pmu);
+	if (ret)
+		return ret;
+
+	ret = nv_c2c_pmu_register_pmu(c2c_pmu);
+	if (ret)
+		return ret;
+
+	dev_dbg(c2c_pmu->dev, "Registered %s PMU\n", c2c_pmu->name);
+
+	return 0;
+}
+
+/* Platform driver remove: unwind perf and hotplug registration. */
+static void nv_c2c_pmu_device_remove(struct platform_device *pdev)
+{
+	struct nv_c2c_pmu *c2c_pmu = platform_get_drvdata(pdev);
+
+	perf_pmu_unregister(&c2c_pmu->pmu);
+	cpuhp_state_remove_instance(nv_c2c_pmu_cpuhp_state, &c2c_pmu->cpuhp_node);
+}
+
+/* ACPI HID -> interface-type match data (see nv_c2c_pmu_data[]). */
+static const struct acpi_device_id nv_c2c_pmu_acpi_match[] = {
+	{ "NVDA2023", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVLINK] },
+	{ "NVDA2022", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVCLINK] },
+	{ "NVDA2020", (kernel_ulong_t)&nv_c2c_pmu_data[C2C_TYPE_NVDLINK] },
+	{ }
+};
+MODULE_DEVICE_TABLE(acpi, nv_c2c_pmu_acpi_match);
+
+static struct platform_driver nv_c2c_pmu_driver = {
+	.driver = {
+		.name = "nvidia-t410-c2c-pmu",
+		.acpi_match_table = nv_c2c_pmu_acpi_match,
+		/* Manual bind/unbind would confuse perf/hotplug state. */
+		.suppress_bind_attrs = true,
+	},
+	.probe = nv_c2c_pmu_probe,
+	.remove = nv_c2c_pmu_device_remove,
+};
+
+/*
+ * Module init: reserve a dynamic CPU hotplug state shared by all PMU
+ * instances, then register the platform driver.
+ */
+static int __init nv_c2c_pmu_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+				      "perf/nvidia/c2c:online",
+				      nv_c2c_pmu_online_cpu,
+				      nv_c2c_pmu_cpu_teardown);
+	if (ret < 0)
+		return ret;
+
+	/* Dynamic state id, needed for add/remove_instance later. */
+	nv_c2c_pmu_cpuhp_state = ret;
+	return platform_driver_register(&nv_c2c_pmu_driver);
+}
+
+/* Module exit: unregister the driver, then release the hotplug state. */
+static void __exit nv_c2c_pmu_exit(void)
+{
+	platform_driver_unregister(&nv_c2c_pmu_driver);
+	cpuhp_remove_multi_state(nv_c2c_pmu_cpuhp_state);
+}
+
+module_init(nv_c2c_pmu_init);
+module_exit(nv_c2c_pmu_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("NVIDIA Tegra410 C2C PMU driver");
+MODULE_AUTHOR("Besar Wicaksono <bwicaksono@nvidia.com>");
diff --git a/drivers/perf/nvidia_t410_cmem_latency_pmu.c b/drivers/perf/nvidia_t410_cmem_latency_pmu.c
new file mode 100644
index 000000000000..acb8f5571522
--- /dev/null
+++ b/drivers/perf/nvidia_t410_cmem_latency_pmu.c
@@ -0,0 +1,736 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVIDIA Tegra410 CPU Memory (CMEM) Latency PMU driver.
+ *
+ * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ */
+
+#include <linux/acpi.h>
+#include <linux/bitops.h>
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+
+#define NUM_INSTANCES 14
+
+/* Register offsets. */
+#define CMEM_LAT_CG_CTRL 0x800
+#define CMEM_LAT_CTRL 0x808
+#define CMEM_LAT_STATUS 0x810
+#define CMEM_LAT_CYCLE_CNTR 0x818
+#define CMEM_LAT_MC0_REQ_CNTR 0x820
+#define CMEM_LAT_MC0_AOR_CNTR 0x830
+#define CMEM_LAT_MC1_REQ_CNTR 0x838
+#define CMEM_LAT_MC1_AOR_CNTR 0x848
+#define CMEM_LAT_MC2_REQ_CNTR 0x850
+#define CMEM_LAT_MC2_AOR_CNTR 0x860
+
+/* CMEM_LAT_CTRL values. */
+#define CMEM_LAT_CTRL_DISABLE 0x0ULL
+#define CMEM_LAT_CTRL_ENABLE 0x1ULL
+#define CMEM_LAT_CTRL_CLR 0x2ULL
+
+/* CMEM_LAT_CG_CTRL values. */
+#define CMEM_LAT_CG_CTRL_DISABLE 0x0ULL
+#define CMEM_LAT_CG_CTRL_ENABLE 0x1ULL
+
+/* CMEM_LAT_STATUS register field. */
+#define CMEM_LAT_STATUS_CYCLE_OVF BIT(0)
+#define CMEM_LAT_STATUS_MC0_AOR_OVF BIT(1)
+#define CMEM_LAT_STATUS_MC0_REQ_OVF BIT(3)
+#define CMEM_LAT_STATUS_MC1_AOR_OVF BIT(4)
+#define CMEM_LAT_STATUS_MC1_REQ_OVF BIT(6)
+#define CMEM_LAT_STATUS_MC2_AOR_OVF BIT(7)
+#define CMEM_LAT_STATUS_MC2_REQ_OVF BIT(9)
+
+/* Events. */
+#define CMEM_LAT_EVENT_CYCLES 0x0
+#define CMEM_LAT_EVENT_REQ 0x1
+#define CMEM_LAT_EVENT_AOR 0x2
+
+#define CMEM_LAT_NUM_EVENTS 0x3
+#define CMEM_LAT_MASK_EVENT 0x3
+#define CMEM_LAT_MAX_ACTIVE_EVENTS 32
+
+#define CMEM_LAT_ACTIVE_CPU_MASK 0x0
+#define CMEM_LAT_ASSOCIATED_CPU_MASK 0x1
+
+static unsigned long cmem_lat_pmu_cpuhp_state;
+
+struct cmem_lat_pmu_hw_events {
+ struct perf_event *events[CMEM_LAT_MAX_ACTIVE_EVENTS];
+ DECLARE_BITMAP(used_ctrs, CMEM_LAT_MAX_ACTIVE_EVENTS);
+};
+
+struct cmem_lat_pmu {
+ struct pmu pmu;
+ struct device *dev;
+ const char *name;
+ const char *identifier;
+ void __iomem *base_broadcast;
+ void __iomem *base[NUM_INSTANCES];
+ cpumask_t associated_cpus;
+ cpumask_t active_cpu;
+ struct hlist_node node;
+ struct cmem_lat_pmu_hw_events hw_events;
+};
+
+#define to_cmem_lat_pmu(p) \
+ container_of(p, struct cmem_lat_pmu, pmu)
+
+
+/* Get event type from perf_event. */
+static inline u32 get_event_type(struct perf_event *event)
+{
+ /* Only the low 2 bits of attr.config encode the event (0..2). */
+ return (event->attr.config) & CMEM_LAT_MASK_EVENT;
+}
+
+/* PMU operations. */
+
+/*
+ * Claim a free software counter slot in @hw_events.
+ * Returns the slot index, or -EAGAIN if all slots are in use.
+ */
+static int cmem_lat_pmu_get_event_idx(struct cmem_lat_pmu_hw_events *hw_events,
+ struct perf_event *event)
+{
+ unsigned int idx;
+
+ idx = find_first_zero_bit(hw_events->used_ctrs, CMEM_LAT_MAX_ACTIVE_EVENTS);
+ if (idx >= CMEM_LAT_MAX_ACTIVE_EVENTS)
+ return -EAGAIN;
+
+ set_bit(idx, hw_events->used_ctrs);
+
+ return idx;
+}
+
+/*
+ * Check that @event can be scheduled on this PMU, claiming a slot in
+ * @hw_events as a side effect (callers pass a throw-away fake structure).
+ * Software events always pass; events from other hardware PMUs never do.
+ */
+static bool cmem_lat_pmu_validate_event(struct pmu *pmu,
+ struct cmem_lat_pmu_hw_events *hw_events,
+ struct perf_event *event)
+{
+ int ret;
+
+ if (is_software_event(event))
+ return true;
+
+ /* Reject groups spanning multiple HW PMUs. */
+ if (event->pmu != pmu)
+ return false;
+
+ ret = cmem_lat_pmu_get_event_idx(hw_events, event);
+ if (ret < 0)
+ return false;
+
+ return true;
+}
+
+/* Make sure the group of events can be scheduled at once on the PMU. */
+static bool cmem_lat_pmu_validate_group(struct perf_event *event)
+{
+ struct perf_event *sibling, *leader = event->group_leader;
+ struct cmem_lat_pmu_hw_events fake_hw_events;
+
+ /* A lone event (its own leader) always fits. */
+ if (event->group_leader == event)
+ return true;
+
+ /* Dry-run allocation against a zeroed scratch counter map. */
+ memset(&fake_hw_events, 0, sizeof(fake_hw_events));
+
+ if (!cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, leader))
+ return false;
+
+ for_each_sibling_event(sibling, leader) {
+ if (!cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, sibling))
+ return false;
+ }
+
+ return cmem_lat_pmu_validate_event(event->pmu, &fake_hw_events, event);
+}
+
+/*
+ * perf core .event_init callback: validate attr/config, reject modes this
+ * uncore PMU cannot support, and pin the event to the PMU's active CPU.
+ * Returns -ENOENT for events that belong to another PMU so the core keeps
+ * searching, and -EINVAL/-EOPNOTSUPP for genuinely unusable requests.
+ */
+static int cmem_lat_pmu_event_init(struct perf_event *event)
+{
+ struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+ struct hw_perf_event *hwc = &event->hw;
+ u32 event_type = get_event_type(event);
+
+ if (event->attr.type != event->pmu->type ||
+ event_type >= CMEM_LAT_NUM_EVENTS)
+ return -ENOENT;
+
+ /*
+ * Sampling, per-process mode, and per-task counters are not supported
+ * since this PMU is shared across all CPUs.
+ */
+ if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) {
+ dev_dbg(cmem_lat_pmu->pmu.dev,
+ "Can't support sampling and per-process mode\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (event->cpu < 0) {
+ dev_dbg(cmem_lat_pmu->pmu.dev, "Can't support per-task counters\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Make sure the CPU assignment is on one of the CPUs associated with
+ * this PMU.
+ */
+ if (!cpumask_test_cpu(event->cpu, &cmem_lat_pmu->associated_cpus)) {
+ dev_dbg(cmem_lat_pmu->pmu.dev,
+ "Requested cpu is not associated with the PMU\n");
+ return -EINVAL;
+ }
+
+ /* Enforce the current active CPU to handle the events in this PMU. */
+ event->cpu = cpumask_first(&cmem_lat_pmu->active_cpu);
+ if (event->cpu >= nr_cpu_ids)
+ return -EINVAL;
+
+ if (!cmem_lat_pmu_validate_group(event))
+ return -EINVAL;
+
+ /* No hardware slot yet; one is claimed in .add. */
+ hwc->idx = -1;
+ hwc->config = event_type;
+
+ return 0;
+}
+
+/* Read the 64-bit overflow status register of instance @inst. */
+static u64 cmem_lat_pmu_read_status(struct cmem_lat_pmu *cmem_lat_pmu,
+ unsigned int inst)
+{
+ return readq(cmem_lat_pmu->base[inst] + CMEM_LAT_STATUS);
+}
+
+/*
+ * Read the free-running cycle counter. Overflow is only reported via
+ * dev_warn; the raw register value is returned regardless.
+ */
+static u64 cmem_lat_pmu_read_cycle_counter(struct perf_event *event)
+{
+ const unsigned int instance = 0;
+ u64 status;
+ struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+ struct device *dev = cmem_lat_pmu->dev;
+
+ /*
+ * Use the reading from first instance since all instances are
+ * identical.
+ */
+ status = cmem_lat_pmu_read_status(cmem_lat_pmu, instance);
+ if (status & CMEM_LAT_STATUS_CYCLE_OVF)
+ dev_warn(dev, "Cycle counter overflow\n");
+
+ return readq(cmem_lat_pmu->base[instance] + CMEM_LAT_CYCLE_CNTR);
+}
+
+/*
+ * Read the request count: the sum of the three per-MC request counters
+ * across all NUM_INSTANCES instances. Overflows are only warned about.
+ */
+static u64 cmem_lat_pmu_read_req_counter(struct perf_event *event)
+{
+ unsigned int i;
+ u64 status, val = 0;
+ struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+ struct device *dev = cmem_lat_pmu->dev;
+
+ /* Sum up the counts from all instances. */
+ for (i = 0; i < NUM_INSTANCES; i++) {
+ status = cmem_lat_pmu_read_status(cmem_lat_pmu, i);
+ if (status & CMEM_LAT_STATUS_MC0_REQ_OVF)
+ dev_warn(dev, "MC0 request counter overflow\n");
+ if (status & CMEM_LAT_STATUS_MC1_REQ_OVF)
+ dev_warn(dev, "MC1 request counter overflow\n");
+ if (status & CMEM_LAT_STATUS_MC2_REQ_OVF)
+ dev_warn(dev, "MC2 request counter overflow\n");
+
+ val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC0_REQ_CNTR);
+ val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC1_REQ_CNTR);
+ val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC2_REQ_CNTR);
+ }
+
+ return val;
+}
+
+/*
+ * Read the AOR (accumulated outstanding requests) count: sum of the three
+ * per-MC AOR counters across all instances, mirroring the REQ path above.
+ */
+static u64 cmem_lat_pmu_read_aor_counter(struct perf_event *event)
+{
+ unsigned int i;
+ u64 status, val = 0;
+ struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+ struct device *dev = cmem_lat_pmu->dev;
+
+ /* Sum up the counts from all instances. */
+ for (i = 0; i < NUM_INSTANCES; i++) {
+ status = cmem_lat_pmu_read_status(cmem_lat_pmu, i);
+ if (status & CMEM_LAT_STATUS_MC0_AOR_OVF)
+ dev_warn(dev, "MC0 AOR counter overflow\n");
+ if (status & CMEM_LAT_STATUS_MC1_AOR_OVF)
+ dev_warn(dev, "MC1 AOR counter overflow\n");
+ if (status & CMEM_LAT_STATUS_MC2_AOR_OVF)
+ dev_warn(dev, "MC2 AOR counter overflow\n");
+
+ val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC0_AOR_CNTR);
+ val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC1_AOR_CNTR);
+ val += readq(cmem_lat_pmu->base[i] + CMEM_LAT_MC2_AOR_CNTR);
+ }
+
+ return val;
+}
+
+/* Per-event-type reader, indexed by the event config (0..2). */
+static u64 (*read_counter_fn[CMEM_LAT_NUM_EVENTS])(struct perf_event *) = {
+ [CMEM_LAT_EVENT_CYCLES] = cmem_lat_pmu_read_cycle_counter,
+ [CMEM_LAT_EVENT_REQ] = cmem_lat_pmu_read_req_counter,
+ [CMEM_LAT_EVENT_AOR] = cmem_lat_pmu_read_aor_counter,
+};
+
+/*
+ * Fold the hardware delta since the last read into event->count.
+ * The cmpxchg loop makes the update safe against a concurrent reader
+ * racing on hwc->prev_count. No-op for stopped events.
+ */
+static void cmem_lat_pmu_event_update(struct perf_event *event)
+{
+ u32 event_type;
+ u64 prev, now;
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (hwc->state & PERF_HES_STOPPED)
+ return;
+
+ event_type = hwc->config;
+
+ do {
+ prev = local64_read(&hwc->prev_count);
+ now = read_counter_fn[event_type](event);
+ } while (local64_cmpxchg(&hwc->prev_count, prev, now) != prev);
+
+ local64_add(now - prev, &event->count);
+
+ hwc->state |= PERF_HES_UPTODATE;
+}
+
+/*
+ * .start/.stop only toggle the software state; the hardware itself is
+ * switched in pmu_enable()/pmu_disable(), which bracket these calls.
+ * NOTE(review): .stop ignores PERF_EF_UPDATE - presumably safe because
+ * pmu_disable() updates every active event first; confirm against core.
+ */
+static void cmem_lat_pmu_start(struct perf_event *event, int pmu_flags)
+{
+ event->hw.state = 0;
+}
+
+static void cmem_lat_pmu_stop(struct perf_event *event, int pmu_flags)
+{
+ event->hw.state |= PERF_HES_STOPPED;
+}
+
+/*
+ * perf core .add callback: claim a counter slot and record the event.
+ * Must run on a CPU associated with this PMU (event_init pinned it).
+ */
+static int cmem_lat_pmu_add(struct perf_event *event, int flags)
+{
+ struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+ struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events;
+ struct hw_perf_event *hwc = &event->hw;
+ int idx;
+
+ if (WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(),
+ &cmem_lat_pmu->associated_cpus)))
+ return -ENOENT;
+
+ idx = cmem_lat_pmu_get_event_idx(hw_events, event);
+ if (idx < 0)
+ return idx;
+
+ hw_events->events[idx] = event;
+ hwc->idx = idx;
+ /* Event starts stopped; .start clears the state if requested. */
+ hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+ if (flags & PERF_EF_START)
+ cmem_lat_pmu_start(event, PERF_EF_RELOAD);
+
+ /* Propagate changes to the userspace mapping. */
+ perf_event_update_userpage(event);
+
+ return 0;
+}
+
+/* perf core .del callback: stop the event and release its slot. */
+static void cmem_lat_pmu_del(struct perf_event *event, int flags)
+{
+ struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(event->pmu);
+ struct cmem_lat_pmu_hw_events *hw_events = &cmem_lat_pmu->hw_events;
+ struct hw_perf_event *hwc = &event->hw;
+ int idx = hwc->idx;
+
+ cmem_lat_pmu_stop(event, PERF_EF_UPDATE);
+
+ hw_events->events[idx] = NULL;
+
+ clear_bit(idx, hw_events->used_ctrs);
+
+ perf_event_update_userpage(event);
+}
+
+/* perf core .read callback: refresh event->count from hardware. */
+static void cmem_lat_pmu_read(struct perf_event *event)
+{
+ cmem_lat_pmu_event_update(event);
+}
+
+/* Write the clock-gate control register via the broadcast aperture. */
+static inline void cmem_lat_pmu_cg_ctrl(struct cmem_lat_pmu *cmem_lat_pmu,
+ u64 val)
+{
+ writeq(val, cmem_lat_pmu->base_broadcast + CMEM_LAT_CG_CTRL);
+}
+
+/* Write the counter control register via the broadcast aperture. */
+static inline void cmem_lat_pmu_ctrl(struct cmem_lat_pmu *cmem_lat_pmu, u64 val)
+{
+ writeq(val, cmem_lat_pmu->base_broadcast + CMEM_LAT_CTRL);
+}
+
+/*
+ * pmu_enable: un-gate the clocks and start counting, but only if at least
+ * one counter slot is in use.
+ */
+static void cmem_lat_pmu_enable(struct pmu *pmu)
+{
+ bool disabled;
+ struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu);
+
+ disabled = bitmap_empty(cmem_lat_pmu->hw_events.used_ctrs,
+ CMEM_LAT_MAX_ACTIVE_EVENTS);
+
+ if (disabled)
+ return;
+
+ /* Enable all the counters. */
+ cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_ENABLE);
+ cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_ENABLE);
+}
+
+/*
+ * pmu_disable: stop counting, harvest every active event's count, then
+ * clear the hardware counters and gate the clocks. The harvest must happen
+ * before the CLR write, otherwise pending counts would be lost.
+ */
+static void cmem_lat_pmu_disable(struct pmu *pmu)
+{
+ int idx;
+ struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu);
+
+ /* Disable all the counters. */
+ cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_DISABLE);
+
+ /*
+ * The counters will start from 0 again on restart.
+ * Update the events immediately to avoid losing the counts.
+ */
+ for_each_set_bit(idx, cmem_lat_pmu->hw_events.used_ctrs,
+ CMEM_LAT_MAX_ACTIVE_EVENTS) {
+ struct perf_event *event = cmem_lat_pmu->hw_events.events[idx];
+
+ if (!event)
+ continue;
+
+ cmem_lat_pmu_event_update(event);
+
+ /* Hardware restarts from zero, so resync the software base. */
+ local64_set(&event->hw.prev_count, 0ULL);
+ }
+
+ cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_CLR);
+ cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_DISABLE);
+}
+
+/* PMU identifier attribute. */
+
+/* sysfs "identifier": exposes the ACPI HID stored at probe time. */
+static ssize_t cmem_lat_pmu_identifier_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(dev_get_drvdata(dev));
+
+ return sysfs_emit(page, "%s\n", cmem_lat_pmu->identifier);
+}
+
+static struct device_attribute cmem_lat_pmu_identifier_attr =
+ __ATTR(identifier, 0444, cmem_lat_pmu_identifier_show, NULL);
+
+static struct attribute *cmem_lat_pmu_identifier_attrs[] = {
+ &cmem_lat_pmu_identifier_attr.attr,
+ NULL
+};
+
+static struct attribute_group cmem_lat_pmu_identifier_attr_group = {
+ .attrs = cmem_lat_pmu_identifier_attrs,
+};
+
+/* Format attributes. */
+
+/*
+ * Build a read-only dev_ext_attribute from an anonymous compound literal;
+ * _config is stashed in .var for the show callback to interpret.
+ */
+#define NV_PMU_EXT_ATTR(_name, _func, _config) \
+ (&((struct dev_ext_attribute[]){ \
+ { \
+ .attr = __ATTR(_name, 0444, _func, NULL), \
+ .var = (void *)_config \
+ } \
+ })[0].attr.attr)
+
+/* "event" occupies config bits [1:0], matching CMEM_LAT_MASK_EVENT. */
+static struct attribute *cmem_lat_pmu_formats[] = {
+ NV_PMU_EXT_ATTR(event, device_show_string, "config:0-1"),
+ NULL
+};
+
+static const struct attribute_group cmem_lat_pmu_format_group = {
+ .name = "format",
+ .attrs = cmem_lat_pmu_formats,
+};
+
+/* Event attributes. */
+
+/* Render one named event as "event=0x<id>" for perf tooling. */
+static ssize_t cmem_lat_pmu_sysfs_event_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct perf_pmu_events_attr *pmu_attr;
+
+ pmu_attr = container_of(attr, typeof(*pmu_attr), attr);
+ return sysfs_emit(buf, "event=0x%llx\n", pmu_attr->id);
+}
+
+#define NV_PMU_EVENT_ATTR(_name, _config) \
+ PMU_EVENT_ATTR_ID(_name, cmem_lat_pmu_sysfs_event_show, _config)
+
+static struct attribute *cmem_lat_pmu_events[] = {
+ NV_PMU_EVENT_ATTR(cycles, CMEM_LAT_EVENT_CYCLES),
+ NV_PMU_EVENT_ATTR(rd_req, CMEM_LAT_EVENT_REQ),
+ NV_PMU_EVENT_ATTR(rd_cum_outs, CMEM_LAT_EVENT_AOR),
+ NULL
+};
+
+static const struct attribute_group cmem_lat_pmu_events_group = {
+ .name = "events",
+ .attrs = cmem_lat_pmu_events,
+};
+
+/* Cpumask attributes. */
+
+/*
+ * Shared show routine for "cpumask" (the single active CPU) and
+ * "associated_cpus" (all CPUs on this PMU's socket); .var selects which.
+ */
+static ssize_t cmem_lat_pmu_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct pmu *pmu = dev_get_drvdata(dev);
+ struct cmem_lat_pmu *cmem_lat_pmu = to_cmem_lat_pmu(pmu);
+ struct dev_ext_attribute *eattr =
+ container_of(attr, struct dev_ext_attribute, attr);
+ unsigned long mask_id = (unsigned long)eattr->var;
+ const cpumask_t *cpumask;
+
+ switch (mask_id) {
+ case CMEM_LAT_ACTIVE_CPU_MASK:
+ cpumask = &cmem_lat_pmu->active_cpu;
+ break;
+ case CMEM_LAT_ASSOCIATED_CPU_MASK:
+ cpumask = &cmem_lat_pmu->associated_cpus;
+ break;
+ default:
+ return 0;
+ }
+ return cpumap_print_to_pagebuf(true, buf, cpumask);
+}
+
+#define NV_PMU_CPUMASK_ATTR(_name, _config) \
+ NV_PMU_EXT_ATTR(_name, cmem_lat_pmu_cpumask_show, \
+ (unsigned long)_config)
+
+static struct attribute *cmem_lat_pmu_cpumask_attrs[] = {
+ NV_PMU_CPUMASK_ATTR(cpumask, CMEM_LAT_ACTIVE_CPU_MASK),
+ NV_PMU_CPUMASK_ATTR(associated_cpus, CMEM_LAT_ASSOCIATED_CPU_MASK),
+ NULL
+};
+
+static const struct attribute_group cmem_lat_pmu_cpumask_attr_group = {
+ .attrs = cmem_lat_pmu_cpumask_attrs,
+};
+
+/* Per PMU device attribute groups. */
+
+static const struct attribute_group *cmem_lat_pmu_attr_groups[] = {
+ &cmem_lat_pmu_identifier_attr_group,
+ &cmem_lat_pmu_format_group,
+ &cmem_lat_pmu_events_group,
+ &cmem_lat_pmu_cpumask_attr_group,
+ NULL
+};
+
+/*
+ * cpuhp online callback: if this PMU has no owner yet and @cpu belongs to
+ * its socket, make @cpu the active (event-handling) CPU.
+ */
+static int cmem_lat_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
+{
+ struct cmem_lat_pmu *cmem_lat_pmu =
+ hlist_entry_safe(node, struct cmem_lat_pmu, node);
+
+ if (!cpumask_test_cpu(cpu, &cmem_lat_pmu->associated_cpus))
+ return 0;
+
+ /* If the PMU is already managed, there is nothing to do */
+ if (!cpumask_empty(&cmem_lat_pmu->active_cpu))
+ return 0;
+
+ /* Use this CPU for event counting */
+ cpumask_set_cpu(cpu, &cmem_lat_pmu->active_cpu);
+
+ return 0;
+}
+
+/*
+ * cpuhp teardown callback: if the departing @cpu owned the PMU, migrate
+ * ownership (and the perf context) to another online CPU on the same
+ * socket. If none remains, the PMU is simply left without an owner.
+ */
+static int cmem_lat_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node)
+{
+ unsigned int dst;
+
+ struct cmem_lat_pmu *cmem_lat_pmu =
+ hlist_entry_safe(node, struct cmem_lat_pmu, node);
+
+ /* Nothing to do if this CPU doesn't own the PMU */
+ if (!cpumask_test_and_clear_cpu(cpu, &cmem_lat_pmu->active_cpu))
+ return 0;
+
+ /* Choose a new CPU to migrate ownership of the PMU to */
+ dst = cpumask_any_and_but(&cmem_lat_pmu->associated_cpus,
+ cpu_online_mask, cpu);
+ if (dst >= nr_cpu_ids)
+ return 0;
+
+ /* Use this CPU for event counting */
+ perf_pmu_migrate_context(&cmem_lat_pmu->pmu, cpu, dst);
+ cpumask_set_cpu(dst, &cmem_lat_pmu->active_cpu);
+
+ return 0;
+}
+
+/*
+ * Populate associated_cpus with every possible CPU whose NUMA node matches
+ * @socket. Returns -ENODEV if the socket has no CPUs at all.
+ */
+static int cmem_lat_pmu_get_cpus(struct cmem_lat_pmu *cmem_lat_pmu,
+ unsigned int socket)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu_to_node(cpu) == socket)
+ cpumask_set_cpu(cpu, &cmem_lat_pmu->associated_cpus);
+ }
+
+ if (cpumask_empty(&cmem_lat_pmu->associated_cpus)) {
+ dev_dbg(cmem_lat_pmu->dev,
+ "No cpu associated with PMU socket-%u\n", socket);
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+/*
+ * Probe: derive the socket number from the ACPI _UID, map the per-instance
+ * and broadcast MMIO apertures, bind the PMU to the socket's CPUs, hook
+ * into CPU hotplug and finally register with perf.
+ *
+ * Fix: acpi_device_uid() returns a const char *; hold it in a const
+ * pointer instead of discarding the qualifier into a plain char *.
+ */
+static int cmem_lat_pmu_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct acpi_device *acpi_dev;
+ struct cmem_lat_pmu *cmem_lat_pmu;
+ const char *uid_str;
+ char *name;
+ int ret, i;
+ u32 socket;
+
+ acpi_dev = ACPI_COMPANION(dev);
+ if (!acpi_dev)
+ return -ENODEV;
+
+ /* The ACPI _UID encodes the socket this PMU instance belongs to. */
+ uid_str = acpi_device_uid(acpi_dev);
+ if (!uid_str)
+ return -ENODEV;
+
+ ret = kstrtou32(uid_str, 0, &socket);
+ if (ret)
+ return ret;
+
+ cmem_lat_pmu = devm_kzalloc(dev, sizeof(*cmem_lat_pmu), GFP_KERNEL);
+ name = devm_kasprintf(dev, GFP_KERNEL, "nvidia_cmem_latency_pmu_%u", socket);
+ if (!cmem_lat_pmu || !name)
+ return -ENOMEM;
+
+ cmem_lat_pmu->dev = dev;
+ cmem_lat_pmu->name = name;
+ cmem_lat_pmu->identifier = acpi_device_hid(acpi_dev);
+ platform_set_drvdata(pdev, cmem_lat_pmu);
+
+ cmem_lat_pmu->pmu = (struct pmu) {
+ .parent = &pdev->dev,
+ .task_ctx_nr = perf_invalid_context,
+ .pmu_enable = cmem_lat_pmu_enable,
+ .pmu_disable = cmem_lat_pmu_disable,
+ .event_init = cmem_lat_pmu_event_init,
+ .add = cmem_lat_pmu_add,
+ .del = cmem_lat_pmu_del,
+ .start = cmem_lat_pmu_start,
+ .stop = cmem_lat_pmu_stop,
+ .read = cmem_lat_pmu_read,
+ .attr_groups = cmem_lat_pmu_attr_groups,
+ .capabilities = PERF_PMU_CAP_NO_EXCLUDE |
+ PERF_PMU_CAP_NO_INTERRUPT,
+ };
+
+ /* Map the address of all the instances. */
+ for (i = 0; i < NUM_INSTANCES; i++) {
+ cmem_lat_pmu->base[i] = devm_platform_ioremap_resource(pdev, i);
+ if (IS_ERR(cmem_lat_pmu->base[i])) {
+ dev_err(dev, "Failed map address for instance %d\n", i);
+ return PTR_ERR(cmem_lat_pmu->base[i]);
+ }
+ }
+
+ /* Map broadcast address (resource after the per-instance ones). */
+ cmem_lat_pmu->base_broadcast = devm_platform_ioremap_resource(pdev,
+ NUM_INSTANCES);
+ if (IS_ERR(cmem_lat_pmu->base_broadcast)) {
+ dev_err(dev, "Failed map broadcast address\n");
+ return PTR_ERR(cmem_lat_pmu->base_broadcast);
+ }
+
+ ret = cmem_lat_pmu_get_cpus(cmem_lat_pmu, socket);
+ if (ret)
+ return ret;
+
+ ret = cpuhp_state_add_instance(cmem_lat_pmu_cpuhp_state,
+ &cmem_lat_pmu->node);
+ if (ret) {
+ dev_err(&pdev->dev, "Error %d registering hotplug\n", ret);
+ return ret;
+ }
+
+ /* Start from a clean slate: clear the counters, leave clocks gated. */
+ cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_ENABLE);
+ cmem_lat_pmu_ctrl(cmem_lat_pmu, CMEM_LAT_CTRL_CLR);
+ cmem_lat_pmu_cg_ctrl(cmem_lat_pmu, CMEM_LAT_CG_CTRL_DISABLE);
+
+ ret = perf_pmu_register(&cmem_lat_pmu->pmu, name, -1);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to register PMU: %d\n", ret);
+ cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state,
+ &cmem_lat_pmu->node);
+ return ret;
+ }
+
+ dev_dbg(&pdev->dev, "Registered %s PMU\n", name);
+
+ return 0;
+}
+
+/* Device removal: unregister from perf, then drop the cpuhp instance. */
+static void cmem_lat_pmu_device_remove(struct platform_device *pdev)
+{
+ struct cmem_lat_pmu *cmem_lat_pmu = platform_get_drvdata(pdev);
+
+ perf_pmu_unregister(&cmem_lat_pmu->pmu);
+ cpuhp_state_remove_instance(cmem_lat_pmu_cpuhp_state,
+ &cmem_lat_pmu->node);
+}
+
+/* One ACPI HID per socket instance; _UID carries the socket number. */
+static const struct acpi_device_id cmem_lat_pmu_acpi_match[] = {
+ { "NVDA2021" },
+ { }
+};
+MODULE_DEVICE_TABLE(acpi, cmem_lat_pmu_acpi_match);
+
+static struct platform_driver cmem_lat_pmu_driver = {
+ .driver = {
+ .name = "nvidia-t410-cmem-latency-pmu",
+ /*
+ * The match table is referenced unconditionally via
+ * MODULE_DEVICE_TABLE(), so it must not be hidden behind
+ * ACPI_PTR(); this also matches the sibling T410 C2C driver.
+ */
+ .acpi_match_table = cmem_lat_pmu_acpi_match,
+ /* PMU state is not re-bindable; forbid manual bind/unbind. */
+ .suppress_bind_attrs = true,
+ },
+ .probe = cmem_lat_pmu_probe,
+ .remove = cmem_lat_pmu_device_remove,
+};
+
+/*
+ * Module init: allocate a dynamic multi-instance cpuhp state before
+ * registering the platform driver, so probe can add instances to it.
+ */
+static int __init cmem_lat_pmu_init(void)
+{
+ int ret;
+
+ ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+ "perf/nvidia/cmem_latency:online",
+ cmem_lat_pmu_cpu_online,
+ cmem_lat_pmu_cpu_teardown);
+ if (ret < 0)
+ return ret;
+
+ /* For CPUHP_AP_ONLINE_DYN a positive return is the allocated state. */
+ cmem_lat_pmu_cpuhp_state = ret;
+
+ return platform_driver_register(&cmem_lat_pmu_driver);
+}
+
+static void __exit cmem_lat_pmu_exit(void)
+{
+ /* Reverse order of init: driver (and its instances) first. */
+ platform_driver_unregister(&cmem_lat_pmu_driver);
+ cpuhp_remove_multi_state(cmem_lat_pmu_cpuhp_state);
+}
+
+module_init(cmem_lat_pmu_init);
+module_exit(cmem_lat_pmu_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("NVIDIA Tegra410 CPU Memory Latency PMU driver");
+MODULE_AUTHOR("Besar Wicaksono <bwicaksono@nvidia.com>");
diff --git a/drivers/resctrl/Kconfig b/drivers/resctrl/Kconfig
index c808e0470394..672abea3b03c 100644
--- a/drivers/resctrl/Kconfig
+++ b/drivers/resctrl/Kconfig
@@ -1,6 +1,7 @@
menuconfig ARM64_MPAM_DRIVER
bool "MPAM driver"
- depends on ARM64 && ARM64_MPAM && EXPERT
+ depends on ARM64 && ARM64_MPAM
+ select ACPI_MPAM if ACPI
help
Memory System Resource Partitioning and Monitoring (MPAM) driver for
System IP, e.g. caches and memory controllers.
@@ -22,3 +23,9 @@ config MPAM_KUNIT_TEST
If unsure, say N.
endif
+
+config ARM64_MPAM_RESCTRL_FS
+ bool
+ default y if ARM64_MPAM_DRIVER && RESCTRL_FS
+ select RESCTRL_RMID_DEPENDS_ON_CLOSID
+ select RESCTRL_ASSIGN_FIXED
diff --git a/drivers/resctrl/Makefile b/drivers/resctrl/Makefile
index 898199dcf80d..4f6d0e81f9b8 100644
--- a/drivers/resctrl/Makefile
+++ b/drivers/resctrl/Makefile
@@ -1,4 +1,5 @@
obj-$(CONFIG_ARM64_MPAM_DRIVER) += mpam.o
mpam-y += mpam_devices.o
+mpam-$(CONFIG_ARM64_MPAM_RESCTRL_FS) += mpam_resctrl.o
ccflags-$(CONFIG_ARM64_MPAM_DRIVER_DEBUG) += -DDEBUG
diff --git a/drivers/resctrl/mpam_devices.c b/drivers/resctrl/mpam_devices.c
index 0666be6b0e88..41b14344b16f 100644
--- a/drivers/resctrl/mpam_devices.c
+++ b/drivers/resctrl/mpam_devices.c
@@ -29,7 +29,15 @@
#include "mpam_internal.h"
-DEFINE_STATIC_KEY_FALSE(mpam_enabled); /* This moves to arch code */
+/* Values for the T241 errata workaround */
+#define T241_CHIPS_MAX 4
+#define T241_CHIP_NSLICES 12
+#define T241_SPARE_REG0_OFF 0x1b0000
+#define T241_SPARE_REG1_OFF 0x1c0000
+#define T241_CHIP_ID(phys) FIELD_GET(GENMASK_ULL(44, 43), phys)
+#define T241_SHADOW_REG_OFF(sidx, pid) (0x360048 + (sidx) * 0x10000 + (pid) * 8)
+#define SMCCC_SOC_ID_T241 0x036b0241
+static void __iomem *t241_scratch_regs[T241_CHIPS_MAX];
/*
* mpam_list_lock protects the SRCU lists when writing. Once the
@@ -76,6 +84,14 @@ static DECLARE_WORK(mpam_broken_work, &mpam_disable);
static char *mpam_disable_reason;
/*
+ * Whether resctrl has been setup. Used by cpuhp in preference to
+ * mpam_is_enabled(). The disable call after an error interrupt makes
+ * mpam_is_enabled() false before the cpuhp callbacks are made.
+ * Reads/writes should hold mpam_cpuhp_state_lock, (or be cpuhp callbacks).
+ */
+static bool mpam_resctrl_enabled;
+
+/*
* An MSC is a physical container for controls and monitors, each identified by
* their RIS index. These share a base-address, interrupts and some MMIO
* registers. A vMSC is a virtual container for RIS in an MSC that control or
@@ -624,6 +640,86 @@ static struct mpam_msc_ris *mpam_get_or_create_ris(struct mpam_msc *msc,
return ERR_PTR(-ENOENT);
}
+/*
+ * Quirk init for NVIDIA T241 erratum T241-MPAM-1: confirm via SMCCC SOC_ID
+ * that we really are on a T241, then map the chip's internal scratch
+ * register window (derived from the MSC's physical address) so the
+ * shadow-register scrub in mpam_apply_t241_erratum() can use it.
+ * Returns -EINVAL if the quirk does not apply or the mapping fails.
+ */
+static int mpam_enable_quirk_nvidia_t241_1(struct mpam_msc *msc,
+ const struct mpam_quirk *quirk)
+{
+ s32 soc_id = arm_smccc_get_soc_id_version();
+ struct resource *r;
+ phys_addr_t phys;
+
+ /*
+ * A mapping to a device other than the MSC is needed, check
+ * SOC_ID is NVIDIA T241 chip (036b:0241)
+ */
+ if (soc_id < 0 || soc_id != SMCCC_SOC_ID_T241)
+ return -EINVAL;
+
+ r = platform_get_resource(msc->pdev, IORESOURCE_MEM, 0);
+ if (!r)
+ return -EINVAL;
+
+ /* Find the internal registers base addr from the CHIP ID */
+ msc->t241_id = T241_CHIP_ID(r->start);
+ phys = FIELD_PREP(GENMASK_ULL(45, 44), msc->t241_id) | 0x19000000ULL;
+
+ /* NOTE(review): mapping is never iounmap()ed - lives for module life. */
+ t241_scratch_regs[msc->t241_id] = ioremap(phys, SZ_8M);
+ if (WARN_ON_ONCE(!t241_scratch_regs[msc->t241_id]))
+ return -EINVAL;
+
+ pr_info_once("Enabled workaround for NVIDIA T241 erratum T241-MPAM-1\n");
+
+ return 0;
+}
+
+/*
+ * Known MSC errata, matched by MPAMF_IIDR. An optional .init hook does
+ * one-time setup; the quirk flag is only set when .init (if any) succeeds.
+ * The table is terminated by an entry with a zero iidr_mask.
+ */
+static const struct mpam_quirk mpam_quirks[] = {
+ {
+ /* NVIDIA t241 erratum T241-MPAM-1 */
+ .init = mpam_enable_quirk_nvidia_t241_1,
+ .iidr = MPAM_IIDR_NVIDIA_T241,
+ .iidr_mask = MPAM_IIDR_MATCH_ONE,
+ .workaround = T241_SCRUB_SHADOW_REGS,
+ },
+ {
+ /* NVIDIA t241 erratum T241-MPAM-4 */
+ .iidr = MPAM_IIDR_NVIDIA_T241,
+ .iidr_mask = MPAM_IIDR_MATCH_ONE,
+ .workaround = T241_FORCE_MBW_MIN_TO_ONE,
+ },
+ {
+ /* NVIDIA t241 erratum T241-MPAM-6 */
+ .iidr = MPAM_IIDR_NVIDIA_T241,
+ .iidr_mask = MPAM_IIDR_MATCH_ONE,
+ .workaround = T241_MBW_COUNTER_SCALE_64,
+ },
+ {
+ /* ARM CMN-650 CSU erratum 3642720 */
+ .iidr = MPAM_IIDR_ARM_CMN_650,
+ .iidr_mask = MPAM_IIDR_MATCH_ONE,
+ .workaround = IGNORE_CSU_NRDY,
+ },
+ { NULL } /* Sentinel */
+};
+
+/*
+ * Walk the quirk table and set every workaround flag whose IIDR matches
+ * @msc. Entries whose .init hook fails are skipped silently.
+ */
+static void mpam_enable_quirks(struct mpam_msc *msc)
+{
+ const struct mpam_quirk *quirk;
+
+ for (quirk = &mpam_quirks[0]; quirk->iidr_mask; quirk++) {
+ int err = 0;
+
+ if (quirk->iidr != (msc->iidr & quirk->iidr_mask))
+ continue;
+
+ if (quirk->init)
+ err = quirk->init(msc, quirk);
+
+ if (err)
+ continue;
+
+ mpam_set_quirk(quirk->workaround, msc);
+ }
+}
+
/*
* IHI009A.a has this nugget: "If a monitor does not support automatic behaviour
* of NRDY, software can use this bit for any purpose" - so hardware might not
@@ -715,6 +811,13 @@ static void mpam_ris_hw_probe(struct mpam_msc_ris *ris)
mpam_set_feature(mpam_feat_mbw_part, props);
props->bwa_wd = FIELD_GET(MPAMF_MBW_IDR_BWA_WD, mbw_features);
+
+ /*
+ * The BWA_WD field can represent 0-63, but the control fields it
+ * describes have a maximum of 16 bits.
+ */
+ props->bwa_wd = min(props->bwa_wd, 16);
+
if (props->bwa_wd && FIELD_GET(MPAMF_MBW_IDR_HAS_MAX, mbw_features))
mpam_set_feature(mpam_feat_mbw_max, props);
@@ -851,8 +954,11 @@ static int mpam_msc_hw_probe(struct mpam_msc *msc)
/* Grab an IDR value to find out how many RIS there are */
mutex_lock(&msc->part_sel_lock);
idr = mpam_msc_read_idr(msc);
+ msc->iidr = mpam_read_partsel_reg(msc, IIDR);
mutex_unlock(&msc->part_sel_lock);
+ mpam_enable_quirks(msc);
+
msc->ris_max = FIELD_GET(MPAMF_IDR_RIS_MAX, idr);
/* Use these values so partid/pmg always starts with a valid value */
@@ -903,6 +1009,7 @@ struct mon_read {
enum mpam_device_features type;
u64 *val;
int err;
+ bool waited_timeout;
};
static bool mpam_ris_has_mbwu_long_counter(struct mpam_msc_ris *ris)
@@ -1052,7 +1159,7 @@ static void write_msmon_ctl_flt_vals(struct mon_read *m, u32 ctl_val,
}
}
-static u64 mpam_msmon_overflow_val(enum mpam_device_features type)
+static u64 __mpam_msmon_overflow_val(enum mpam_device_features type)
{
/* TODO: implement scaling counters */
switch (type) {
@@ -1067,6 +1174,18 @@ static u64 mpam_msmon_overflow_val(enum mpam_device_features type)
}
}
+/*
+ * Overflow correction value for a monitor of @type, with the T241-MPAM-6
+ * quirk applied: on affected MSCs the non-63-bit MBWU counters count in
+ * units of 64 bytes, so the wrap value must be scaled to match.
+ */
+static u64 mpam_msmon_overflow_val(enum mpam_device_features type,
+ struct mpam_msc *msc)
+{
+ u64 overflow_val = __mpam_msmon_overflow_val(type);
+
+ if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) &&
+ type != mpam_feat_msmon_mbwu_63counter)
+ overflow_val *= 64;
+
+ return overflow_val;
+}
+
static void __ris_msmon_read(void *arg)
{
u64 now;
@@ -1137,6 +1256,10 @@ static void __ris_msmon_read(void *arg)
if (mpam_has_feature(mpam_feat_msmon_csu_hw_nrdy, rprops))
nrdy = now & MSMON___NRDY;
now = FIELD_GET(MSMON___VALUE, now);
+
+ if (mpam_has_quirk(IGNORE_CSU_NRDY, msc) && m->waited_timeout)
+ nrdy = false;
+
break;
case mpam_feat_msmon_mbwu_31counter:
case mpam_feat_msmon_mbwu_44counter:
@@ -1157,13 +1280,17 @@ static void __ris_msmon_read(void *arg)
now = FIELD_GET(MSMON___VALUE, now);
}
+ if (mpam_has_quirk(T241_MBW_COUNTER_SCALE_64, msc) &&
+ m->type != mpam_feat_msmon_mbwu_63counter)
+ now *= 64;
+
if (nrdy)
break;
mbwu_state = &ris->mbwu_state[ctx->mon];
if (overflow)
- mbwu_state->correction += mpam_msmon_overflow_val(m->type);
+ mbwu_state->correction += mpam_msmon_overflow_val(m->type, msc);
/*
* Include bandwidth consumed before the last hardware reset and
@@ -1270,6 +1397,7 @@ int mpam_msmon_read(struct mpam_component *comp, struct mon_cfg *ctx,
.ctx = ctx,
.type = type,
.val = val,
+ .waited_timeout = true,
};
*val = 0;
@@ -1338,6 +1466,75 @@ static void mpam_reset_msc_bitmap(struct mpam_msc *msc, u16 reg, u16 wd)
__mpam_write_reg(msc, reg, bm);
}
+/*
+ * T241 erratum T241-MPAM-1 workaround: after reprogramming a PARTID, poll
+ * until all per-slice shadow registers for that PARTID agree, then write
+ * the two spare registers so the new MBW configuration takes effect.
+ *
+ * Fix: terminate the pr_warn_once() message with '\n' per printk
+ * convention (an unterminated message may be merged with the next one).
+ */
+static void mpam_apply_t241_erratum(struct mpam_msc_ris *ris, u16 partid)
+{
+ int sidx, i, lcount = 1000;
+ void __iomem *regs;
+ u64 val0, val;
+
+ regs = t241_scratch_regs[ris->vmsc->msc->t241_id];
+
+ for (i = 0; i < lcount; i++) {
+ /* Read the shadow register at index 0 */
+ val0 = readq_relaxed(regs + T241_SHADOW_REG_OFF(0, partid));
+
+ /* Check if all the shadow registers have the same value */
+ for (sidx = 1; sidx < T241_CHIP_NSLICES; sidx++) {
+ val = readq_relaxed(regs +
+ T241_SHADOW_REG_OFF(sidx, partid));
+ if (val != val0)
+ break;
+ }
+ if (sidx == T241_CHIP_NSLICES)
+ break;
+ }
+
+ /* Bounded poll: warn (once) if the slices never converged. */
+ if (i == lcount)
+ pr_warn_once("t241: inconsistent values in shadow regs\n");
+
+ /* Write a value zero to spare registers to take effect of MBW conf */
+ writeq_relaxed(0, regs + T241_SPARE_REG0_OFF);
+ writeq_relaxed(0, regs + T241_SPARE_REG1_OFF);
+}
+
+/* Hook run after each PARTID reprogram; applies MSC-specific fixups. */
+static void mpam_quirk_post_config_change(struct mpam_msc_ris *ris, u16 partid,
+ struct mpam_config *cfg)
+{
+ if (mpam_has_quirk(T241_SCRUB_SHADOW_REGS, ris->vmsc->msc))
+ mpam_apply_t241_erratum(ris, partid);
+}
+
+/*
+ * T241-MPAM-4 helper: the smallest non-zero MBW_MIN value the hardware
+ * can represent, i.e. one granule given bwa_wd implemented bits in a
+ * 16-bit fixed-point field (upper bits implemented, lower bits RES0).
+ */
+static u16 mpam_wa_t241_force_mbw_min_to_one(struct mpam_props *props)
+{
+ u16 max_hw_value, min_hw_granule, res0_bits;
+
+ res0_bits = 16 - props->bwa_wd;
+ max_hw_value = ((1 << props->bwa_wd) - 1) << res0_bits;
+ min_hw_granule = ~max_hw_value;
+
+ return min_hw_granule + 1;
+}
+
+/*
+ * T241-MPAM-4 helper: derive an MBW_MIN roughly 5% below the configured
+ * (or default) MBW_MAX, clamped at zero. Used because the erratum requires
+ * a non-zero minimum tracking the maximum.
+ */
+static u16 mpam_wa_t241_calc_min_from_max(struct mpam_props *props,
+ struct mpam_config *cfg)
+{
+ u16 val = 0;
+ u16 max;
+ u16 delta = ((5 * MPAMCFG_MBW_MAX_MAX) / 100) - 1;
+
+ if (mpam_has_feature(mpam_feat_mbw_max, cfg)) {
+ max = cfg->mbw_max;
+ } else {
+ /* Resetting. Hence, use the ris specific default. */
+ max = GENMASK(15, 16 - props->bwa_wd);
+ }
+
+ if (max > delta)
+ val = max - delta;
+
+ return val;
+}
+
/* Called via IPI. Call while holding an SRCU reference */
static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid,
struct mpam_config *cfg)
@@ -1364,36 +1561,41 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid,
__mpam_intpart_sel(ris->ris_idx, partid, msc);
}
- if (mpam_has_feature(mpam_feat_cpor_part, rprops) &&
- mpam_has_feature(mpam_feat_cpor_part, cfg)) {
- if (cfg->reset_cpbm)
- mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd);
- else
+ if (mpam_has_feature(mpam_feat_cpor_part, rprops)) {
+ if (mpam_has_feature(mpam_feat_cpor_part, cfg))
mpam_write_partsel_reg(msc, CPBM, cfg->cpbm);
+ else
+ mpam_reset_msc_bitmap(msc, MPAMCFG_CPBM, rprops->cpbm_wd);
}
- if (mpam_has_feature(mpam_feat_mbw_part, rprops) &&
- mpam_has_feature(mpam_feat_mbw_part, cfg)) {
- if (cfg->reset_mbw_pbm)
+ if (mpam_has_feature(mpam_feat_mbw_part, rprops)) {
+ if (mpam_has_feature(mpam_feat_mbw_part, cfg))
mpam_reset_msc_bitmap(msc, MPAMCFG_MBW_PBM, rprops->mbw_pbm_bits);
else
mpam_write_partsel_reg(msc, MBW_PBM, cfg->mbw_pbm);
}
- if (mpam_has_feature(mpam_feat_mbw_min, rprops) &&
- mpam_has_feature(mpam_feat_mbw_min, cfg))
- mpam_write_partsel_reg(msc, MBW_MIN, 0);
+ if (mpam_has_feature(mpam_feat_mbw_min, rprops)) {
+ u16 val = 0;
- if (mpam_has_feature(mpam_feat_mbw_max, rprops) &&
- mpam_has_feature(mpam_feat_mbw_max, cfg)) {
- if (cfg->reset_mbw_max)
- mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX);
- else
+ if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, msc)) {
+ u16 min = mpam_wa_t241_force_mbw_min_to_one(rprops);
+
+ val = mpam_wa_t241_calc_min_from_max(rprops, cfg);
+ val = max(val, min);
+ }
+
+ mpam_write_partsel_reg(msc, MBW_MIN, val);
+ }
+
+ if (mpam_has_feature(mpam_feat_mbw_max, rprops)) {
+ if (mpam_has_feature(mpam_feat_mbw_max, cfg))
mpam_write_partsel_reg(msc, MBW_MAX, cfg->mbw_max);
+ else
+ mpam_write_partsel_reg(msc, MBW_MAX, MPAMCFG_MBW_MAX_MAX);
}
- if (mpam_has_feature(mpam_feat_mbw_prop, rprops) &&
- mpam_has_feature(mpam_feat_mbw_prop, cfg))
+ if (mpam_has_feature(mpam_feat_mbw_prop, rprops))
mpam_write_partsel_reg(msc, MBW_PROP, 0);
if (mpam_has_feature(mpam_feat_cmax_cmax, rprops))
@@ -1421,6 +1623,8 @@ static void mpam_reprogram_ris_partid(struct mpam_msc_ris *ris, u16 partid,
mpam_write_partsel_reg(msc, PRI, pri_val);
}
+ mpam_quirk_post_config_change(ris, partid, cfg);
+
mutex_unlock(&msc->part_sel_lock);
}
@@ -1493,16 +1697,6 @@ static int mpam_save_mbwu_state(void *arg)
return 0;
}
-static void mpam_init_reset_cfg(struct mpam_config *reset_cfg)
-{
- *reset_cfg = (struct mpam_config) {
- .reset_cpbm = true,
- .reset_mbw_pbm = true,
- .reset_mbw_max = true,
- };
- bitmap_fill(reset_cfg->features, MPAM_FEATURE_LAST);
-}
-
/*
* Called via smp_call_on_cpu() to prevent migration, while still being
* pre-emptible. Caller must hold mpam_srcu.
@@ -1510,14 +1704,12 @@ static void mpam_init_reset_cfg(struct mpam_config *reset_cfg)
static int mpam_reset_ris(void *arg)
{
u16 partid, partid_max;
- struct mpam_config reset_cfg;
+ struct mpam_config reset_cfg = {};
struct mpam_msc_ris *ris = arg;
if (ris->in_reset_state)
return 0;
- mpam_init_reset_cfg(&reset_cfg);
-
spin_lock(&partid_max_lock);
partid_max = mpam_partid_max;
spin_unlock(&partid_max_lock);
@@ -1632,6 +1824,9 @@ static int mpam_cpu_online(unsigned int cpu)
mpam_reprogram_msc(msc);
}
+ if (mpam_resctrl_enabled)
+ return mpam_resctrl_online_cpu(cpu);
+
return 0;
}
@@ -1675,6 +1870,9 @@ static int mpam_cpu_offline(unsigned int cpu)
{
struct mpam_msc *msc;
+ if (mpam_resctrl_enabled)
+ mpam_resctrl_offline_cpu(cpu);
+
guard(srcu)(&mpam_srcu);
list_for_each_entry_srcu(msc, &mpam_all_msc, all_msc_list,
srcu_read_lock_held(&mpam_srcu)) {
@@ -1971,6 +2169,7 @@ static bool mpam_has_cmax_wd_feature(struct mpam_props *props)
* resulting safe value must be compatible with both. When merging values in
* the tree, all the aliasing resources must be handled first.
* On mismatch, parent is modified.
+ * Quirks on an MSC will apply to all MSC in that class.
*/
static void __props_mismatch(struct mpam_props *parent,
struct mpam_props *child, bool alias)
@@ -2090,6 +2289,7 @@ static void __props_mismatch(struct mpam_props *parent,
* nobble the class feature, as we can't configure all the resources.
* e.g. The L3 cache is composed of two resources with 13 and 17 portion
* bitmaps respectively.
+ * Quirks on an MSC will apply to all MSC in that class.
*/
static void
__class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc)
@@ -2103,6 +2303,9 @@ __class_props_mismatch(struct mpam_class *class, struct mpam_vmsc *vmsc)
dev_dbg(dev, "Merging features for class:0x%lx &= vmsc:0x%lx\n",
(long)cprops->features, (long)vprops->features);
+ /* Merge quirks */
+ class->quirks |= vmsc->msc->quirks;
+
/* Take the safe value for any common features */
__props_mismatch(cprops, vprops, false);
}
@@ -2167,6 +2370,9 @@ static void mpam_enable_merge_class_features(struct mpam_component *comp)
list_for_each_entry(vmsc, &comp->vmsc, comp_list)
__class_props_mismatch(class, vmsc);
+
+ if (mpam_has_quirk(T241_FORCE_MBW_MIN_TO_ONE, class))
+ mpam_clear_feature(mpam_feat_mbw_min, &class->props);
}
/*
@@ -2520,6 +2726,12 @@ static void mpam_enable_once(void)
mutex_unlock(&mpam_list_lock);
cpus_read_unlock();
+ if (!err) {
+ err = mpam_resctrl_setup();
+ if (err)
+ pr_err("Failed to initialise resctrl: %d\n", err);
+ }
+
if (err) {
mpam_disable_reason = "Failed to enable.";
schedule_work(&mpam_broken_work);
@@ -2527,6 +2739,7 @@ static void mpam_enable_once(void)
}
static_branch_enable(&mpam_enabled);
+ mpam_resctrl_enabled = true;
mpam_register_cpuhp_callbacks(mpam_cpu_online, mpam_cpu_offline,
"mpam:online");
@@ -2559,7 +2772,7 @@ static void mpam_reset_component_locked(struct mpam_component *comp)
}
}
-static void mpam_reset_class_locked(struct mpam_class *class)
+void mpam_reset_class_locked(struct mpam_class *class)
{
struct mpam_component *comp;
@@ -2586,24 +2799,39 @@ static void mpam_reset_class(struct mpam_class *class)
void mpam_disable(struct work_struct *ignored)
{
int idx;
+ bool do_resctrl_exit;
struct mpam_class *class;
struct mpam_msc *msc, *tmp;
+ if (mpam_is_enabled())
+ static_branch_disable(&mpam_enabled);
+
mutex_lock(&mpam_cpuhp_state_lock);
if (mpam_cpuhp_state) {
cpuhp_remove_state(mpam_cpuhp_state);
mpam_cpuhp_state = 0;
}
+
+ /*
+ * Removing the cpuhp state called mpam_cpu_offline() and told resctrl
+ * all the CPUs are offline.
+ */
+ do_resctrl_exit = mpam_resctrl_enabled;
+ mpam_resctrl_enabled = false;
mutex_unlock(&mpam_cpuhp_state_lock);
- static_branch_disable(&mpam_enabled);
+ if (do_resctrl_exit)
+ mpam_resctrl_exit();
mpam_unregister_irqs();
idx = srcu_read_lock(&mpam_srcu);
list_for_each_entry_srcu(class, &mpam_classes, classes_list,
- srcu_read_lock_held(&mpam_srcu))
+ srcu_read_lock_held(&mpam_srcu)) {
mpam_reset_class(class);
+ if (do_resctrl_exit)
+ mpam_resctrl_teardown_class(class);
+ }
srcu_read_unlock(&mpam_srcu, idx);
mutex_lock(&mpam_list_lock);
@@ -2694,6 +2922,7 @@ int mpam_apply_config(struct mpam_component *comp, u16 partid,
srcu_read_lock_held(&mpam_srcu)) {
arg.ris = ris;
mpam_touch_msc(msc, __write_config, &arg);
+ ris->in_reset_state = false;
}
mutex_unlock(&msc->cfg_lock);
}
diff --git a/drivers/resctrl/mpam_internal.h b/drivers/resctrl/mpam_internal.h
index e8971842b124..1914aefdcba9 100644
--- a/drivers/resctrl/mpam_internal.h
+++ b/drivers/resctrl/mpam_internal.h
@@ -12,22 +12,31 @@
#include <linux/jump_label.h>
#include <linux/llist.h>
#include <linux/mutex.h>
+#include <linux/resctrl.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
#include <linux/types.h>
+#include <asm/mpam.h>
+
#define MPAM_MSC_MAX_NUM_RIS 16
struct platform_device;
-DECLARE_STATIC_KEY_FALSE(mpam_enabled);
-
#ifdef CONFIG_MPAM_KUNIT_TEST
#define PACKED_FOR_KUNIT __packed
#else
#define PACKED_FOR_KUNIT
#endif
+/*
+ * These 'mon' values must not alias an actual monitor, so must be larger than
+ * U16_MAX, but not be confused with an errno value, so smaller than
+ * (u32)-SZ_4K.
+ * USE_PRE_ALLOCATED is used to avoid confusion with an actual monitor.
+ */
+#define USE_PRE_ALLOCATED (U16_MAX + 1)
+
static inline bool mpam_is_enabled(void)
{
return static_branch_likely(&mpam_enabled);
@@ -76,6 +85,8 @@ struct mpam_msc {
u8 pmg_max;
unsigned long ris_idxs;
u32 ris_max;
+ u32 iidr;
+ u16 quirks;
/*
* error_irq_lock is taken when registering/unregistering the error
@@ -119,6 +130,9 @@ struct mpam_msc {
void __iomem *mapped_hwpage;
size_t mapped_hwpage_sz;
+ /* Values only used on some platforms for quirks */
+ u32 t241_id;
+
struct mpam_garbage garbage;
};
@@ -207,6 +221,42 @@ struct mpam_props {
#define mpam_set_feature(_feat, x) __set_bit(_feat, (x)->features)
#define mpam_clear_feature(_feat, x) __clear_bit(_feat, (x)->features)
+/* Workaround bits for msc->quirks */
+enum mpam_device_quirks {
+ T241_SCRUB_SHADOW_REGS,
+ T241_FORCE_MBW_MIN_TO_ONE,
+ T241_MBW_COUNTER_SCALE_64,
+ IGNORE_CSU_NRDY,
+ MPAM_QUIRK_LAST
+};
+
+#define mpam_has_quirk(_quirk, x) ((1 << (_quirk) & (x)->quirks))
+#define mpam_set_quirk(_quirk, x) ((x)->quirks |= (1 << (_quirk)))
+
+struct mpam_quirk {
+ int (*init)(struct mpam_msc *msc, const struct mpam_quirk *quirk);
+
+ u32 iidr;
+ u32 iidr_mask;
+
+ enum mpam_device_quirks workaround;
+};
+
+#define MPAM_IIDR_MATCH_ONE (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0xfff) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0xf) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0xf) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0xfff))
+
+#define MPAM_IIDR_NVIDIA_T241 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0x241) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x36b))
+
+#define MPAM_IIDR_ARM_CMN_650 (FIELD_PREP_CONST(MPAMF_IIDR_PRODUCTID, 0) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_VARIANT, 0) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_REVISION, 0) | \
+ FIELD_PREP_CONST(MPAMF_IIDR_IMPLEMENTER, 0x43b))
+
/* The values for MSMON_CFG_MBWU_FLT.RWBW */
enum mon_filter_options {
COUNT_BOTH = 0,
@@ -215,7 +265,11 @@ enum mon_filter_options {
};
struct mon_cfg {
- u16 mon;
+ /*
+ * mon must be large enough to hold out of range values like
+ * USE_PRE_ALLOCATED
+ */
+ u32 mon;
u8 pmg;
bool match_pmg;
bool csu_exclude_clean;
@@ -246,6 +300,7 @@ struct mpam_class {
struct mpam_props props;
u32 nrdy_usec;
+ u16 quirks;
u8 level;
enum mpam_class_types type;
@@ -266,10 +321,6 @@ struct mpam_config {
u32 mbw_pbm;
u16 mbw_max;
- bool reset_cpbm;
- bool reset_mbw_pbm;
- bool reset_mbw_max;
-
struct mpam_garbage garbage;
};
@@ -337,6 +388,32 @@ struct mpam_msc_ris {
struct mpam_garbage garbage;
};
+struct mpam_resctrl_dom {
+ struct mpam_component *ctrl_comp;
+
+ /*
+ * There is no single mon_comp because different events may be backed
+ * by different class/components. mon_comp is indexed by the event
+ * number.
+ */
+ struct mpam_component *mon_comp[QOS_NUM_EVENTS];
+
+ struct rdt_ctrl_domain resctrl_ctrl_dom;
+ struct rdt_l3_mon_domain resctrl_mon_dom;
+};
+
+struct mpam_resctrl_res {
+ struct mpam_class *class;
+ struct rdt_resource resctrl_res;
+ bool cdp_enabled;
+};
+
+struct mpam_resctrl_mon {
+ struct mpam_class *class;
+
+ /* per-class data that resctrl needs will live here */
+};
+
static inline int mpam_alloc_csu_mon(struct mpam_class *class)
{
struct mpam_props *cprops = &class->props;
@@ -381,6 +458,9 @@ extern u8 mpam_pmg_max;
void mpam_enable(struct work_struct *work);
void mpam_disable(struct work_struct *work);
+/* Reset all the RIS in a class under cpus_read_lock() */
+void mpam_reset_class_locked(struct mpam_class *class);
+
int mpam_apply_config(struct mpam_component *comp, u16 partid,
struct mpam_config *cfg);
@@ -391,6 +471,20 @@ void mpam_msmon_reset_mbwu(struct mpam_component *comp, struct mon_cfg *ctx);
int mpam_get_cpumask_from_cache_id(unsigned long cache_id, u32 cache_level,
cpumask_t *affinity);
+#ifdef CONFIG_RESCTRL_FS
+int mpam_resctrl_setup(void);
+void mpam_resctrl_exit(void);
+int mpam_resctrl_online_cpu(unsigned int cpu);
+void mpam_resctrl_offline_cpu(unsigned int cpu);
+void mpam_resctrl_teardown_class(struct mpam_class *class);
+#else
+static inline int mpam_resctrl_setup(void) { return 0; }
+static inline void mpam_resctrl_exit(void) { }
+static inline int mpam_resctrl_online_cpu(unsigned int cpu) { return 0; }
+static inline void mpam_resctrl_offline_cpu(unsigned int cpu) { }
+static inline void mpam_resctrl_teardown_class(struct mpam_class *class) { }
+#endif /* CONFIG_RESCTRL_FS */
+
/*
* MPAM MSCs have the following register layout. See:
* Arm Memory System Resource Partitioning and Monitoring (MPAM) System
diff --git a/drivers/resctrl/mpam_resctrl.c b/drivers/resctrl/mpam_resctrl.c
new file mode 100644
index 000000000000..a9938006d0e6
--- /dev/null
+++ b/drivers/resctrl/mpam_resctrl.c
@@ -0,0 +1,1704 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Arm Ltd.
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <linux/arm_mpam.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/errno.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/math.h>
+#include <linux/printk.h>
+#include <linux/rculist.h>
+#include <linux/resctrl.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+
+#include <asm/mpam.h>
+
+#include "mpam_internal.h"
+
+DECLARE_WAIT_QUEUE_HEAD(resctrl_mon_ctx_waiters);
+
+/*
+ * The classes we've picked to map to resctrl resources, wrapped
+ * up with their resctrl structure.
+ * Class pointer may be NULL.
+ */
+static struct mpam_resctrl_res mpam_resctrl_controls[RDT_NUM_RESOURCES];
+
+#define for_each_mpam_resctrl_control(res, rid) \
+ for (rid = 0, res = &mpam_resctrl_controls[rid]; \
+ rid < RDT_NUM_RESOURCES; \
+ rid++, res = &mpam_resctrl_controls[rid])
+
+/*
+ * The classes we've picked to map to resctrl events.
+ * Resctrl believes all the world's a Xeon, and these are all on the L3. This
+ * array lets us find the actual class backing the event counters. e.g.
+ * the only memory bandwidth counters may be on the memory controller, but to
+ * make use of them, we pretend they are on L3. Restrict the events considered
+ * to those supported by MPAM.
+ * Class pointer may be NULL.
+ */
+#define MPAM_MAX_EVENT QOS_L3_MBM_TOTAL_EVENT_ID
+static struct mpam_resctrl_mon mpam_resctrl_counters[MPAM_MAX_EVENT + 1];
+
+#define for_each_mpam_resctrl_mon(mon, eventid) \
+ for (eventid = QOS_FIRST_EVENT, mon = &mpam_resctrl_counters[eventid]; \
+ eventid <= MPAM_MAX_EVENT; \
+ eventid++, mon = &mpam_resctrl_counters[eventid])
+
+/* The lock for modifying resctrl's domain lists from cpuhp callbacks. */
+static DEFINE_MUTEX(domain_list_lock);
+
+/*
+ * MPAM emulates CDP by setting different PARTID in the I/D fields of MPAM0_EL1.
+ * This applies globally to all traffic the CPU generates.
+ */
+static bool cdp_enabled;
+
+/*
+ * We use cacheinfo to discover the size of the caches and their id. cacheinfo
+ * populates this from a device_initcall(). mpam_resctrl_setup() must wait.
+ */
+static bool cacheinfo_ready;
+static DECLARE_WAIT_QUEUE_HEAD(wait_cacheinfo_ready);
+
+/*
+ * If resctrl_init() succeeded, resctrl_exit() can be used to remove support
+ * for the filesystem in the event of an error.
+ */
+static bool resctrl_enabled;
+
+bool resctrl_arch_alloc_capable(void)
+{
+ struct mpam_resctrl_res *res;
+ enum resctrl_res_level rid;
+
+ for_each_mpam_resctrl_control(res, rid) {
+ if (res->resctrl_res.alloc_capable)
+ return true;
+ }
+
+ return false;
+}
+
+bool resctrl_arch_mon_capable(void)
+{
+ struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+ struct rdt_resource *l3 = &res->resctrl_res;
+
+ /* All monitors are presented as being on the L3 cache */
+ return l3->mon_capable;
+}
+
+bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt)
+{
+ return false;
+}
+
+void resctrl_arch_mon_event_config_read(void *info)
+{
+}
+
+void resctrl_arch_mon_event_config_write(void *info)
+{
+}
+
+void resctrl_arch_reset_rmid_all(struct rdt_resource *r, struct rdt_l3_mon_domain *d)
+{
+}
+
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+ u32 closid, u32 rmid, enum resctrl_event_id eventid)
+{
+}
+
+void resctrl_arch_reset_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+ u32 closid, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid)
+{
+}
+
+void resctrl_arch_config_cntr(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+ enum resctrl_event_id evtid, u32 rmid, u32 closid,
+ u32 cntr_id, bool assign)
+{
+}
+
+int resctrl_arch_cntr_read(struct rdt_resource *r, struct rdt_l3_mon_domain *d,
+ u32 unused, u32 rmid, int cntr_id,
+ enum resctrl_event_id eventid, u64 *val)
+{
+ return -EOPNOTSUPP;
+}
+
+bool resctrl_arch_mbm_cntr_assign_enabled(struct rdt_resource *r)
+{
+ return false;
+}
+
+int resctrl_arch_mbm_cntr_assign_set(struct rdt_resource *r, bool enable)
+{
+ return -EINVAL;
+}
+
+int resctrl_arch_io_alloc_enable(struct rdt_resource *r, bool enable)
+{
+ return -EOPNOTSUPP;
+}
+
+bool resctrl_arch_get_io_alloc_enabled(struct rdt_resource *r)
+{
+ return false;
+}
+
+void resctrl_arch_pre_mount(void)
+{
+}
+
+bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level rid)
+{
+ return mpam_resctrl_controls[rid].cdp_enabled;
+}
+
+/**
+ * resctrl_reset_task_closids() - Reset the PARTID/PMG values for all tasks.
+ *
+ * At boot, all existing tasks use partid zero for D and I.
+ * To enable/disable CDP emulation, all these tasks need relabelling.
+ */
+static void resctrl_reset_task_closids(void)
+{
+ struct task_struct *p, *t;
+
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
+ resctrl_arch_set_closid_rmid(t, RESCTRL_RESERVED_CLOSID,
+ RESCTRL_RESERVED_RMID);
+ }
+ read_unlock(&tasklist_lock);
+}
+
+int resctrl_arch_set_cdp_enabled(enum resctrl_res_level rid, bool enable)
+{
+ u32 partid_i = RESCTRL_RESERVED_CLOSID, partid_d = RESCTRL_RESERVED_CLOSID;
+ struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+ struct rdt_resource *l3 = &res->resctrl_res;
+ int cpu;
+
+ if (!IS_ENABLED(CONFIG_EXPERT) && enable) {
+ /*
+ * If the resctrl fs is mounted more than once, sequentially,
+ * then CDP can lead to the use of out of range PARTIDs.
+ */
+ pr_warn("CDP not supported\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (enable)
+ pr_warn("CDP is an expert feature and may cause MPAM to malfunction.\n");
+
+ /*
+ * resctrl_arch_set_cdp_enabled() is only called with enable set to
+ * false on error and unmount.
+ */
+ cdp_enabled = enable;
+ mpam_resctrl_controls[rid].cdp_enabled = enable;
+
+ if (enable)
+ l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx() / 2;
+ else
+ l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx();
+
+ /* The mbw_max feature can't hide cdp as it's a per-partid maximum. */
+ if (cdp_enabled && !mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled)
+ mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = false;
+
+ if (mpam_resctrl_controls[RDT_RESOURCE_MBA].cdp_enabled &&
+ mpam_resctrl_controls[RDT_RESOURCE_MBA].class)
+ mpam_resctrl_controls[RDT_RESOURCE_MBA].resctrl_res.alloc_capable = true;
+
+ if (enable) {
+ if (mpam_partid_max < 1)
+ return -EINVAL;
+
+ partid_d = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_DATA);
+ partid_i = resctrl_get_config_index(RESCTRL_RESERVED_CLOSID, CDP_CODE);
+ }
+
+ mpam_set_task_partid_pmg(current, partid_d, partid_i, 0, 0);
+ WRITE_ONCE(arm64_mpam_global_default, mpam_get_regval(current));
+
+ resctrl_reset_task_closids();
+
+ for_each_possible_cpu(cpu)
+ mpam_set_cpu_defaults(cpu, partid_d, partid_i, 0, 0);
+ on_each_cpu(resctrl_arch_sync_cpu_closid_rmid, NULL, 1);
+
+ return 0;
+}
+
+static bool mpam_resctrl_hide_cdp(enum resctrl_res_level rid)
+{
+ return cdp_enabled && !resctrl_arch_get_cdp_enabled(rid);
+}
+
+/*
+ * MSC may raise an error interrupt if it sees an out of range partid/pmg,
+ * and go on to truncate the value. Regardless of what the hardware supports,
+ * only the system wide safe value is safe to use.
+ */
+u32 resctrl_arch_get_num_closid(struct rdt_resource *ignored)
+{
+ return mpam_partid_max + 1;
+}
+
+u32 resctrl_arch_system_num_rmid_idx(void)
+{
+ return (mpam_pmg_max + 1) * (mpam_partid_max + 1);
+}
+
+u32 resctrl_arch_rmid_idx_encode(u32 closid, u32 rmid)
+{
+ return closid * (mpam_pmg_max + 1) + rmid;
+}
+
+void resctrl_arch_rmid_idx_decode(u32 idx, u32 *closid, u32 *rmid)
+{
+ *closid = idx / (mpam_pmg_max + 1);
+ *rmid = idx % (mpam_pmg_max + 1);
+}
+
+void resctrl_arch_sched_in(struct task_struct *tsk)
+{
+ lockdep_assert_preemption_disabled();
+
+ mpam_thread_switch(tsk);
+}
+
+void resctrl_arch_set_cpu_default_closid_rmid(int cpu, u32 closid, u32 rmid)
+{
+ WARN_ON_ONCE(closid > U16_MAX);
+ WARN_ON_ONCE(rmid > U8_MAX);
+
+ if (!cdp_enabled) {
+ mpam_set_cpu_defaults(cpu, closid, closid, rmid, rmid);
+ } else {
+ /*
+ * When CDP is enabled, resctrl halves the closid range and we
+ * use odd/even partid for one closid.
+ */
+ u32 partid_d = resctrl_get_config_index(closid, CDP_DATA);
+ u32 partid_i = resctrl_get_config_index(closid, CDP_CODE);
+
+ mpam_set_cpu_defaults(cpu, partid_d, partid_i, rmid, rmid);
+ }
+}
+
+void resctrl_arch_sync_cpu_closid_rmid(void *info)
+{
+ struct resctrl_cpu_defaults *r = info;
+
+ lockdep_assert_preemption_disabled();
+
+ if (r) {
+ resctrl_arch_set_cpu_default_closid_rmid(smp_processor_id(),
+ r->closid, r->rmid);
+ }
+
+ resctrl_arch_sched_in(current);
+}
+
+void resctrl_arch_set_closid_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
+{
+ WARN_ON_ONCE(closid > U16_MAX);
+ WARN_ON_ONCE(rmid > U8_MAX);
+
+ if (!cdp_enabled) {
+ mpam_set_task_partid_pmg(tsk, closid, closid, rmid, rmid);
+ } else {
+ u32 partid_d = resctrl_get_config_index(closid, CDP_DATA);
+ u32 partid_i = resctrl_get_config_index(closid, CDP_CODE);
+
+ mpam_set_task_partid_pmg(tsk, partid_d, partid_i, rmid, rmid);
+ }
+}
+
+bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid)
+{
+ u64 regval = mpam_get_regval(tsk);
+ u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval);
+
+ if (cdp_enabled)
+ tsk_closid >>= 1;
+
+ return tsk_closid == closid;
+}
+
+/* The task's pmg is not unique, the partid must be considered too */
+bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 closid, u32 rmid)
+{
+ u64 regval = mpam_get_regval(tsk);
+ u32 tsk_closid = FIELD_GET(MPAM0_EL1_PARTID_D, regval);
+ u32 tsk_rmid = FIELD_GET(MPAM0_EL1_PMG_D, regval);
+
+ if (cdp_enabled)
+ tsk_closid >>= 1;
+
+ return (tsk_closid == closid) && (tsk_rmid == rmid);
+}
+
+struct rdt_resource *resctrl_arch_get_resource(enum resctrl_res_level l)
+{
+ if (l >= RDT_NUM_RESOURCES)
+ return NULL;
+
+ return &mpam_resctrl_controls[l].resctrl_res;
+}
+
+static int resctrl_arch_mon_ctx_alloc_no_wait(enum resctrl_event_id evtid)
+{
+ struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid];
+
+ if (!mpam_is_enabled())
+ return -EINVAL;
+
+ if (!mon->class)
+ return -EINVAL;
+
+ switch (evtid) {
+ case QOS_L3_OCCUP_EVENT_ID:
+ /* With CDP, one monitor gets used for both code/data reads */
+ return mpam_alloc_csu_mon(mon->class);
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return USE_PRE_ALLOCATED;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+void *resctrl_arch_mon_ctx_alloc(struct rdt_resource *r,
+ enum resctrl_event_id evtid)
+{
+ DEFINE_WAIT(wait);
+ int *ret;
+
+ ret = kmalloc_obj(*ret);
+ if (!ret)
+ return ERR_PTR(-ENOMEM);
+
+ do {
+ prepare_to_wait(&resctrl_mon_ctx_waiters, &wait,
+ TASK_INTERRUPTIBLE);
+ *ret = resctrl_arch_mon_ctx_alloc_no_wait(evtid);
+ if (*ret == -ENOSPC)
+ schedule();
+ } while (*ret == -ENOSPC && !signal_pending(current));
+ finish_wait(&resctrl_mon_ctx_waiters, &wait);
+
+ return ret;
+}
+
+static void resctrl_arch_mon_ctx_free_no_wait(enum resctrl_event_id evtid,
+ u32 mon_idx)
+{
+ struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[evtid];
+
+ if (!mpam_is_enabled())
+ return;
+
+ if (!mon->class)
+ return;
+
+ if (evtid == QOS_L3_OCCUP_EVENT_ID)
+ mpam_free_csu_mon(mon->class, mon_idx);
+
+ wake_up(&resctrl_mon_ctx_waiters);
+}
+
+void resctrl_arch_mon_ctx_free(struct rdt_resource *r,
+ enum resctrl_event_id evtid, void *arch_mon_ctx)
+{
+ u32 mon_idx = *(u32 *)arch_mon_ctx;
+
+ kfree(arch_mon_ctx);
+
+ resctrl_arch_mon_ctx_free_no_wait(evtid, mon_idx);
+}
+
+static int __read_mon(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp,
+ enum mpam_device_features mon_type,
+ int mon_idx,
+ enum resctrl_conf_type cdp_type, u32 closid, u32 rmid, u64 *val)
+{
+ struct mon_cfg cfg;
+
+ if (!mpam_is_enabled())
+ return -EINVAL;
+
+ /* Shift closid to account for CDP */
+ closid = resctrl_get_config_index(closid, cdp_type);
+
+ if (irqs_disabled()) {
+ /* Check if we can access this domain without an IPI */
+ return -EIO;
+ }
+
+ cfg = (struct mon_cfg) {
+ .mon = mon_idx,
+ .match_pmg = true,
+ .partid = closid,
+ .pmg = rmid,
+ };
+
+ return mpam_msmon_read(mon_comp, &cfg, mon_type, val);
+}
+
+static int read_mon_cdp_safe(struct mpam_resctrl_mon *mon, struct mpam_component *mon_comp,
+ enum mpam_device_features mon_type,
+ int mon_idx, u32 closid, u32 rmid, u64 *val)
+{
+ if (cdp_enabled) {
+ u64 code_val = 0, data_val = 0;
+ int err;
+
+ err = __read_mon(mon, mon_comp, mon_type, mon_idx,
+ CDP_CODE, closid, rmid, &code_val);
+ if (err)
+ return err;
+
+ err = __read_mon(mon, mon_comp, mon_type, mon_idx,
+ CDP_DATA, closid, rmid, &data_val);
+ if (err)
+ return err;
+
+ *val += code_val + data_val;
+ return 0;
+ }
+
+ return __read_mon(mon, mon_comp, mon_type, mon_idx,
+ CDP_NONE, closid, rmid, val);
+}
+
+/* MBWU when not in ABMC mode (not supported), and CSU counters. */
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain_hdr *hdr,
+ u32 closid, u32 rmid, enum resctrl_event_id eventid,
+ void *arch_priv, u64 *val, void *arch_mon_ctx)
+{
+ struct mpam_resctrl_dom *l3_dom;
+ struct mpam_component *mon_comp;
+ u32 mon_idx = *(u32 *)arch_mon_ctx;
+ enum mpam_device_features mon_type;
+ struct mpam_resctrl_mon *mon = &mpam_resctrl_counters[eventid];
+
+ resctrl_arch_rmid_read_context_check();
+
+ if (!mpam_is_enabled())
+ return -EINVAL;
+
+ if (eventid >= QOS_NUM_EVENTS || !mon->class)
+ return -EINVAL;
+
+ l3_dom = container_of(hdr, struct mpam_resctrl_dom, resctrl_mon_dom.hdr);
+ mon_comp = l3_dom->mon_comp[eventid];
+
+ if (eventid != QOS_L3_OCCUP_EVENT_ID)
+ return -EINVAL;
+
+ mon_type = mpam_feat_msmon_csu;
+
+ return read_mon_cdp_safe(mon, mon_comp, mon_type, mon_idx,
+ closid, rmid, val);
+}
+
+/*
+ * The rmid realloc threshold should be for the smallest cache exposed to
+ * resctrl.
+ */
+static int update_rmid_limits(struct mpam_class *class)
+{
+ u32 num_unique_pmg = resctrl_arch_system_num_rmid_idx();
+ struct mpam_props *cprops = &class->props;
+ struct cacheinfo *ci;
+
+ lockdep_assert_cpus_held();
+
+ if (!mpam_has_feature(mpam_feat_msmon_csu, cprops))
+ return 0;
+
+ /*
+ * Assume cache levels are the same size for all CPUs...
+ * The check just requires any online CPU and it can't go offline as we
+ * hold the cpu lock.
+ */
+ ci = get_cpu_cacheinfo_level(raw_smp_processor_id(), class->level);
+ if (!ci || ci->size == 0) {
+ pr_debug("Could not read cache size for class %u\n",
+ class->level);
+ return -EINVAL;
+ }
+
+ if (!resctrl_rmid_realloc_limit ||
+ ci->size < resctrl_rmid_realloc_limit) {
+ resctrl_rmid_realloc_limit = ci->size;
+ resctrl_rmid_realloc_threshold = ci->size / num_unique_pmg;
+ }
+
+ return 0;
+}
+
+static bool cache_has_usable_cpor(struct mpam_class *class)
+{
+ struct mpam_props *cprops = &class->props;
+
+ if (!mpam_has_feature(mpam_feat_cpor_part, cprops))
+ return false;
+
+ /* resctrl uses u32 for all bitmap configurations */
+ return class->props.cpbm_wd <= 32;
+}
+
+static bool mba_class_use_mbw_max(struct mpam_props *cprops)
+{
+ return (mpam_has_feature(mpam_feat_mbw_max, cprops) &&
+ cprops->bwa_wd);
+}
+
+static bool class_has_usable_mba(struct mpam_props *cprops)
+{
+ return mba_class_use_mbw_max(cprops);
+}
+
+static bool cache_has_usable_csu(struct mpam_class *class)
+{
+ struct mpam_props *cprops;
+
+ if (!class)
+ return false;
+
+ cprops = &class->props;
+
+ if (!mpam_has_feature(mpam_feat_msmon_csu, cprops))
+ return false;
+
+ /*
+ * CSU counters settle on the value, so we can get away with
+ * having only one.
+ */
+ if (!cprops->num_csu_mon)
+ return false;
+
+ return true;
+}
+
+/*
+ * Calculate the worst-case percentage change from each implemented step
+ * in the control.
+ */
+static u32 get_mba_granularity(struct mpam_props *cprops)
+{
+ if (!mba_class_use_mbw_max(cprops))
+ return 0;
+
+ /*
+ * bwa_wd is the number of bits implemented in the 0.xxx
+ * fixed point fraction. 1 bit is 50%, 2 is 25% etc.
+ */
+ return DIV_ROUND_UP(MAX_MBA_BW, 1 << cprops->bwa_wd);
+}
+
+/*
+ * Each fixed-point hardware value architecturally represents a range
+ * of values: the full range 0% - 100% is split contiguously into
+ * (1 << cprops->bwa_wd) equal bands.
+ *
+ * Although the bwa_wd fields have 6 bits the maximum valid value is 16
+ * as it reports the width of fields that are at most 16 bits. When
+ * fewer than 16 bits are valid the least significant bits are
+ * ignored. The implied binary point is kept between bits 15 and 16 and
+ * so the valid bits are leftmost.
+ *
+ * See ARM IHI0099B.a "MPAM system component specification", Section 9.3,
+ * "The fixed-point fractional format" for more information.
+ *
+ * Find the nearest percentage value to the upper bound of the selected band:
+ */
+static u32 mbw_max_to_percent(u16 mbw_max, struct mpam_props *cprops)
+{
+ u32 val = mbw_max;
+
+ val >>= 16 - cprops->bwa_wd;
+ val += 1;
+ val *= MAX_MBA_BW;
+ val = DIV_ROUND_CLOSEST(val, 1 << cprops->bwa_wd);
+
+ return val;
+}
+
+/*
+ * Find the band whose upper bound is closest to the specified percentage.
+ *
+ * A round-to-nearest policy is followed here as a balanced compromise
+ * between unexpected under-commit of the resource (where the total of
+ * a set of resource allocations after conversion is less than the
+ * expected total, due to rounding of the individual converted
+ * percentages) and over-commit (where the total of the converted
+ * allocations is greater than expected).
+ */
+static u16 percent_to_mbw_max(u8 pc, struct mpam_props *cprops)
+{
+ u32 val = pc;
+
+ val <<= cprops->bwa_wd;
+ val = DIV_ROUND_CLOSEST(val, MAX_MBA_BW);
+ val = max(val, 1) - 1;
+ val <<= 16 - cprops->bwa_wd;
+
+ return val;
+}
+
+static u32 get_mba_min(struct mpam_props *cprops)
+{
+ if (!mba_class_use_mbw_max(cprops)) {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ return mbw_max_to_percent(0, cprops);
+}
+
+/* Find the L3 cache that has affinity with this CPU */
+static int find_l3_equivalent_bitmask(int cpu, cpumask_var_t tmp_cpumask)
+{
+ u32 cache_id = get_cpu_cacheinfo_id(cpu, 3);
+
+ lockdep_assert_cpus_held();
+
+ return mpam_get_cpumask_from_cache_id(cache_id, 3, tmp_cpumask);
+}
+
+/*
+ * topology_matches_l3() - Is the provided class the same shape as L3
+ * @victim: The class we'd like to pretend is L3.
+ *
+ * resctrl expects all the world's a Xeon, and all counters are on the
+ * L3. We allow some mapping counters on other classes. This requires
+ * that the CPU->domain mapping is the same kind of shape.
+ *
+ * Using cacheinfo directly would make this work even if resctrl can't
+ * use the L3 - but cacheinfo can't tell us anything about offline CPUs.
+ * Using the L3 resctrl domain list also depends on CPUs being online.
+ * Using the mpam_class we picked for L3 so we can use its domain list
+ * assumes that there are MPAM controls on the L3.
+ * Instead, this path eventually uses the mpam_get_cpumask_from_cache_id()
+ * helper which can tell us about offline CPUs ... but getting the cache_id
+ * to start with relies on at least one CPU per L3 cache being online at
+ * boot.
+ *
+ * Walk the victim component list and compare the affinity mask with the
+ * corresponding L3. The topology matches if each victim:component's affinity
+ * mask is the same as the CPU's corresponding L3's. These lists/masks are
+ * computed from firmware tables so don't change at runtime.
+ */
+static bool topology_matches_l3(struct mpam_class *victim)
+{
+ int cpu, err;
+ struct mpam_component *victim_iter;
+
+ lockdep_assert_cpus_held();
+
+ cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
+ if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL))
+ return false;
+
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(victim_iter, &victim->components, class_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ if (cpumask_empty(&victim_iter->affinity)) {
+ pr_debug("class %u has CPU-less component %u - can't match L3!\n",
+ victim->level, victim_iter->comp_id);
+ return false;
+ }
+
+ cpu = cpumask_any_and(&victim_iter->affinity, cpu_online_mask);
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ return false;
+
+ cpumask_clear(tmp_cpumask);
+ err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
+ if (err) {
+ pr_debug("Failed to find L3's equivalent component to class %u component %u\n",
+ victim->level, victim_iter->comp_id);
+ return false;
+ }
+
+ /* Any differing bits in the affinity mask? */
+ if (!cpumask_equal(tmp_cpumask, &victim_iter->affinity)) {
+ pr_debug("class %u component %u has Mismatched CPU mask with L3 equivalent\n"
+ "L3:%*pbl != victim:%*pbl\n",
+ victim->level, victim_iter->comp_id,
+ cpumask_pr_args(tmp_cpumask),
+ cpumask_pr_args(&victim_iter->affinity));
+
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Test if the traffic for a class matches that at egress from the L3. For
+ * MSC at memory controllers this is only possible if there is a single L3
+ * as otherwise the counters at the memory can include bandwidth from the
+ * non-local L3.
+ */
+static bool traffic_matches_l3(struct mpam_class *class)
+{
+ int err, cpu;
+
+ lockdep_assert_cpus_held();
+
+ /* The L3 trivially matches itself. */
+ if (class->type == MPAM_CLASS_CACHE && class->level == 3)
+ return true;
+
+ if (class->type == MPAM_CLASS_CACHE && class->level != 3) {
+ pr_debug("class %u is a different cache from L3\n", class->level);
+ return false;
+ }
+
+ if (class->type != MPAM_CLASS_MEMORY) {
+ pr_debug("class %u is neither of type cache or memory\n", class->level);
+ return false;
+ }
+
+ cpumask_var_t __free(free_cpumask_var) tmp_cpumask = CPUMASK_VAR_NULL;
+ if (!alloc_cpumask_var(&tmp_cpumask, GFP_KERNEL)) {
+ pr_debug("cpumask allocation failed\n");
+ return false;
+ }
+
+ cpu = cpumask_any_and(&class->affinity, cpu_online_mask);
+ /*
+ * Guard against an affinity mask with no online CPUs, as
+ * topology_matches_l3() does. cpu would otherwise be passed
+ * out-of-range to the cacheinfo helpers below.
+ */
+ if (WARN_ON_ONCE(cpu >= nr_cpu_ids))
+ return false;
+
+ err = find_l3_equivalent_bitmask(cpu, tmp_cpumask);
+ if (err) {
+ pr_debug("Failed to find L3 downstream to cpu %d\n", cpu);
+ return false;
+ }
+
+ /* A single L3 must span all possible CPUs. */
+ if (!cpumask_equal(tmp_cpumask, cpu_possible_mask)) {
+ pr_debug("There is more than one L3\n");
+ return false;
+ }
+
+ /* Be strict; the traffic might stop in the intermediate cache. */
+ if (get_cpu_cacheinfo_id(cpu, 4) != -1) {
+ pr_debug("L3 isn't the last level of cache\n");
+ return false;
+ }
+
+ if (num_possible_nodes() > 1) {
+ pr_debug("There is more than one numa node\n");
+ return false;
+ }
+
+#ifdef CONFIG_HMEM_REPORTING
+ /* A memory-side cache would also absorb some L3 egress traffic. */
+ if (node_devices[cpu_to_node(cpu)]->cache_dev) {
+ pr_debug("There is a memory side cache\n");
+ return false;
+ }
+#endif
+
+ return true;
+}
+
+/* Test whether we can export MPAM_CLASS_CACHE:{2,3}? */
+static void mpam_resctrl_pick_caches(void)
+{
+ struct mpam_class *class;
+ struct mpam_resctrl_res *res;
+
+ lockdep_assert_cpus_held();
+
+ /*
+ * A cache class is usable as an L2/L3 resctrl resource if it has
+ * cache portion bitmap (CPOR) controls and covers every possible CPU.
+ */
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(class, &mpam_classes, classes_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ if (class->type != MPAM_CLASS_CACHE) {
+ pr_debug("class %u is not a cache\n", class->level);
+ continue;
+ }
+
+ if (class->level != 2 && class->level != 3) {
+ pr_debug("class %u is not L2 or L3\n", class->level);
+ continue;
+ }
+
+ if (!cache_has_usable_cpor(class)) {
+ pr_debug("class %u cache misses CPOR\n", class->level);
+ continue;
+ }
+
+ if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
+ pr_debug("class %u has missing CPUs, mask %*pb != %*pb\n", class->level,
+ cpumask_pr_args(&class->affinity),
+ cpumask_pr_args(cpu_possible_mask));
+ continue;
+ }
+
+ /* Last suitable class for each level wins. */
+ if (class->level == 2)
+ res = &mpam_resctrl_controls[RDT_RESOURCE_L2];
+ else
+ res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+ res->class = class;
+ }
+}
+
+/* Pick the class that will back the RDT_RESOURCE_MBA bandwidth control. */
+static void mpam_resctrl_pick_mba(void)
+{
+ struct mpam_class *class, *candidate_class = NULL;
+ struct mpam_resctrl_res *res;
+
+ lockdep_assert_cpus_held();
+
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(class, &mpam_classes, classes_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ struct mpam_props *cprops = &class->props;
+
+ /* Caches other than the L3 can't stand in for L3 bandwidth. */
+ if (class->level != 3 && class->type == MPAM_CLASS_CACHE) {
+ pr_debug("class %u is a cache but not the L3\n", class->level);
+ continue;
+ }
+
+ if (!class_has_usable_mba(cprops)) {
+ pr_debug("class %u has no bandwidth control\n",
+ class->level);
+ continue;
+ }
+
+ if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
+ pr_debug("class %u has missing CPUs\n", class->level);
+ continue;
+ }
+
+ /* Domains must be mappable to L3 cache-ids... */
+ if (!topology_matches_l3(class)) {
+ pr_debug("class %u topology doesn't match L3\n",
+ class->level);
+ continue;
+ }
+
+ /* ...and the counted traffic must be the L3 egress traffic. */
+ if (!traffic_matches_l3(class)) {
+ pr_debug("class %u traffic doesn't match L3 egress\n",
+ class->level);
+ continue;
+ }
+
+ /*
+ * Pick a resource to be MBA that as close as possible to
+ * the L3. mbm_total counts the bandwidth leaving the L3
+ * cache and MBA should correspond as closely as possible
+ * for proper operation of mba_sc.
+ */
+ if (!candidate_class || class->level < candidate_class->level)
+ candidate_class = class;
+ }
+
+ if (candidate_class) {
+ pr_debug("selected class %u to back MBA\n",
+ candidate_class->level);
+ res = &mpam_resctrl_controls[RDT_RESOURCE_MBA];
+ res->class = candidate_class;
+ }
+}
+
+/*
+ * Record @class as the backing class for @evt_id, unless a previously
+ * selected class is preferable.
+ */
+static void counter_update_class(enum resctrl_event_id evt_id,
+ struct mpam_class *class)
+{
+ struct mpam_class *existing_class = mpam_resctrl_counters[evt_id].class;
+
+ if (existing_class) {
+ /*
+ * NOTE(review): this tests the NEW class's level, but the
+ * message below says the EXISTING class is the L3 and wins.
+ * One of the condition or the message looks wrong - confirm
+ * whether this was meant to be existing_class->level == 3.
+ */
+ if (class->level == 3) {
+ pr_debug("Existing class is L3 - L3 wins\n");
+ return;
+ }
+
+ if (existing_class->level < class->level) {
+ pr_debug("Existing class is closer to L3, %u versus %u - closer is better\n",
+ existing_class->level, class->level);
+ return;
+ }
+ }
+
+ mpam_resctrl_counters[evt_id].class = class;
+}
+
+/* Pick classes to back the resctrl monitor events. */
+static void mpam_resctrl_pick_counters(void)
+{
+ struct mpam_class *class;
+
+ lockdep_assert_cpus_held();
+
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(class, &mpam_classes, classes_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ /* The name of the resource is L3... */
+ if (class->type == MPAM_CLASS_CACHE && class->level != 3) {
+ pr_debug("class %u is a cache but not the L3", class->level);
+ continue;
+ }
+
+ if (!cpumask_equal(&class->affinity, cpu_possible_mask)) {
+ pr_debug("class %u does not cover all CPUs",
+ class->level);
+ continue;
+ }
+
+ if (cache_has_usable_csu(class)) {
+ pr_debug("class %u has usable CSU",
+ class->level);
+
+ /* CSU counters only make sense on a cache. */
+ switch (class->type) {
+ case MPAM_CLASS_CACHE:
+ /* Skip the class if the RMID limits can't be met. */
+ if (update_rmid_limits(class))
+ break;
+
+ counter_update_class(QOS_L3_OCCUP_EVENT_ID, class);
+ break;
+ default:
+ break;
+ }
+ }
+ }
+}
+
+/*
+ * Fill in the rdt_resource for @res from the properties of its backing
+ * MPAM class. Returns -EINVAL for an unexpected resource id.
+ */
+static int mpam_resctrl_control_init(struct mpam_resctrl_res *res)
+{
+ struct mpam_class *class = res->class;
+ struct mpam_props *cprops = &class->props;
+ struct rdt_resource *r = &res->resctrl_res;
+
+ switch (r->rid) {
+ case RDT_RESOURCE_L2:
+ case RDT_RESOURCE_L3:
+ r->schema_fmt = RESCTRL_SCHEMA_BITMAP;
+ r->cache.arch_has_sparse_bitmasks = true;
+
+ /* Bitmap width comes from the class's CPOR properties. */
+ r->cache.cbm_len = class->props.cpbm_wd;
+ /* mpam_devices will reject empty bitmaps */
+ r->cache.min_cbm_bits = 1;
+
+ if (r->rid == RDT_RESOURCE_L2) {
+ r->name = "L2";
+ r->ctrl_scope = RESCTRL_L2_CACHE;
+ r->cdp_capable = true;
+ } else {
+ r->name = "L3";
+ r->ctrl_scope = RESCTRL_L3_CACHE;
+ r->cdp_capable = true;
+ }
+
+ /*
+ * Which bits are shared with other ...things... Unknown
+ * devices use partid-0 which uses all the bitmap fields. Until
+ * we have configured the SMMU and GIC not to do this 'all the
+ * bits' is the correct answer here.
+ */
+ r->cache.shareable_bits = resctrl_get_default_ctrl(r);
+ r->alloc_capable = true;
+ break;
+ case RDT_RESOURCE_MBA:
+ r->schema_fmt = RESCTRL_SCHEMA_RANGE;
+ r->ctrl_scope = RESCTRL_L3_CACHE;
+
+ /* Bandwidth is expressed as a linear percentage range. */
+ r->membw.delay_linear = true;
+ r->membw.throttle_mode = THREAD_THROTTLE_UNDEFINED;
+ r->membw.min_bw = get_mba_min(cprops);
+ r->membw.max_bw = MAX_MBA_BW;
+ r->membw.bw_gran = get_mba_granularity(cprops);
+
+ r->name = "MB";
+ r->alloc_capable = true;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Choose the resctrl domain-id for @comp. Caches use their own component
+ * id; other classes whose topology lines up with the L3 borrow the L3
+ * cache-id so all resources share domain numbering.
+ */
+static int mpam_resctrl_pick_domain_id(int cpu, struct mpam_component *comp)
+{
+ struct mpam_class *class = comp->class;
+ int cache_id;
+
+ /* Cache components already use a cache-id as their component id. */
+ if (class->type == MPAM_CLASS_CACHE)
+ return comp->comp_id;
+
+ /* Not L3-shaped: expose the ID used by the firmware table code. */
+ if (!topology_matches_l3(class))
+ return comp->comp_id;
+
+ /* Use the corresponding L3 component ID as the domain ID. */
+ cache_id = get_cpu_cacheinfo_id(cpu, 3);
+
+ /* Implies topology_matches_l3() made a mistake */
+ if (WARN_ON_ONCE(cache_id == -1))
+ return comp->comp_id;
+
+ return cache_id;
+}
+
+/*
+ * Wire monitor event @type into the L3 rdt_resource. Called once per
+ * event type that has a monitoring class.
+ */
+static int mpam_resctrl_monitor_init(struct mpam_resctrl_mon *mon,
+ enum resctrl_event_id type)
+{
+ struct mpam_resctrl_res *res = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+ struct rdt_resource *l3 = &res->resctrl_res;
+
+ lockdep_assert_cpus_held();
+
+ /*
+ * There also needs to be an L3 cache present.
+ * The check just requires any online CPU and it can't go offline as we
+ * hold the cpu lock.
+ */
+ if (get_cpu_cacheinfo_id(raw_smp_processor_id(), 3) == -1)
+ return 0;
+
+ /*
+ * If there are no MPAM resources on L3, force it into existence.
+ * topology_matches_l3() already ensures this looks like the L3.
+ * The domain-ids will be fixed up by mpam_resctrl_domain_hdr_init().
+ */
+ if (!res->class) {
+ pr_warn_once("Faking L3 MSC to enable counters.\n");
+ res->class = mpam_resctrl_counters[type].class;
+ }
+
+ /*
+ * Called multiple times!, once per event type that has a
+ * monitoring class.
+ * Setting name is necessary on monitor only platforms.
+ */
+ l3->name = "L3";
+ l3->mon_scope = RESCTRL_L3_CACHE;
+
+ /*
+ * num-rmid is the upper bound for the number of monitoring groups that
+ * can exist simultaneously, including the default monitoring group for
+ * each control group. Hence, advertise the whole rmid_idx space even
+ * though each control group has its own pmg/rmid space. Unfortunately,
+ * this does mean userspace needs to know the architecture to correctly
+ * interpret this value.
+ */
+ l3->mon.num_rmid = resctrl_arch_system_num_rmid_idx();
+
+ /*
+ * NOTE(review): mon_capable is set when resctrl_enable_mon_event()
+ * returns nonzero - confirm it returns success/true rather than a
+ * negative error code here.
+ */
+ if (resctrl_enable_mon_event(type, false, 0, NULL))
+ l3->mon_capable = true;
+
+ return 0;
+}
+
+/*
+ * Read back the configured control value for (@closid, @type) in domain @d,
+ * translated to resctrl's schema format. Returns the resource default if
+ * nothing has been configured or the resource isn't alloc capable.
+ */
+u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type type)
+{
+ u32 partid;
+ struct mpam_config *cfg;
+ struct mpam_props *cprops;
+ struct mpam_resctrl_res *res;
+ struct mpam_resctrl_dom *dom;
+ enum mpam_device_features configured_by;
+
+ lockdep_assert_cpus_held();
+
+ if (!mpam_is_enabled())
+ return resctrl_get_default_ctrl(r);
+
+ res = container_of(r, struct mpam_resctrl_res, resctrl_res);
+ dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom);
+ cprops = &res->class->props;
+
+ /*
+ * When CDP is enabled, but the resource doesn't support it,
+ * the control is cloned across both partids.
+ * Pick one at random to read:
+ */
+ if (mpam_resctrl_hide_cdp(r->rid))
+ type = CDP_DATA;
+
+ partid = resctrl_get_config_index(closid, type);
+ cfg = &dom->ctrl_comp->cfg[partid];
+
+ /* Which MPAM feature holds this resource's configuration? */
+ switch (r->rid) {
+ case RDT_RESOURCE_L2:
+ case RDT_RESOURCE_L3:
+ configured_by = mpam_feat_cpor_part;
+ break;
+ case RDT_RESOURCE_MBA:
+ if (mpam_has_feature(mpam_feat_mbw_max, cprops)) {
+ configured_by = mpam_feat_mbw_max;
+ break;
+ }
+ fallthrough;
+ default:
+ return resctrl_get_default_ctrl(r);
+ }
+
+ if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r) ||
+ !mpam_has_feature(configured_by, cfg))
+ return resctrl_get_default_ctrl(r);
+
+ switch (configured_by) {
+ case mpam_feat_cpor_part:
+ return cfg->cpbm;
+ case mpam_feat_mbw_max:
+ /* Convert fixed-point mbw_max back to a percentage. */
+ return mbw_max_to_percent(cfg->mbw_max, cprops);
+ default:
+ return resctrl_get_default_ctrl(r);
+ }
+}
+
+/*
+ * Apply a single control value @cfg_val for (@closid, @t) to domain @d.
+ * Returns 0 on success or a negative error code.
+ */
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d,
+ u32 closid, enum resctrl_conf_type t, u32 cfg_val)
+{
+ int err;
+ u32 partid;
+ struct mpam_config cfg;
+ struct mpam_props *cprops;
+ struct mpam_resctrl_res *res;
+ struct mpam_resctrl_dom *dom;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_irqs_enabled();
+
+ if (!mpam_is_enabled())
+ return -EINVAL;
+
+ /*
+ * No need to check the CPU as mpam_apply_config() doesn't care, and
+ * resctrl_arch_update_domains() relies on this.
+ */
+ res = container_of(r, struct mpam_resctrl_res, resctrl_res);
+ dom = container_of(d, struct mpam_resctrl_dom, resctrl_ctrl_dom);
+ cprops = &res->class->props;
+
+ if (mpam_resctrl_hide_cdp(r->rid))
+ t = CDP_DATA;
+
+ partid = resctrl_get_config_index(closid, t);
+ if (!r->alloc_capable || partid >= resctrl_arch_get_num_closid(r)) {
+ pr_debug("Not alloc capable or computed PARTID out of range\n");
+ return -EINVAL;
+ }
+
+ /*
+ * Copy the current config to avoid clearing other resources when the
+ * same component is exposed multiple times through resctrl.
+ */
+ cfg = dom->ctrl_comp->cfg[partid];
+
+ /* Translate the resctrl schema value into the MPAM feature's format. */
+ switch (r->rid) {
+ case RDT_RESOURCE_L2:
+ case RDT_RESOURCE_L3:
+ cfg.cpbm = cfg_val;
+ mpam_set_feature(mpam_feat_cpor_part, &cfg);
+ break;
+ case RDT_RESOURCE_MBA:
+ if (mpam_has_feature(mpam_feat_mbw_max, cprops)) {
+ cfg.mbw_max = percent_to_mbw_max(cfg_val, cprops);
+ mpam_set_feature(mpam_feat_mbw_max, &cfg);
+ break;
+ }
+ fallthrough;
+ default:
+ return -EINVAL;
+ }
+
+ /*
+ * When CDP is enabled, but the resource doesn't support it, we need to
+ * apply the same configuration to the other partid.
+ */
+ if (mpam_resctrl_hide_cdp(r->rid)) {
+ partid = resctrl_get_config_index(closid, CDP_CODE);
+ err = mpam_apply_config(dom->ctrl_comp, partid, &cfg);
+ if (err)
+ return err;
+
+ partid = resctrl_get_config_index(closid, CDP_DATA);
+ return mpam_apply_config(dom->ctrl_comp, partid, &cfg);
+ }
+
+ return mpam_apply_config(dom->ctrl_comp, partid, &cfg);
+}
+
+/*
+ * Apply every staged configuration for @closid across all of @r's control
+ * domains. Stops and returns the first error encountered.
+ */
+int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
+{
+ int err;
+ struct rdt_ctrl_domain *d;
+
+ lockdep_assert_cpus_held();
+ lockdep_assert_irqs_enabled();
+
+ if (!mpam_is_enabled())
+ return -EINVAL;
+
+ list_for_each_entry_rcu(d, &r->ctrl_domains, hdr.list) {
+ for (enum resctrl_conf_type t = 0; t < CDP_NUM_TYPES; t++) {
+ struct resctrl_staged_config *cfg = &d->staged_config[t];
+
+ /* Only apply configurations resctrl has staged. */
+ if (!cfg->have_new_ctrl)
+ continue;
+
+ err = resctrl_arch_update_one(r, d, closid, t,
+ cfg->new_ctrl);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+/* Reset every control of @r's backing class to its default. */
+void resctrl_arch_reset_all_ctrls(struct rdt_resource *r)
+{
+ struct mpam_resctrl_res *res = container_of(r, struct mpam_resctrl_res,
+ resctrl_res);
+
+ lockdep_assert_cpus_held();
+
+ /* Nothing to reset if resctrl support never came up. */
+ if (!mpam_is_enabled())
+ return;
+
+ mpam_reset_class_locked(res->class);
+}
+
+/* Initialise a fresh domain header for @comp, with @cpu as its first CPU. */
+static void mpam_resctrl_domain_hdr_init(int cpu, struct mpam_component *comp,
+ enum resctrl_res_level rid,
+ struct rdt_domain_hdr *hdr)
+{
+ lockdep_assert_cpus_held();
+
+ INIT_LIST_HEAD(&hdr->list);
+ hdr->id = mpam_resctrl_pick_domain_id(cpu, comp);
+ hdr->rid = rid;
+ cpumask_set_cpu(cpu, &hdr->cpu_mask);
+}
+
+/* Add @cpu to an already-existing domain's CPU mask. */
+static void mpam_resctrl_online_domain_hdr(unsigned int cpu,
+ struct rdt_domain_hdr *hdr)
+{
+ lockdep_assert_cpus_held();
+
+ cpumask_set_cpu(cpu, &hdr->cpu_mask);
+}
+
+/**
+ * mpam_resctrl_offline_domain_hdr() - Update the domain header to remove a CPU.
+ * @cpu: The CPU to remove from the domain.
+ * @hdr: The domain's header.
+ *
+ * Removes @cpu from the header mask. If this was the last CPU in the domain,
+ * the domain header is removed from its parent list and true is returned,
+ * indicating the parent structure can be freed.
+ * If there are other CPUs in the domain, returns false.
+ */
+static bool mpam_resctrl_offline_domain_hdr(unsigned int cpu,
+ struct rdt_domain_hdr *hdr)
+{
+ lockdep_assert_held(&domain_list_lock);
+
+ cpumask_clear_cpu(cpu, &hdr->cpu_mask);
+ if (cpumask_empty(&hdr->cpu_mask)) {
+ list_del_rcu(&hdr->list);
+ /* Let concurrent RCU list walkers finish before the caller frees. */
+ synchronize_rcu();
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Insert @new into @list, keeping it sorted by domain id. The id must
+ * not already be present - resctrl_find_domain() returning a match is a
+ * caller bug.
+ */
+static void mpam_resctrl_domain_insert(struct list_head *list,
+ struct rdt_domain_hdr *new)
+{
+ struct rdt_domain_hdr *err;
+ struct list_head *pos = NULL;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ err = resctrl_find_domain(list, new->id, &pos);
+ if (WARN_ON_ONCE(err))
+ return;
+
+ list_add_tail_rcu(&new->list, pos);
+}
+
+/* Find the component of @class whose affinity mask covers @cpu. */
+static struct mpam_component *find_component(struct mpam_class *class, int cpu)
+{
+ struct mpam_component *iter;
+
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(iter, &class->components, class_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ if (!cpumask_test_cpu(cpu, &iter->affinity))
+ continue;
+
+ return iter;
+ }
+
+ return NULL;
+}
+
+/*
+ * Allocate and online the resctrl domain covering @cpu for resource @res.
+ * Returns the new domain, or an ERR_PTR() on failure.
+ */
+static struct mpam_resctrl_dom *
+mpam_resctrl_alloc_domain(unsigned int cpu, struct mpam_resctrl_res *res)
+{
+ int err;
+ struct mpam_resctrl_dom *dom;
+ struct rdt_l3_mon_domain *mon_d;
+ struct rdt_ctrl_domain *ctrl_d;
+ struct mpam_class *class = res->class;
+ struct mpam_component *comp_iter, *ctrl_comp;
+ struct rdt_resource *r = &res->resctrl_res;
+
+ lockdep_assert_held(&domain_list_lock);
+
+ /* Find the control component whose affinity covers this CPU. */
+ ctrl_comp = NULL;
+ guard(srcu)(&mpam_srcu);
+ list_for_each_entry_srcu(comp_iter, &class->components, class_list,
+ srcu_read_lock_held(&mpam_srcu)) {
+ if (cpumask_test_cpu(cpu, &comp_iter->affinity)) {
+ ctrl_comp = comp_iter;
+ break;
+ }
+ }
+
+ /* class has no component for this CPU */
+ if (WARN_ON_ONCE(!ctrl_comp))
+ return ERR_PTR(-EINVAL);
+
+ dom = kzalloc_node(sizeof(*dom), GFP_KERNEL, cpu_to_node(cpu));
+ if (!dom)
+ return ERR_PTR(-ENOMEM);
+
+ if (r->alloc_capable) {
+ dom->ctrl_comp = ctrl_comp;
+
+ ctrl_d = &dom->resctrl_ctrl_dom;
+ mpam_resctrl_domain_hdr_init(cpu, ctrl_comp, r->rid, &ctrl_d->hdr);
+ ctrl_d->hdr.type = RESCTRL_CTRL_DOMAIN;
+ err = resctrl_online_ctrl_domain(r, ctrl_d);
+ if (err)
+ goto free_domain;
+
+ mpam_resctrl_domain_insert(&r->ctrl_domains, &ctrl_d->hdr);
+ } else {
+ pr_debug("Skipped control domain online - no controls\n");
+ }
+
+ if (r->mon_capable) {
+ /*
+ * Must be initialised: if no monitor class yields a
+ * component for this CPU it would otherwise be read
+ * uninitialised below.
+ */
+ struct mpam_component *any_mon_comp = NULL;
+ struct mpam_resctrl_mon *mon;
+ enum resctrl_event_id eventid;
+
+ /*
+ * Even if the monitor domain is backed by a different
+ * component, the L3 component IDs need to be used... only
+ * there may be no ctrl_comp for the L3.
+ * Search each event's class list for a component with
+ * overlapping CPUs and set up the dom->mon_comp array.
+ */
+
+ for_each_mpam_resctrl_mon(mon, eventid) {
+ struct mpam_component *mon_comp;
+
+ if (!mon->class)
+ continue; // dummy resource
+
+ mon_comp = find_component(mon->class, cpu);
+ dom->mon_comp[eventid] = mon_comp;
+ if (mon_comp)
+ any_mon_comp = mon_comp;
+ }
+ if (!any_mon_comp) {
+ /* WARN_ON_ONCE(0) could never fire - warn for real. */
+ WARN_ON_ONCE(1);
+ err = -EFAULT;
+ goto offline_ctrl_domain;
+ }
+
+ mon_d = &dom->resctrl_mon_dom;
+ mpam_resctrl_domain_hdr_init(cpu, any_mon_comp, r->rid, &mon_d->hdr);
+ mon_d->hdr.type = RESCTRL_MON_DOMAIN;
+ err = resctrl_online_mon_domain(r, &mon_d->hdr);
+ if (err)
+ goto offline_ctrl_domain;
+
+ mpam_resctrl_domain_insert(&r->mon_domains, &mon_d->hdr);
+ } else {
+ pr_debug("Skipped monitor domain online - no monitors\n");
+ }
+
+ return dom;
+
+offline_ctrl_domain:
+ if (r->alloc_capable) {
+ mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr);
+ resctrl_offline_ctrl_domain(r, ctrl_d);
+ }
+free_domain:
+ kfree(dom);
+ dom = ERR_PTR(err);
+
+ return dom;
+}
+
+/*
+ * We know all the monitors are associated with the L3, even if there are no
+ * controls and therefore no control component. Find the cache-id for the CPU
+ * and use that to search for existing resctrl domains.
+ * This relies on mpam_resctrl_pick_domain_id() using the L3 cache-id
+ * for anything that is not a cache.
+ */
+static struct mpam_resctrl_dom *mpam_resctrl_get_mon_domain_from_cpu(int cpu)
+{
+ int cache_id;
+ struct mpam_resctrl_dom *dom;
+ struct mpam_resctrl_res *l3 = &mpam_resctrl_controls[RDT_RESOURCE_L3];
+
+ lockdep_assert_cpus_held();
+
+ /* No L3 resource, or no L3 cache visible to this CPU. */
+ if (!l3->class)
+ return NULL;
+ cache_id = get_cpu_cacheinfo_id(cpu, 3);
+ if (cache_id < 0)
+ return NULL;
+
+ list_for_each_entry_rcu(dom, &l3->resctrl_res.mon_domains, resctrl_mon_dom.hdr.list) {
+ if (dom->resctrl_mon_dom.hdr.id == cache_id)
+ return dom;
+ }
+
+ return NULL;
+}
+
+/*
+ * Find the existing domain of @res covering @cpu, or NULL if no such
+ * domain has been created yet.
+ */
+static struct mpam_resctrl_dom *
+mpam_resctrl_get_domain_from_cpu(int cpu, struct mpam_resctrl_res *res)
+{
+ struct mpam_resctrl_dom *dom;
+ struct rdt_resource *r = &res->resctrl_res;
+
+ lockdep_assert_cpus_held();
+
+ list_for_each_entry_rcu(dom, &r->ctrl_domains, resctrl_ctrl_dom.hdr.list) {
+ if (cpumask_test_cpu(cpu, &dom->ctrl_comp->affinity))
+ return dom;
+ }
+
+ if (r->rid != RDT_RESOURCE_L3)
+ return NULL;
+
+ /* Search the mon domain list too - needed on monitor only platforms. */
+ return mpam_resctrl_get_mon_domain_from_cpu(cpu);
+}
+
+/*
+ * cpuhp online callback: add @cpu to the domains of every resource,
+ * allocating any domain that doesn't exist yet.
+ */
+int mpam_resctrl_online_cpu(unsigned int cpu)
+{
+ struct mpam_resctrl_res *res;
+ enum resctrl_res_level rid;
+
+ guard(mutex)(&domain_list_lock);
+ for_each_mpam_resctrl_control(res, rid) {
+ struct mpam_resctrl_dom *dom;
+ struct rdt_resource *r = &res->resctrl_res;
+
+ if (!res->class)
+ continue; // dummy_resource;
+
+ dom = mpam_resctrl_get_domain_from_cpu(cpu, res);
+ if (!dom) {
+ /* First CPU of this domain - create it. */
+ dom = mpam_resctrl_alloc_domain(cpu, res);
+ /*
+ * NOTE(review): returning here leaves resources already
+ * processed in this loop onlined for @cpu - confirm the
+ * cpuhp core unwinds this.
+ */
+ if (IS_ERR(dom))
+ return PTR_ERR(dom);
+ } else {
+ if (r->alloc_capable) {
+ struct rdt_ctrl_domain *ctrl_d = &dom->resctrl_ctrl_dom;
+
+ mpam_resctrl_online_domain_hdr(cpu, &ctrl_d->hdr);
+ }
+ if (r->mon_capable) {
+ struct rdt_l3_mon_domain *mon_d = &dom->resctrl_mon_dom;
+
+ mpam_resctrl_online_domain_hdr(cpu, &mon_d->hdr);
+ }
+ }
+ }
+
+ resctrl_online_cpu(cpu);
+
+ return 0;
+}
+
+/*
+ * cpuhp offline callback: remove @cpu from every resource's domains,
+ * freeing a domain once its last CPU has gone.
+ */
+void mpam_resctrl_offline_cpu(unsigned int cpu)
+{
+ struct mpam_resctrl_res *res;
+ enum resctrl_res_level rid;
+
+ resctrl_offline_cpu(cpu);
+
+ guard(mutex)(&domain_list_lock);
+ for_each_mpam_resctrl_control(res, rid) {
+ struct mpam_resctrl_dom *dom;
+ struct rdt_l3_mon_domain *mon_d;
+ struct rdt_ctrl_domain *ctrl_d;
+ bool ctrl_dom_empty, mon_dom_empty;
+ struct rdt_resource *r = &res->resctrl_res;
+
+ if (!res->class)
+ continue; // dummy resource
+
+ dom = mpam_resctrl_get_domain_from_cpu(cpu, res);
+ if (WARN_ON_ONCE(!dom))
+ continue;
+
+ if (r->alloc_capable) {
+ ctrl_d = &dom->resctrl_ctrl_dom;
+ ctrl_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &ctrl_d->hdr);
+ if (ctrl_dom_empty)
+ resctrl_offline_ctrl_domain(&res->resctrl_res, ctrl_d);
+ } else {
+ ctrl_dom_empty = true;
+ }
+
+ if (r->mon_capable) {
+ mon_d = &dom->resctrl_mon_dom;
+ mon_dom_empty = mpam_resctrl_offline_domain_hdr(cpu, &mon_d->hdr);
+ if (mon_dom_empty)
+ resctrl_offline_mon_domain(&res->resctrl_res, &mon_d->hdr);
+ } else {
+ mon_dom_empty = true;
+ }
+
+ /* Only free once neither the ctrl nor mon side references it. */
+ if (ctrl_dom_empty && mon_dom_empty)
+ kfree(dom);
+ }
+}
+
+/*
+ * Probe the MPAM classes and bring up resctrl on top of them.
+ * Returns 0 on success, -EOPNOTSUPP when no usable controls or monitors
+ * were found, or another negative error code.
+ */
+int mpam_resctrl_setup(void)
+{
+ int err = 0;
+ struct mpam_resctrl_res *res;
+ enum resctrl_res_level rid;
+ struct mpam_resctrl_mon *mon;
+ enum resctrl_event_id eventid;
+
+ /* Class selection below depends on cacheinfo being populated. */
+ wait_event(wait_cacheinfo_ready, cacheinfo_ready);
+
+ cpus_read_lock();
+ for_each_mpam_resctrl_control(res, rid) {
+ INIT_LIST_HEAD_RCU(&res->resctrl_res.ctrl_domains);
+ INIT_LIST_HEAD_RCU(&res->resctrl_res.mon_domains);
+ res->resctrl_res.rid = rid;
+ }
+
+ /* Find some classes to use for controls */
+ mpam_resctrl_pick_caches();
+ mpam_resctrl_pick_mba();
+
+ /* Initialise the resctrl structures from the classes */
+ for_each_mpam_resctrl_control(res, rid) {
+ if (!res->class)
+ continue; // dummy resource
+
+ err = mpam_resctrl_control_init(res);
+ if (err) {
+ pr_debug("Failed to initialise rid %u\n", rid);
+ goto internal_error;
+ }
+ }
+
+ /* Find some classes to use for monitors */
+ mpam_resctrl_pick_counters();
+
+ for_each_mpam_resctrl_mon(mon, eventid) {
+ if (!mon->class)
+ continue; // dummy resource
+
+ err = mpam_resctrl_monitor_init(mon, eventid);
+ if (err) {
+ pr_debug("Failed to initialise event %u\n", eventid);
+ goto internal_error;
+ }
+ }
+
+ cpus_read_unlock();
+
+ if (!resctrl_arch_alloc_capable() && !resctrl_arch_mon_capable()) {
+ pr_debug("No alloc(%u) or monitor(%u) found - resctrl not supported\n",
+ resctrl_arch_alloc_capable(), resctrl_arch_mon_capable());
+ return -EOPNOTSUPP;
+ }
+
+ err = resctrl_init();
+ if (err)
+ return err;
+
+ /* Let mpam_resctrl_exit() know there is something to tear down. */
+ WRITE_ONCE(resctrl_enabled, true);
+
+ return 0;
+
+internal_error:
+ cpus_read_unlock();
+ pr_debug("Internal error %d - resctrl not supported\n", err);
+ return err;
+}
+
+/* Tear down resctrl, if mpam_resctrl_setup() brought it up. */
+void mpam_resctrl_exit(void)
+{
+ bool was_enabled = READ_ONCE(resctrl_enabled);
+
+ if (!was_enabled)
+ return;
+
+ WRITE_ONCE(resctrl_enabled, false);
+ resctrl_exit();
+}
+
+/*
+ * The driver is detaching an MSC from this class, if resctrl was using it,
+ * pull on resctrl_exit().
+ */
+void mpam_resctrl_teardown_class(struct mpam_class *class)
+{
+ struct mpam_resctrl_res *res;
+ enum resctrl_res_level rid;
+ struct mpam_resctrl_mon *mon;
+ enum resctrl_event_id eventid;
+
+ might_sleep();
+
+ /*
+ * Don't stop at the first match: the same class can back more
+ * than one resource (e.g. the L3 cache class may back both
+ * RDT_RESOURCE_L3 and RDT_RESOURCE_MBA) and more than one event.
+ */
+ for_each_mpam_resctrl_control(res, rid) {
+ if (res->class == class)
+ res->class = NULL;
+ }
+ for_each_mpam_resctrl_mon(mon, eventid) {
+ if (mon->class == class)
+ mon->class = NULL;
+ }
+}
+
+/*
+ * Release mpam_resctrl_setup()'s wait once cacheinfo has been populated
+ * (device_initcall_sync runs after the cacheinfo initcalls).
+ */
+static int __init __cacheinfo_ready(void)
+{
+ cacheinfo_ready = true;
+ wake_up(&wait_cacheinfo_ready);
+
+ return 0;
+}
+device_initcall_sync(__cacheinfo_ready);
+
+#ifdef CONFIG_MPAM_KUNIT_TEST
+#include "test_mpam_resctrl.c"
+#endif
diff --git a/drivers/resctrl/test_mpam_resctrl.c b/drivers/resctrl/test_mpam_resctrl.c
new file mode 100644
index 000000000000..b93d6ad87e43
--- /dev/null
+++ b/drivers/resctrl/test_mpam_resctrl.c
@@ -0,0 +1,315 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2025 Arm Ltd.
+/* This file is intended to be included into mpam_resctrl.c */
+
+#include <kunit/test.h>
+#include <linux/array_size.h>
+#include <linux/bits.h>
+#include <linux/math.h>
+#include <linux/sprintf.h>
+
+/* One reference conversion: @pc percent at @width bits gives @value. */
+struct percent_value_case {
+ u8 pc; /* percentage */
+ u8 width; /* bwa_wd: implemented bits of the fixed-point fraction */
+ u16 value; /* expected raw fixed-point MBW_MAX value */
+};
+
+/*
+ * Mysterious inscriptions taken from the union of ARM DDI 0598D.b,
+ * "Arm Architecture Reference Manual Supplement - Memory System
+ * Resource Partitioning and Monitoring (MPAM), for A-profile
+ * architecture", Section 9.8, "About the fixed-point fractional
+ * format" (exact percentage entries only) and ARM IHI0099B.a
+ * "MPAM system component specification", Section 9.3,
+ * "The fixed-point fractional format":
+ */
+static const struct percent_value_case percent_value_cases[] = {
+ /* Architectural cases: */
+ { 1, 8, 1 }, { 1, 12, 0x27 }, { 1, 16, 0x28e },
+ { 25, 8, 0x3f }, { 25, 12, 0x3ff }, { 25, 16, 0x3fff },
+ { 33, 8, 0x53 }, { 33, 12, 0x546 }, { 33, 16, 0x5479 },
+ { 35, 8, 0x58 }, { 35, 12, 0x598 }, { 35, 16, 0x5998 },
+ { 45, 8, 0x72 }, { 45, 12, 0x732 }, { 45, 16, 0x7332 },
+ { 50, 8, 0x7f }, { 50, 12, 0x7ff }, { 50, 16, 0x7fff },
+ { 52, 8, 0x84 }, { 52, 12, 0x850 }, { 52, 16, 0x851d },
+ { 55, 8, 0x8b }, { 55, 12, 0x8cb }, { 55, 16, 0x8ccb },
+ { 58, 8, 0x93 }, { 58, 12, 0x946 }, { 58, 16, 0x9479 },
+ { 75, 8, 0xbf }, { 75, 12, 0xbff }, { 75, 16, 0xbfff },
+ { 80, 8, 0xcb }, { 80, 12, 0xccb }, { 80, 16, 0xcccb },
+ { 88, 8, 0xe0 }, { 88, 12, 0xe13 }, { 88, 16, 0xe146 },
+ { 95, 8, 0xf2 }, { 95, 12, 0xf32 }, { 95, 16, 0xf332 },
+ { 100, 8, 0xff }, { 100, 12, 0xfff }, { 100, 16, 0xffff },
+};
+
+/* Render one table entry as a human-readable parameterised-test name. */
+static void test_percent_value_desc(const struct percent_value_case *param,
+ char *desc)
+{
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE,
+ "pc=%d, width=%d, value=0x%.*x\n",
+ param->pc, param->width,
+ DIV_ROUND_UP(param->width, 4), param->value);
+}
+
+KUNIT_ARRAY_PARAM(test_percent_value, percent_value_cases,
+ test_percent_value_desc);
+
+struct percent_value_test_info {
+ u32 pc; /* result of value-to-percent conversion */
+ u32 value; /* result of percent-to-value conversion */
+ u32 max_value; /* maximum raw value allowed by test params */
+ unsigned int shift; /* promotes raw testcase value to 16 bits */
+};
+
+/*
+ * Convert a reference percentage to a fixed-point MAX value and
+ * vice-versa, based on param (not test->param_value!)
+ */
+static void __prepare_percent_value_test(struct kunit *test,
+ struct percent_value_test_info *res,
+ const struct percent_value_case *param)
+{
+ struct mpam_props fake_props = { };
+
+ /* Reject bogus test parameters that would break the tests: */
+ KUNIT_ASSERT_GE(test, param->width, 1);
+ KUNIT_ASSERT_LE(test, param->width, 16);
+ KUNIT_ASSERT_LT(test, param->value, 1 << param->width);
+
+ mpam_set_feature(mpam_feat_mbw_max, &fake_props);
+ fake_props.bwa_wd = param->width;
+
+ /* Table values are @width bits; the driver works in 16-bit values. */
+ res->shift = 16 - param->width;
+ res->max_value = GENMASK_U32(param->width - 1, 0);
+ res->value = percent_to_mbw_max(param->pc, &fake_props);
+ res->pc = mbw_max_to_percent(param->value << res->shift, &fake_props);
+}
+
+/* Check get_mba_granularity()/mba_class_use_mbw_max() across bwa_wd widths. */
+static void test_get_mba_granularity(struct kunit *test)
+{
+ int ret;
+ struct mpam_props fake_props = { };
+
+ /* Use MBW_MAX */
+ mpam_set_feature(mpam_feat_mbw_max, &fake_props);
+
+ fake_props.bwa_wd = 0;
+ KUNIT_EXPECT_FALSE(test, mba_class_use_mbw_max(&fake_props));
+
+ fake_props.bwa_wd = 1;
+ KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props));
+
+ /* Architectural maximum: */
+ fake_props.bwa_wd = 16;
+ KUNIT_EXPECT_TRUE(test, mba_class_use_mbw_max(&fake_props));
+
+ /* No usable control... */
+ fake_props.bwa_wd = 0;
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 0);
+
+ fake_props.bwa_wd = 1;
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 50); /* DIV_ROUND_UP(100, 1 << 1)% = 50% */
+
+ fake_props.bwa_wd = 2;
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 25); /* DIV_ROUND_UP(100, 1 << 2)% = 25% */
+
+ fake_props.bwa_wd = 3;
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 13); /* DIV_ROUND_UP(100, 1 << 3)% = 13% */
+
+ fake_props.bwa_wd = 6;
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 2); /* DIV_ROUND_UP(100, 1 << 6)% = 2% */
+
+ fake_props.bwa_wd = 7;
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 7)% = 1% */
+
+ /* Granularity saturates at 1% */
+ fake_props.bwa_wd = 16; /* architectural maximum */
+ ret = get_mba_granularity(&fake_props);
+ KUNIT_EXPECT_EQ(test, ret, 1); /* DIV_ROUND_UP(100, 1 << 16)% = 1% */
+}
+
+/* mbw_max_to_percent() must reproduce each table entry's percentage. */
+static void test_mbw_max_to_percent(struct kunit *test)
+{
+ const struct percent_value_case *param = test->param_value;
+ struct percent_value_test_info res;
+
+ /*
+ * Since the reference values in percent_value_cases[] all
+ * correspond to exact percentages, round-to-nearest will
+ * always give the exact percentage back when the MPAM max
+ * value has precision of 0.5% or finer. (Always true for the
+ * reference data, since they all specify 8 bits or more of
+ * precision.
+ *
+ * So, keep it simple and demand an exact match:
+ */
+ __prepare_percent_value_test(test, &res, param);
+ KUNIT_EXPECT_EQ(test, res.pc, param->pc);
+}
+
+/*
+ * percent_to_mbw_max() may round either way, but must land within one
+ * step of the reference value and never exceed the implemented maximum.
+ */
+static void test_percent_to_mbw_max(struct kunit *test)
+{
+ const struct percent_value_case *param = test->param_value;
+ struct percent_value_test_info res;
+
+ __prepare_percent_value_test(test, &res, param);
+
+ KUNIT_EXPECT_GE(test, res.value, param->value << res.shift);
+ KUNIT_EXPECT_LE(test, res.value, (param->value + 1) << res.shift);
+ KUNIT_EXPECT_LE(test, res.value, res.max_value << res.shift);
+
+ /* No flexibility allowed for 0% and 100%! */
+
+ if (param->pc == 0)
+ KUNIT_EXPECT_EQ(test, res.value, 0);
+
+ if (param->pc == 100)
+ KUNIT_EXPECT_EQ(test, res.value, res.max_value << res.shift);
+}
+
+/* Generate parameters wd=1..16, one per architecturally valid bwa_wd. */
+static const void *test_all_bwa_wd_gen_params(struct kunit *test, const void *prev,
+ char *desc)
+{
+ uintptr_t next = (uintptr_t)prev + 1;
+
+ /* prev == NULL yields 1; stop after the architectural maximum, 16. */
+ if (next > 16)
+ return NULL;
+
+ snprintf(desc, KUNIT_PARAM_DESC_SIZE, "wd=%u\n", (unsigned int)next);
+
+ return (void *)next;
+}
+
+/* Fetch and sanity-check the bwa_wd width supplied by the param generator. */
+static unsigned int test_get_bwa_wd(struct kunit *test)
+{
+ uintptr_t param = (uintptr_t)test->param_value;
+
+ /* Valid widths are 1..16 inclusive. */
+ KUNIT_ASSERT_GE(test, param, 1);
+ KUNIT_ASSERT_LE(test, param, 16);
+
+ return (unsigned int)param;
+}
+
+/* The extreme mbw_max values must map to the advertised min/max percentages. */
+static void test_mbw_max_to_percent_limits(struct kunit *test)
+{
+ struct mpam_props fake_props = {0};
+ u32 max_value;
+
+ mpam_set_feature(mpam_feat_mbw_max, &fake_props);
+ fake_props.bwa_wd = test_get_bwa_wd(test);
+ /* All implemented (top) bits set: the largest encodable value. */
+ max_value = GENMASK(15, 16 - fake_props.bwa_wd);
+
+ KUNIT_EXPECT_EQ(test, mbw_max_to_percent(max_value, &fake_props),
+ MAX_MBA_BW);
+ KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props),
+ get_mba_min(&fake_props));
+
+ /*
+ * Rounding policy dependent 0% sanity-check:
+ * With round-to-nearest, the minimum mbw_max value really
+ * should map to 0% if there are at least 200 steps.
+ * (100 steps may be enough for some other rounding policies.)
+ */
+ if (fake_props.bwa_wd >= 8)
+ KUNIT_EXPECT_EQ(test, mbw_max_to_percent(0, &fake_props), 0);
+
+ if (fake_props.bwa_wd < 8 &&
+ mbw_max_to_percent(0, &fake_props) == 0)
+ kunit_warn(test, "wd=%d: Testsuite/driver Rounding policy mismatch?",
+ fake_props.bwa_wd);
+}
+
+/*
+ * Check that converting a percentage to mbw_max and back again (or, as
+ * appropriate, vice-versa) always restores the original value:
+ */
+static void test_percent_max_roundtrip_stability(struct kunit *test)
+{
+ struct mpam_props fake_props = {0};
+ unsigned int shift;
+ u32 pc, max, pc2, max2;
+
+ mpam_set_feature(mpam_feat_mbw_max, &fake_props);
+ fake_props.bwa_wd = test_get_bwa_wd(test);
+ shift = 16 - fake_props.bwa_wd;
+
+ /*
+ * Converting a valid value from the coarser scale to the finer
+ * scale and back again must yield the original value:
+ */
+ if (fake_props.bwa_wd >= 7) {
+ /* More than 100 steps: only test exact pc values: */
+ for (pc = get_mba_min(&fake_props); pc <= MAX_MBA_BW; pc++) {
+ max = percent_to_mbw_max(pc, &fake_props);
+ pc2 = mbw_max_to_percent(max, &fake_props);
+ KUNIT_EXPECT_EQ(test, pc2, pc);
+ }
+ } else {
+ /* Fewer than 100 steps: only test exact mbw_max values: */
+ for (max = 0; max < 1 << 16; max += 1 << shift) {
+ pc = mbw_max_to_percent(max, &fake_props);
+ max2 = percent_to_mbw_max(pc, &fake_props);
+ KUNIT_EXPECT_EQ(test, max2, max);
+ }
+ }
+}
+
+/*
+ * Compare percent_to_mbw_max() against the reference values embedded
+ * in percent_value_cases[] and check the aggregate round-up rate.
+ *
+ * Note this test is registered with KUNIT_CASE, not KUNIT_CASE_PARAM:
+ * it walks the whole vector table itself so that it can compute an
+ * aggregate statistic, so test->param_value is never set and must not
+ * be read.
+ */
+static void test_percent_to_max_rounding(struct kunit *test)
+{
+	unsigned int num_rounded_up = 0, total = 0;
+	const struct percent_value_case *param;
+	struct percent_value_test_info res;
+
+	for (param = percent_value_cases;
+	     param < &percent_value_cases[ARRAY_SIZE(percent_value_cases)];
+	     param++, total++) {
+		__prepare_percent_value_test(test, &res, param);
+		/* Driver produced a larger value than the reference table: */
+		if (res.value > param->value << res.shift)
+			num_rounded_up++;
+	}
+
+	/*
+	 * The MPAM driver applies a round-to-nearest policy, whereas a
+	 * round-down policy seems to have been applied in the
+	 * reference table from which the test vectors were selected.
+	 *
+	 * For a large and well-distributed suite of test vectors,
+	 * about half should be rounded up and half down compared with
+	 * the reference table. The actual test vectors are few in
+	 * number and probably not very well distributed however, so
+	 * tolerate a round-up rate of between 1/4 and 3/4 before
+	 * crying foul:
+	 */
+
+	kunit_info(test, "Round-up rate: %u%% (%u/%u)\n",
+		   DIV_ROUND_CLOSEST(num_rounded_up * 100, total),
+		   num_rounded_up, total);
+
+	KUNIT_EXPECT_GE(test, 4 * num_rounded_up, 1 * total);
+	KUNIT_EXPECT_LE(test, 4 * num_rounded_up, 3 * total);
+}
+
+/*
+ * Test list. KUNIT_CASE_PARAM entries are re-run once per value
+ * produced by their generator function.
+ */
+static struct kunit_case mpam_resctrl_test_cases[] = {
+ KUNIT_CASE(test_get_mba_granularity),
+ KUNIT_CASE_PARAM(test_mbw_max_to_percent, test_percent_value_gen_params),
+ KUNIT_CASE_PARAM(test_percent_to_mbw_max, test_percent_value_gen_params),
+ KUNIT_CASE_PARAM(test_mbw_max_to_percent_limits, test_all_bwa_wd_gen_params),
+ KUNIT_CASE(test_percent_to_max_rounding),
+ KUNIT_CASE_PARAM(test_percent_max_roundtrip_stability,
+ test_all_bwa_wd_gen_params),
+ {}
+};
+
+/* Suite definition and registration with the KUnit executor: */
+static struct kunit_suite mpam_resctrl_test_suite = {
+ .name = "mpam_resctrl_test_suite",
+ .test_cases = mpam_resctrl_test_cases,
+};
+
+kunit_test_suites(&mpam_resctrl_test_suite);