From 87db7579ebd5ded337056eb765542eb2608f16e3 Mon Sep 17 00:00:00 2001
From: Philippe Bergheaud
Date: Fri, 19 Jun 2020 16:04:39 +0200
Subject: ocxl: control via sysfs whether the FPGA is reloaded on a link reset

Some opencapi FPGA images allow controlling whether the FPGA should be
reloaded on the next adapter reset. When this is supported, the image
advertises the capability through a Vendor Specific DVSEC in the config
space of function 0.

Signed-off-by: Philippe Bergheaud
Signed-off-by: Frederic Barrat
Reviewed-by: Andrew Donnellan
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20200619140439.153962-1-fbarrat@linux.ibm.com
---
 Documentation/ABI/testing/sysfs-class-ocxl | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/ABI/testing/sysfs-class-ocxl b/Documentation/ABI/testing/sysfs-class-ocxl
index b5b1fa197592..ae1276efa45a 100644
--- a/Documentation/ABI/testing/sysfs-class-ocxl
+++ b/Documentation/ABI/testing/sysfs-class-ocxl
@@ -33,3 +33,14 @@ Date: January 2018
 Contact: linuxppc-dev@lists.ozlabs.org
 Description: read/write
 	Give access to the global mmio area for the AFU
+
+What: /sys/class/ocxl/<afu name>/reload_on_reset
+Date: February 2020
+Contact: linuxppc-dev@lists.ozlabs.org
+Description: read/write
+	Control whether the FPGA is reloaded on a link reset. Enabled
+	through a vendor-specific logic block on the FPGA.
+	0	Do not reload FPGA image from flash
+	1	Reload FPGA image from flash
+	unavailable
+		The device does not support this capability
--
cgit v1.2.3

From 7d38f089731fe129a49e254028caec6f05420f18 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Thu, 2 Jul 2020 14:09:21 +0000
Subject: docs: powerpc: Clarify book3s/32 MMU families

The documentation wrongly states that all book3s/32 CPUs have a hash
MMU. The 603 and e300 cores only have a software-loaded TLB. The 755,
the 7450 family and the e600 core have both a hash MMU and a
software-loaded TLB; which one is used can be selected by setting a
bit in HID2 (755) or HID0 (others). The kernel does not currently
support this selection.

Make this explicit in the documentation.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/261923c075d1cb49d02493685e8585d4ea2a5197.1593698951.git.christophe.leroy@csgroup.eu
---
 Documentation/powerpc/cpu_families.rst | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/powerpc/cpu_families.rst b/Documentation/powerpc/cpu_families.rst
index 1e063c5440c3..9b84e045e713 100644
--- a/Documentation/powerpc/cpu_families.rst
+++ b/Documentation/powerpc/cpu_families.rst
@@ -9,7 +9,9 @@ and are supported by arch/powerpc.
Book3S (aka sPAPR) ------------------ -- Hash MMU +- Hash MMU (except 603 and e300) +- Software loaded TLB (603 and e300) +- Selectable Software loaded TLB in addition to hash MMU (755, 7450, e600) - Mix of 32 & 64 bit:: +--------------+ +----------------+ @@ -24,9 +26,9 @@ Book3S (aka sPAPR) | | | | v v - +--------------+ +----------------+ +-------+ - | 604 | | 750 (G3) | ---> | 750CX | - +--------------+ +----------------+ +-------+ + +--------------+ +-----+ +----------------+ +-------+ + | 604 | | 755 | <--- | 750 (G3) | ---> | 750CX | + +--------------+ +-----+ +----------------+ +-------+ | | | | | | v v v -- cgit v1.2.3 From 3e79f082ebfc130360bcee23e4dd74729dcafdf4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 1 Jul 2020 12:52:32 +0530 Subject: libnvdimm/nvdimm/flush: Allow architecture to override the flush barrier Architectures like ppc64 provide persistent memory specific barriers that will ensure that all stores for which the modifications are written to persistent storage by preceding dcbfps and dcbstps instructions have updated persistent storage before any data access or data transfer caused by subsequent instructions is initiated. This is in addition to the ordering done by wmb() Update nvdimm core such that architecture can use barriers other than wmb to ensure all previous writes are architecturally visible for the platform buffer flush. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Dan Williams Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200701072235.223558-5-aneesh.kumar@linux.ibm.com --- Documentation/memory-barriers.txt | 14 ++++++++++++++ drivers/md/dm-writecache.c | 2 +- drivers/nvdimm/region_devs.c | 8 ++++---- include/asm-generic/barrier.h | 10 ++++++++++ 4 files changed, 29 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index eaabc3134294..ff07cd3b2f82 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -1935,6 +1935,20 @@ There are some more advanced barrier functions: relaxed I/O accessors and the Documentation/DMA-API.txt file for more information on consistent memory. + (*) pmem_wmb(); + + This is for use with persistent memory to ensure that stores for which + modifications are written to persistent storage reached a platform + durability domain. + + For example, after a non-temporal write to pmem region, we use pmem_wmb() + to ensure that stores have reached a platform durability domain. This ensures + that stores have updated persistent storage before any data access or + data transfer caused by subsequent instructions is initiated. This is + in addition to the ordering done by wmb(). + + For load from persistent memory, existing read memory barriers are sufficient + to ensure read ordering. 
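[Editor's note: to make the pmem_wmb() semantics above concrete, here is a
minimal driver-side sketch, not part of the patch. example_commit_record()
is a hypothetical helper; memcpy_flushcache() and pmem_wmb() are the kernel
APIs this series relies on.]

	/* Hypothetical helper illustrating the pattern described above */
	static void example_commit_record(void *pmem_dst, const void *src,
					  size_t len)
	{
		/* Non-temporal copy that avoids the cache on its way to pmem */
		memcpy_flushcache(pmem_dst, src, len);

		/*
		 * Ensure the stores above have reached the platform
		 * durability domain before any subsequent data access or
		 * transfer (e.g. setting a "record valid" flag) is initiated.
		 */
		pmem_wmb();
	}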
=============================== IMPLICIT KERNEL MEMORY BARRIERS diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 74f3c506f084..00534fa4a384 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -536,7 +536,7 @@ static void ssd_commit_superblock(struct dm_writecache *wc) static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios) { if (WC_MODE_PMEM(wc)) - wmb(); + pmem_wmb(); else ssd_commit_flushed(wc, wait_for_ios); } diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c index 4502f9c4708d..c3237c2b03a6 100644 --- a/drivers/nvdimm/region_devs.c +++ b/drivers/nvdimm/region_devs.c @@ -1206,13 +1206,13 @@ int generic_nvdimm_flush(struct nd_region *nd_region) idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8)); /* - * The first wmb() is needed to 'sfence' all previous writes - * such that they are architecturally visible for the platform - * buffer flush. Note that we've already arranged for pmem + * The pmem_wmb() is needed to 'sfence' all + * previous writes such that they are architecturally visible for + * the platform buffer flush. Note that we've already arranged for pmem * writes to avoid the cache via memcpy_flushcache(). The final * wmb() ensures ordering for the NVDIMM flush write. */ - wmb(); + pmem_wmb(); for (i = 0; i < nd_region->ndr_mappings; i++) if (ndrd_get_flush_wpq(ndrd, i, 0)) writeq(1, ndrd_get_flush_wpq(ndrd, i, idx)); diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h index 2eacaf7d62f6..b589bb216ee5 100644 --- a/include/asm-generic/barrier.h +++ b/include/asm-generic/barrier.h @@ -257,5 +257,15 @@ do { \ }) #endif +/* + * pmem_wmb() ensures that all stores for which the modification + * are written to persistent storage by preceding instructions have + * updated persistent storage before any data access or data transfer + * caused by subsequent instructions is initiated. + */ +#ifndef pmem_wmb +#define pmem_wmb() wmb() +#endif + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_GENERIC_BARRIER_H */ -- cgit v1.2.3 From 48f6e7f6d948b56489da027bc3284c709b939d28 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Fri, 12 Jun 2020 00:12:21 -0500 Subject: powerpc/pseries: remove cede offline state for CPUs This effectively reverts commit 3aa565f53c39 ("powerpc/pseries: Add hooks to put the CPU into an appropriate offline state"), which added an offline mode for CPUs which uses the H_CEDE hcall instead of the architected stop-self RTAS function in order to facilitate "folding" of dedicated mode processors on PowerVM platforms to achieve energy savings. This has been the default offline mode since its introduction. There's nothing about stop-self that would prevent the hypervisor from achieving the energy savings available via H_CEDE, so the original premise of this change appears to be flawed. I also have encountered the claim that the transition to and from ceded state is much faster than stop-self/start-cpu. Certainly we would not want to use stop-self as an *idle* mode. That is what H_CEDE is for. However, this difference is insignificant in the context of Linux CPU hotplug, where the latency of an offline or online operation on current systems is on the order of 100ms, mainly attributable to all the various subsystems' cpuhp callbacks. 
The cede offline mode also prevents accurate accounting, as discussed before: https://lore.kernel.org/linuxppc-dev/1571740391-3251-1-git-send-email-ego@linux.vnet.ibm.com/ Unconditionally use stop-self to offline processor threads. This is the architected method for offlining CPUs on PAPR systems. The "cede_offline" boot parameter is rendered obsolete. Removing this code enables the removal of the partition suspend code which temporarily onlines all present CPUs. Fixes: 3aa565f53c39 ("powerpc/pseries: Add hooks to put the CPU into an appropriate offline state") Signed-off-by: Nathan Lynch Reviewed-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200612051238.1007764-2-nathanl@linux.ibm.com --- Documentation/core-api/cpu_hotplug.rst | 7 - arch/powerpc/platforms/pseries/hotplug-cpu.c | 170 ++---------------------- arch/powerpc/platforms/pseries/offline_states.h | 38 ------ arch/powerpc/platforms/pseries/pmem.c | 1 - arch/powerpc/platforms/pseries/smp.c | 28 +--- 5 files changed, 15 insertions(+), 229 deletions(-) delete mode 100644 arch/powerpc/platforms/pseries/offline_states.h (limited to 'Documentation') diff --git a/Documentation/core-api/cpu_hotplug.rst b/Documentation/core-api/cpu_hotplug.rst index 4a50ab7817f7..b1ae1ac159cf 100644 --- a/Documentation/core-api/cpu_hotplug.rst +++ b/Documentation/core-api/cpu_hotplug.rst @@ -50,13 +50,6 @@ Command Line Switches This option is limited to the X86 and S390 architecture. -``cede_offline={"off","on"}`` - Use this option to disable/enable putting offlined processors to an extended - ``H_CEDE`` state on supported pseries platforms. If nothing is specified, - ``cede_offline`` is set to "on". - - This option is limited to the PowerPC architecture. - ``cpu0_hotplug`` Allow to shutdown CPU0. diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 3e8cbfe7a80f..d4b346355bb9 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -35,54 +35,10 @@ #include #include "pseries.h" -#include "offline_states.h" /* This version can't take the spinlock, because it never returns */ static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE; -static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) = - CPU_STATE_OFFLINE; -static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE; - -static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE; - -static bool cede_offline_enabled __read_mostly = true; - -/* - * Enable/disable cede_offline when available. 
- */ -static int __init setup_cede_offline(char *str) -{ - return (kstrtobool(str, &cede_offline_enabled) == 0); -} - -__setup("cede_offline=", setup_cede_offline); - -enum cpu_state_vals get_cpu_current_state(int cpu) -{ - return per_cpu(current_state, cpu); -} - -void set_cpu_current_state(int cpu, enum cpu_state_vals state) -{ - per_cpu(current_state, cpu) = state; -} - -enum cpu_state_vals get_preferred_offline_state(int cpu) -{ - return per_cpu(preferred_offline_state, cpu); -} - -void set_preferred_offline_state(int cpu, enum cpu_state_vals state) -{ - per_cpu(preferred_offline_state, cpu) = state; -} - -void set_default_offline_state(int cpu) -{ - per_cpu(preferred_offline_state, cpu) = default_offline_state; -} - static void rtas_stop_self(void) { static struct rtas_args args; @@ -101,9 +57,7 @@ static void rtas_stop_self(void) static void pseries_mach_cpu_die(void) { - unsigned int cpu = smp_processor_id(); unsigned int hwcpu = hard_smp_processor_id(); - u8 cede_latency_hint = 0; local_irq_disable(); idle_task_exit(); @@ -112,49 +66,6 @@ static void pseries_mach_cpu_die(void) else xics_teardown_cpu(); - if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { - set_cpu_current_state(cpu, CPU_STATE_INACTIVE); - if (ppc_md.suspend_disable_cpu) - ppc_md.suspend_disable_cpu(); - - cede_latency_hint = 2; - - get_lppaca()->idle = 1; - if (!lppaca_shared_proc(get_lppaca())) - get_lppaca()->donate_dedicated_cpu = 1; - - while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { - while (!prep_irq_for_idle()) { - local_irq_enable(); - local_irq_disable(); - } - - extended_cede_processor(cede_latency_hint); - } - - local_irq_disable(); - - if (!lppaca_shared_proc(get_lppaca())) - get_lppaca()->donate_dedicated_cpu = 0; - get_lppaca()->idle = 0; - - if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) { - unregister_slb_shadow(hwcpu); - - hard_irq_disable(); - /* - * Call to start_secondary_resume() will not return. - * Kernel stack will be reset and start_secondary() - * will be called to continue the online operation. 
- */ - start_secondary_resume(); - } - } - - /* Requested state is CPU_STATE_OFFLINE at this point */ - WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE); - - set_cpu_current_state(cpu, CPU_STATE_OFFLINE); unregister_slb_shadow(hwcpu); rtas_stop_self(); @@ -200,24 +111,13 @@ static void pseries_cpu_die(unsigned int cpu) int cpu_status = 1; unsigned int pcpu = get_hard_smp_processor_id(cpu); - if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { - cpu_status = 1; - for (tries = 0; tries < 5000; tries++) { - if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) { - cpu_status = 0; - break; - } - msleep(1); - } - } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) { + for (tries = 0; tries < 25; tries++) { + cpu_status = smp_query_cpu_stopped(pcpu); + if (cpu_status == QCSS_STOPPED || + cpu_status == QCSS_HARDWARE_ERROR) + break; + cpu_relax(); - for (tries = 0; tries < 25; tries++) { - cpu_status = smp_query_cpu_stopped(pcpu); - if (cpu_status == QCSS_STOPPED || - cpu_status == QCSS_HARDWARE_ERROR) - break; - cpu_relax(); - } } if (cpu_status != 0) { @@ -359,28 +259,15 @@ static int dlpar_offline_cpu(struct device_node *dn) if (get_hard_smp_processor_id(cpu) != thread) continue; - if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE) + if (!cpu_online(cpu)) break; - if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) { - set_preferred_offline_state(cpu, - CPU_STATE_OFFLINE); - cpu_maps_update_done(); - timed_topology_update(1); - rc = device_offline(get_cpu_device(cpu)); - if (rc) - goto out; - cpu_maps_update_begin(); - break; - } - - /* - * The cpu is in CPU_STATE_INACTIVE. - * Upgrade it's state to CPU_STATE_OFFLINE. - */ - set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); - WARN_ON(plpar_hcall_norets(H_PROD, thread) != H_SUCCESS); - __cpu_die(cpu); + cpu_maps_update_done(); + timed_topology_update(1); + rc = device_offline(get_cpu_device(cpu)); + if (rc) + goto out; + cpu_maps_update_begin(); break; } if (cpu == num_possible_cpus()) { @@ -414,8 +301,6 @@ static int dlpar_online_cpu(struct device_node *dn) for_each_present_cpu(cpu) { if (get_hard_smp_processor_id(cpu) != thread) continue; - BUG_ON(get_cpu_current_state(cpu) - != CPU_STATE_OFFLINE); cpu_maps_update_done(); timed_topology_update(1); find_and_online_cpu_nid(cpu); @@ -1013,27 +898,8 @@ static struct notifier_block pseries_smp_nb = { .notifier_call = pseries_smp_notifier, }; -#define MAX_CEDE_LATENCY_LEVELS 4 -#define CEDE_LATENCY_PARAM_LENGTH 10 -#define CEDE_LATENCY_PARAM_MAX_LENGTH \ - (MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char)) -#define CEDE_LATENCY_TOKEN 45 - -static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH]; - -static int parse_cede_parameters(void) -{ - memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH); - return rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, - NULL, - CEDE_LATENCY_TOKEN, - __pa(cede_parameters), - CEDE_LATENCY_PARAM_MAX_LENGTH); -} - static int __init pseries_cpu_hotplug_init(void) { - int cpu; int qcss_tok; #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE @@ -1056,16 +922,8 @@ static int __init pseries_cpu_hotplug_init(void) smp_ops->cpu_die = pseries_cpu_die; /* Processors can be added/removed only on LPAR */ - if (firmware_has_feature(FW_FEATURE_LPAR)) { + if (firmware_has_feature(FW_FEATURE_LPAR)) of_reconfig_notifier_register(&pseries_smp_nb); - cpu_maps_update_begin(); - if (cede_offline_enabled && parse_cede_parameters() == 0) { - default_offline_state = CPU_STATE_INACTIVE; - for_each_online_cpu(cpu) - 
set_default_offline_state(cpu); - } - cpu_maps_update_done(); - } return 0; } diff --git a/arch/powerpc/platforms/pseries/offline_states.h b/arch/powerpc/platforms/pseries/offline_states.h deleted file mode 100644 index 51414aee2862..000000000000 --- a/arch/powerpc/platforms/pseries/offline_states.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _OFFLINE_STATES_H_ -#define _OFFLINE_STATES_H_ - -/* Cpu offline states go here */ -enum cpu_state_vals { - CPU_STATE_OFFLINE, - CPU_STATE_INACTIVE, - CPU_STATE_ONLINE, - CPU_MAX_OFFLINE_STATES -}; - -#ifdef CONFIG_HOTPLUG_CPU -extern enum cpu_state_vals get_cpu_current_state(int cpu); -extern void set_cpu_current_state(int cpu, enum cpu_state_vals state); -extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state); -extern void set_default_offline_state(int cpu); -#else -static inline enum cpu_state_vals get_cpu_current_state(int cpu) -{ - return CPU_STATE_ONLINE; -} - -static inline void set_cpu_current_state(int cpu, enum cpu_state_vals state) -{ -} - -static inline void set_preferred_offline_state(int cpu, enum cpu_state_vals state) -{ -} - -static inline void set_default_offline_state(int cpu) -{ -} -#endif - -extern enum cpu_state_vals get_preferred_offline_state(int cpu); -#endif diff --git a/arch/powerpc/platforms/pseries/pmem.c b/arch/powerpc/platforms/pseries/pmem.c index 2347e1038f58..e1dc5d3254df 100644 --- a/arch/powerpc/platforms/pseries/pmem.c +++ b/arch/powerpc/platforms/pseries/pmem.c @@ -24,7 +24,6 @@ #include #include "pseries.h" -#include "offline_states.h" static struct device_node *pmem_node; diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 6891710833be..7ebacac03dc3 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -44,8 +44,6 @@ #include #include "pseries.h" -#include "offline_states.h" - /* * The Primary thread of each non-boot processor was started from the OF client @@ -108,10 +106,7 @@ static inline int smp_startup_cpu(unsigned int lcpu) /* Fixup atomic count: it exited inside IRQ handler. */ task_thread_info(paca_ptrs[lcpu]->__current)->preempt_count = 0; -#ifdef CONFIG_HOTPLUG_CPU - if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE) - goto out; -#endif + /* * If the RTAS start-cpu token does not exist then presume the * cpu is already spinning. 
@@ -126,9 +121,6 @@ static inline int smp_startup_cpu(unsigned int lcpu)
 		return 0;
 	}
 
-#ifdef CONFIG_HOTPLUG_CPU
-out:
-#endif
 	return 1;
 }
 
@@ -143,10 +135,6 @@ static void smp_setup_cpu(int cpu)
 		vpa_init(cpu);
 
 	cpumask_clear_cpu(cpu, of_spin_mask);
-#ifdef CONFIG_HOTPLUG_CPU
-	set_cpu_current_state(cpu, CPU_STATE_ONLINE);
-	set_default_offline_state(cpu);
-#endif
 }
 
 static int smp_pSeries_kick_cpu(int nr)
@@ -163,20 +151,6 @@ static int smp_pSeries_kick_cpu(int nr)
 	 * the processor will continue on to secondary_start
 	 */
 	paca_ptrs[nr]->cpu_start = 1;
-#ifdef CONFIG_HOTPLUG_CPU
-	set_preferred_offline_state(nr, CPU_STATE_ONLINE);
-
-	if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) {
-		long rc;
-		unsigned long hcpuid;
-
-		hcpuid = get_hard_smp_processor_id(nr);
-		rc = plpar_hcall_norets(H_PROD, hcpuid);
-		if (rc != H_SUCCESS)
-			printk(KERN_ERR "Error: Prod to wake up processor %d "
-						"Ret= %ld\n", nr, rc);
-	}
-#endif
 
 	return 0;
 }
--
cgit v1.2.3

From 792f73f747b82f6cb191a323e1f5755d33149b50 Mon Sep 17 00:00:00 2001
From: Kajol Jain
Date: Thu, 9 Jul 2020 10:48:36 +0530
Subject: powerpc/hv-24x7: Add sysfs files inside hv-24x7 device to show cpumask

Add a cpumask attribute to the hv_24x7 PMU, along with ABI
documentation. The cpumask is exposed primarily for the perf tool,
which can parse the driver's sysfs folder and understand the cpumask
file. Having a cpumask file reduces the number of perf command line
parameters (it avoids the "-C" option), and it also tells the user
which CPU is currently used to retrieve the counter data.

command:# cat /sys/devices/hv_24x7/interface/cpumask
0

Signed-off-by: Kajol Jain
Reviewed-by: Madhavan Srinivasan
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20200709051836.723765-3-kjain@linux.ibm.com
---
 Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7 | 7 +++++++
 arch/powerpc/perf/hv-24x7.c                                      | 8 ++++++++
 2 files changed, 15 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7 b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
index e8698afcd952..f7e32f218f73 100644
--- a/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
+++ b/Documentation/ABI/testing/sysfs-bus-event_source-devices-hv_24x7
@@ -43,6 +43,13 @@ Description: read only
 	This sysfs interface exposes the number of cores per chip
 	present in the system.
 
+What: /sys/devices/hv_24x7/interface/cpumask
+Date: July 2020
+Contact: Linux on PowerPC Developer List
+Description: read only
+	This sysfs file exposes the cpumask which is designated to make
+	HCALLs to retrieve hv-24x7 pmu event counter data.
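[Editor's note: a userspace sketch, not part of the patch, equivalent to the
`cat` shown in the commit message; the file path is the one documented above.]

	#include <stdio.h>

	int main(void)
	{
		char buf[256];
		FILE *f = fopen("/sys/devices/hv_24x7/interface/cpumask", "r");

		if (!f)
			return 1;
		/* Prints the CPU designated for hv-24x7 HCALLs, e.g. "0" */
		if (fgets(buf, sizeof(buf), f))
			printf("hv_24x7 cpumask: %s", buf);
		fclose(f);
		return 0;
	}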
+ What: /sys/bus/event_source/devices/hv_24x7/event_descs/ Date: February 2014 Contact: Linux on PowerPC Developer List diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 93b4700dcf8c..cdb7bfbd157e 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -448,6 +448,12 @@ static ssize_t device_show_string(struct device *dev, return sprintf(buf, "%s\n", (char *)d->var); } +static ssize_t cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return cpumap_print_to_pagebuf(true, buf, &hv_24x7_cpumask); +} + static ssize_t sockets_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1115,6 +1121,7 @@ static DEVICE_ATTR_RO(domains); static DEVICE_ATTR_RO(sockets); static DEVICE_ATTR_RO(chipspersocket); static DEVICE_ATTR_RO(coresperchip); +static DEVICE_ATTR_RO(cpumask); static struct bin_attribute *if_bin_attrs[] = { &bin_attr_catalog, @@ -1128,6 +1135,7 @@ static struct attribute *if_attrs[] = { &dev_attr_sockets.attr, &dev_attr_chipspersocket.attr, &dev_attr_coresperchip.attr, + &dev_attr_cpumask.attr, NULL, }; -- cgit v1.2.3 From c8ed9fc9d29e24dafd08971e6a0c6b302a8ade2d Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Sat, 18 Jul 2020 12:39:58 +0200 Subject: powerpc: Replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200718103958.5455-1-grandmaster@al2klimov.de --- Documentation/powerpc/mpc52xx.rst | 2 +- arch/powerpc/crypto/crc32-vpmsum_core.S | 2 +- arch/powerpc/include/asm/hydra.h | 2 +- drivers/crypto/vmx/aesp8-ppc.pl | 2 +- drivers/crypto/vmx/ghashp8-ppc.pl | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/powerpc/mpc52xx.rst b/Documentation/powerpc/mpc52xx.rst index 8676ac63e077..30260707c3fe 100644 --- a/Documentation/powerpc/mpc52xx.rst +++ b/Documentation/powerpc/mpc52xx.rst @@ -2,7 +2,7 @@ Linux 2.6.x on MPC52xx family ============================= -For the latest info, go to http://www.246tNt.com/mpc52xx/ +For the latest info, go to https://www.246tNt.com/mpc52xx/ To compile/use : diff --git a/arch/powerpc/crypto/crc32-vpmsum_core.S b/arch/powerpc/crypto/crc32-vpmsum_core.S index c3524eba4d0d..a16a717c809c 100644 --- a/arch/powerpc/crypto/crc32-vpmsum_core.S +++ b/arch/powerpc/crypto/crc32-vpmsum_core.S @@ -19,7 +19,7 @@ * We then use fixed point Barrett reduction to compute a mod n over GF(2) * for n = CRC using POWER8 instructions. We use x = 32. * - * http://en.wikipedia.org/wiki/Barrett_reduction + * https://en.wikipedia.org/wiki/Barrett_reduction * * Copyright (C) 2015 Anton Blanchard , IBM */ diff --git a/arch/powerpc/include/asm/hydra.h b/arch/powerpc/include/asm/hydra.h index b3b0f2d020f0..ae02eb53d6ef 100644 --- a/arch/powerpc/include/asm/hydra.h +++ b/arch/powerpc/include/asm/hydra.h @@ -10,7 +10,7 @@ * * © Copyright 1995 Apple Computer, Inc. All rights reserved. 
* - * It's available online from http://www.cpu.lu/~mlan/ftp/MacTech.pdf + * It's available online from https://www.cpu.lu/~mlan/ftp/MacTech.pdf * You can obtain paper copies of this book from computer bookstores or by * writing Morgan Kaufmann Publishers, Inc., 340 Pine Street, Sixth Floor, San * Francisco, CA 94104. Reference ISBN 1-55860-393-X. diff --git a/drivers/crypto/vmx/aesp8-ppc.pl b/drivers/crypto/vmx/aesp8-ppc.pl index db874367b602..50a0a18f35da 100644 --- a/drivers/crypto/vmx/aesp8-ppc.pl +++ b/drivers/crypto/vmx/aesp8-ppc.pl @@ -50,7 +50,7 @@ # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. +# details see https://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # This module implements support for AES instructions as per PowerISA diff --git a/drivers/crypto/vmx/ghashp8-ppc.pl b/drivers/crypto/vmx/ghashp8-ppc.pl index 38b06503ede0..09bba1852eec 100644 --- a/drivers/crypto/vmx/ghashp8-ppc.pl +++ b/drivers/crypto/vmx/ghashp8-ppc.pl @@ -13,7 +13,7 @@ # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further -# details see http://www.openssl.org/~appro/cryptogams/. +# details see https://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # GHASH for for PowerISA v2.07. -- cgit v1.2.3 From 5752fe0b811bb3cee531c52074921c6dd09dc42d Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Fri, 17 Jul 2020 10:38:17 -0400 Subject: KVM: PPC: Book3S HV: Save/restore new PMU registers Power ISA v3.1 has added new performance monitoring unit (PMU) special purpose registers (SPRs). They are: Monitor Mode Control Register 3 (MMCR3) Sampled Instruction Event Register A (SIER2) Sampled Instruction Event Register B (SIER3) Add support to save/restore these new SPRs while entering/exiting guest. Also include changes to support KVM_REG_PPC_MMCR3/SIER2/SIER3. Add new SPRs to KVM API documentation. 
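[Editor's note: as an illustration of the uapi addition below, this is a
hedged userspace sketch of reading one of the new registers through KVM's
standard one-reg interface. get_mmcr3() is a hypothetical helper; vcpu_fd is
assumed to be an open KVM vCPU file descriptor on a kernel with this patch.]

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int get_mmcr3(int vcpu_fd, uint64_t *val)
	{
		struct kvm_one_reg reg = {
			.id   = KVM_REG_PPC_MMCR3,	/* added below */
			.addr = (uint64_t)(uintptr_t)val,
		};

		/* 0 on success; -1 with errno set (e.g. on older kernels) */
		return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
	}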
Signed-off-by: Athira Rajeev Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1594996707-3727-6-git-send-email-atrajeev@linux.vnet.ibm.com --- Documentation/virt/kvm/api.rst | 3 +++ arch/powerpc/include/asm/kvm_book3s_asm.h | 2 +- arch/powerpc/include/asm/kvm_host.h | 4 ++-- arch/powerpc/include/uapi/asm/kvm.h | 5 +++++ arch/powerpc/kernel/asm-offsets.c | 3 +++ arch/powerpc/kvm/book3s_hv.c | 22 ++++++++++++++++++++-- arch/powerpc/kvm/book3s_hv_interrupts.S | 8 ++++++++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 24 ++++++++++++++++++++++++ tools/arch/powerpc/include/uapi/asm/kvm.h | 5 +++++ 9 files changed, 71 insertions(+), 5 deletions(-) (limited to 'Documentation') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 426f94582b7a..8cb6eb76c129 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -2156,9 +2156,12 @@ registers, find a list below: PPC KVM_REG_PPC_MMCRA 64 PPC KVM_REG_PPC_MMCR2 64 PPC KVM_REG_PPC_MMCRS 64 + PPC KVM_REG_PPC_MMCR3 64 PPC KVM_REG_PPC_SIAR 64 PPC KVM_REG_PPC_SDAR 64 PPC KVM_REG_PPC_SIER 64 + PPC KVM_REG_PPC_SIER2 64 + PPC KVM_REG_PPC_SIER3 64 PPC KVM_REG_PPC_PMC1 32 PPC KVM_REG_PPC_PMC2 32 PPC KVM_REG_PPC_PMC3 32 diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 45704f2716e2..078f4648ea27 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -119,7 +119,7 @@ struct kvmppc_host_state { void __iomem *xive_tima_virt; u32 saved_xirr; u64 dabr; - u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */ + u64 host_mmcr[10]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER, MMCR3, SIER2/3 */ u32 host_pmc[8]; u64 host_purr; u64 host_spurr; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 80b0005a7224..e020d269416d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -637,14 +637,14 @@ struct kvm_vcpu_arch { u32 ccr1; u32 dbsr; - u64 mmcr[3]; /* MMCR0, MMCR1, MMCR2 */ + u64 mmcr[4]; /* MMCR0, MMCR1, MMCR2, MMCR3 */ u64 mmcra; u64 mmcrs; u32 pmc[8]; u32 spmc[2]; u64 siar; u64 sdar; - u64 sier; + u64 sier[3]; #ifdef CONFIG_PPC_TRANSACTIONAL_MEM u64 tfhar; u64 texasr; diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 264e266a85bf..c3af3f324c5a 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -640,6 +640,11 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf) #define KVM_REG_PPC_PTCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0) +/* POWER10 registers */ +#define KVM_REG_PPC_MMCR3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc1) +#define KVM_REG_PPC_SIER2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc2) +#define KVM_REG_PPC_SIER3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3) + /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 6fa4853691de..8711c2164b45 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -698,6 +698,9 @@ int main(void) HSTATE_FIELD(HSTATE_SDAR, host_mmcr[4]); HSTATE_FIELD(HSTATE_MMCR2, host_mmcr[5]); HSTATE_FIELD(HSTATE_SIER, host_mmcr[6]); + HSTATE_FIELD(HSTATE_MMCR3, host_mmcr[7]); + HSTATE_FIELD(HSTATE_SIER2, host_mmcr[8]); + HSTATE_FIELD(HSTATE_SIER3, host_mmcr[9]); HSTATE_FIELD(HSTATE_PMC1, host_pmc[0]); 
 HSTATE_FIELD(HSTATE_PMC2, host_pmc[1]);
 HSTATE_FIELD(HSTATE_PMC3, host_pmc[2]);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b10bb404f0d5..0a0df31987f7 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1692,6 +1692,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_MMCRS:
 		*val = get_reg_val(id, vcpu->arch.mmcrs);
 		break;
+	case KVM_REG_PPC_MMCR3:
+		*val = get_reg_val(id, vcpu->arch.mmcr[3]);
+		break;
 	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
 		i = id - KVM_REG_PPC_PMC1;
 		*val = get_reg_val(id, vcpu->arch.pmc[i]);
@@ -1707,7 +1710,13 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 		*val = get_reg_val(id, vcpu->arch.sdar);
 		break;
 	case KVM_REG_PPC_SIER:
-		*val = get_reg_val(id, vcpu->arch.sier);
+		*val = get_reg_val(id, vcpu->arch.sier[0]);
+		break;
+	case KVM_REG_PPC_SIER2:
+		*val = get_reg_val(id, vcpu->arch.sier[1]);
+		break;
+	case KVM_REG_PPC_SIER3:
+		*val = get_reg_val(id, vcpu->arch.sier[2]);
 		break;
 	case KVM_REG_PPC_IAMR:
 		*val = get_reg_val(id, vcpu->arch.iamr);
@@ -1922,6 +1931,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_MMCRS:
 		vcpu->arch.mmcrs = set_reg_val(id, *val);
 		break;
+	case KVM_REG_PPC_MMCR3:
+		vcpu->arch.mmcr[3] = set_reg_val(id, *val);
+		break;
 	case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
 		i = id - KVM_REG_PPC_PMC1;
 		vcpu->arch.pmc[i] = set_reg_val(id, *val);
@@ -1937,7 +1949,13 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 		vcpu->arch.sdar = set_reg_val(id, *val);
 		break;
 	case KVM_REG_PPC_SIER:
-		vcpu->arch.sier = set_reg_val(id, *val);
+		vcpu->arch.sier[0] = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_SIER2:
+		vcpu->arch.sier[1] = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_SIER3:
+		vcpu->arch.sier[2] = set_reg_val(id, *val);
 		break;
 	case KVM_REG_PPC_IAMR:
 		vcpu->arch.iamr = set_reg_val(id, *val);
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 63fd81f3039d..59822cba454d 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -140,6 +140,14 @@ BEGIN_FTR_SECTION
 	std	r8, HSTATE_MMCR2(r13)
 	std	r9, HSTATE_SIER(r13)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+BEGIN_FTR_SECTION
+	mfspr	r5, SPRN_MMCR3
+	mfspr	r6, SPRN_SIER2
+	mfspr	r7, SPRN_SIER3
+	std	r5, HSTATE_MMCR3(r13)
+	std	r6, HSTATE_SIER2(r13)
+	std	r7, HSTATE_SIER3(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
 	mfspr	r3, SPRN_PMC1
 	mfspr	r5, SPRN_PMC2
 	mfspr	r6, SPRN_PMC3
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 702eaa253042..799d6d0f4ead 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -3435,6 +3435,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
 	mtspr	SPRN_MMCRA, r6
 	mtspr	SPRN_SIAR, r7
 	mtspr	SPRN_SDAR, r8
+BEGIN_FTR_SECTION
+	ld	r5, VCPU_MMCR + 24(r4)
+	ld	r6, VCPU_SIER + 8(r4)
+	ld	r7, VCPU_SIER + 16(r4)
+	mtspr	SPRN_MMCR3, r5
+	mtspr	SPRN_SIER2, r6
+	mtspr	SPRN_SIER3, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
 BEGIN_FTR_SECTION
 	ld	r5, VCPU_MMCR + 16(r4)
 	ld	r6, VCPU_SIER(r4)
@@ -3496,6 +3504,14 @@ BEGIN_FTR_SECTION
 	mtspr	SPRN_MMCR2, r8
 	mtspr	SPRN_SIER, r9
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+BEGIN_FTR_SECTION
+	ld	r5, HSTATE_MMCR3(r13)
+	ld	r6, HSTATE_SIER2(r13)
+	ld	r7, HSTATE_SIER3(r13)
+	mtspr	SPRN_MMCR3, r5
+	mtspr	SPRN_SIER2, r6
+	mtspr	SPRN_SIER3, r7
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31)
 	mtspr	SPRN_MMCR0, r3
 	isync
 	mtlr	r0
@@ -3555,6 +3571,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) BEGIN_FTR_SECTION std r10, VCPU_MMCR + 16(r9) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +BEGIN_FTR_SECTION + mfspr r5, SPRN_MMCR3 + mfspr r6, SPRN_SIER2 + mfspr r7, SPRN_SIER3 + std r5, VCPU_MMCR + 24(r9) + std r6, VCPU_SIER + 8(r9) + std r7, VCPU_SIER + 16(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_31) std r7, VCPU_SIAR(r9) std r8, VCPU_SDAR(r9) mfspr r3, SPRN_PMC1 diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index 264e266a85bf..c3af3f324c5a 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -640,6 +640,11 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf) #define KVM_REG_PPC_PTCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0) +/* POWER10 registers */ +#define KVM_REG_PPC_MMCR3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc1) +#define KVM_REG_PPC_SIER2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc2) +#define KVM_REG_PPC_SIER3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc3) + /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs */ -- cgit v1.2.3 From 7fa95f9adaee7e5cbb195d3359741120829e488b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 11 Jun 2020 18:12:03 +1000 Subject: powerpc/64s: system call support for scv/rfscv instructions Add support for the scv instruction on POWER9 and later CPUs. For now this implements the zeroth scv vector 'scv 0', as identical to 'sc' system calls, with the exception that LR is not preserved, nor are volatile CR registers, and error is not indicated with CR0[SO], but by returning a negative errno. rfscv is implemented to return from scv type system calls. It can not be used to return from sc system calls because those are defined to preserve LR. getpid syscall throughput on POWER9 is improved by 26% (428 to 318 cycles), largely due to reducing mtmsr and mtspr. Signed-off-by: Nicholas Piggin [mpe: Fix ppc64e build] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200611081203.995112-3-npiggin@gmail.com --- Documentation/powerpc/syscall64-abi.rst | 42 +++++--- arch/powerpc/include/asm/asm-prototypes.h | 2 +- arch/powerpc/include/asm/exception-64s.h | 6 ++ arch/powerpc/include/asm/head-64.h | 2 +- arch/powerpc/include/asm/ppc_asm.h | 2 + arch/powerpc/include/asm/ptrace.h | 7 +- arch/powerpc/include/asm/setup.h | 4 +- arch/powerpc/include/asm/sstep.h | 1 + arch/powerpc/kernel/cpu_setup_power.S | 10 +- arch/powerpc/kernel/cputable.c | 3 +- arch/powerpc/kernel/dt_cpu_ftrs.c | 1 + arch/powerpc/kernel/entry_64.S | 171 +++++++++++++++++++++++++++++- arch/powerpc/kernel/exceptions-64s.S | 123 ++++++++++++++++++++- arch/powerpc/kernel/process.c | 10 +- arch/powerpc/kernel/setup_64.c | 5 +- arch/powerpc/kernel/signal.c | 19 +++- arch/powerpc/kernel/syscall_64.c | 32 ++++-- arch/powerpc/lib/sstep.c | 16 +++ arch/powerpc/platforms/pseries/setup.c | 8 +- arch/powerpc/xmon/xmon.c | 1 + 20 files changed, 421 insertions(+), 44 deletions(-) (limited to 'Documentation') diff --git a/Documentation/powerpc/syscall64-abi.rst b/Documentation/powerpc/syscall64-abi.rst index e49f69f941b9..46caaadbb029 100644 --- a/Documentation/powerpc/syscall64-abi.rst +++ b/Documentation/powerpc/syscall64-abi.rst @@ -5,6 +5,15 @@ Power Architecture 64-bit Linux system call ABI syscall ======= +Invocation +---------- +The syscall is made with the sc instruction, and returns with execution +continuing at the instruction following the sc instruction. 
+ +If PPC_FEATURE2_SCV appears in the AT_HWCAP2 ELF auxiliary vector, the +scv 0 instruction is an alternative that may provide better performance, +with some differences to calling sequence. + syscall calling sequence\ [1]_ matches the Power Architecture 64-bit ELF ABI specification C function calling sequence, including register preservation rules, with the following differences. @@ -12,16 +21,23 @@ rules, with the following differences. .. [1] Some syscalls (typically low-level management functions) may have different calling sequences (e.g., rt_sigreturn). -Parameters and return value ---------------------------- +Parameters +---------- The system call number is specified in r0. There is a maximum of 6 integer parameters to a syscall, passed in r3-r8. -Both a return value and a return error code are returned. cr0.SO is the return -error code, and r3 is the return value or error code. When cr0.SO is clear, -the syscall succeeded and r3 is the return value. When cr0.SO is set, the -syscall failed and r3 is the error code that generally corresponds to errno. +Return value +------------ +- For the sc instruction, both a value and an error condition are returned. + cr0.SO is the error condition, and r3 is the return value. When cr0.SO is + clear, the syscall succeeded and r3 is the return value. When cr0.SO is set, + the syscall failed and r3 is the error value (that normally corresponds to + errno). + +- For the scv 0 instruction, the return value indicates failure if it is + -4095..-1 (i.e., it is >= -MAX_ERRNO (-4095) as an unsigned comparison), + in which case the error value is the negated return value. Stack ----- @@ -34,22 +50,23 @@ Register preservation rules match the ELF ABI calling sequence with the following differences: =========== ============= ======================================== +--- For the sc instruction, differences with the ELF ABI --- r0 Volatile (System call number.) r3 Volatile (Parameter 1, and return value.) r4-r8 Volatile (Parameters 2-6.) -cr0 Volatile (cr0.SO is the return error condition) +cr0 Volatile (cr0.SO is the return error condition.) cr1, cr5-7 Nonvolatile lr Nonvolatile + +--- For the scv 0 instruction, differences with the ELF ABI --- +r0 Volatile (System call number.) +r3 Volatile (Parameter 1, and return value.) +r4-r8 Volatile (Parameters 2-6.) =========== ============= ======================================== All floating point and vector data registers as well as control and status registers are nonvolatile. -Invocation ----------- -The syscall is performed with the sc instruction, and returns with execution -continuing at the instruction following the sc instruction. - Transactional Memory -------------------- Syscall behavior can change if the processor is in transactional or suspended @@ -75,6 +92,7 @@ auxiliary vector. returning to the caller. This case is not well defined or supported, so this behavior should not be relied upon. +scv 0 syscalls will always behave as PPC_FEATURE2_HTM_NOSC. 
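[Editor's note: the two conventions are easiest to compare in code. Below is
a hedged userspace sketch, modeled on the usual libc inline-asm pattern
rather than taken from this patch, of a one-argument syscall via each
instruction. Clobber lists follow the tables above; assembling "scv" needs a
POWER9-or-later target, and the helper names are illustrative.]

	static inline long raw_sc(long nr, long arg1)
	{
		register long r0 __asm__("r0") = nr;
		register long r3 __asm__("r3") = arg1;

		/* sc: failure flagged in cr0.SO, so negate r3 into -errno */
		__asm__ __volatile__("sc; bns+ 1f; neg %1, %1; 1:"
			: "+r"(r0), "+r"(r3)
			:
			: "memory", "cr0", "ctr", "xer",
			  "r4", "r5", "r6", "r7", "r8",
			  "r9", "r10", "r11", "r12");
		return r3;
	}

	static inline long raw_scv(long nr, long arg1)
	{
		register long r0 __asm__("r0") = nr;
		register long r3 __asm__("r3") = arg1;

		/* scv 0: r3 already holds -errno on failure; LR is clobbered */
		__asm__ __volatile__(".machine push; .machine power9;"
				     " scv 0; .machine pop"
			: "+r"(r0), "+r"(r3)
			:
			: "memory", "cr0", "cr1", "cr5", "cr6", "cr7",
			  "lr", "ctr", "xer",
			  "r4", "r5", "r6", "r7", "r8",
			  "r9", "r10", "r11", "r12");
		return r3;
	}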
vsyscall ======== diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 7d81e86a1e5d..fb47bf5818c8 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -98,7 +98,7 @@ unsigned long __init early_init(unsigned long dt_ptr); void __init machine_init(u64 dt_ptr); #endif long system_call_exception(long r3, long r4, long r5, long r6, long r7, long r8, unsigned long r0, struct pt_regs *regs); -notrace unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs *regs); +notrace unsigned long syscall_exit_prepare(unsigned long r3, struct pt_regs *regs, long scv); notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned long msr); notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsigned long msr); diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 47bd4ea0837d..0c2fe7f042d1 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -123,6 +123,12 @@ hrfid; \ b hrfi_flush_fallback +#define RFSCV_TO_USER \ + STF_EXIT_BARRIER_SLOT; \ + RFI_FLUSH_SLOT; \ + RFSCV; \ + b rfscv_flush_fallback + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_EXCEPTION_H */ diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index 2dabcf668292..4cb9efa2eb21 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -128,7 +128,7 @@ name: .if ((start) % (size) != 0); \ .error "Fixed section exception vector misalignment"; \ .endif; \ - .if ((size) != 0x20) && ((size) != 0x80) && ((size) != 0x100); \ + .if ((size) != 0x20) && ((size) != 0x80) && ((size) != 0x100) && ((size) != 0x1000); \ .error "Fixed section exception vector bad size"; \ .endif; \ .if (start) < sname##_start; \ diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 6b03dff61a05..160f3bb77ea4 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -755,6 +755,8 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) #define N_SLINE 68 #define N_SO 100 +#define RFSCV .long 0x4c0000a4 + /* * Create an endian fixup trampoline * diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index ac3970fff0d5..f194339cef3b 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -222,9 +222,14 @@ static inline void set_trap(struct pt_regs *regs, unsigned long val) regs->trap = (regs->trap & TRAP_FLAGS_MASK) | (val & ~TRAP_FLAGS_MASK); } +static inline bool trap_is_scv(struct pt_regs *regs) +{ + return (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && TRAP(regs) == 0x3000); +} + static inline bool trap_is_syscall(struct pt_regs *regs) { - return TRAP(regs) == 0xc00; + return (trap_is_scv(regs) || TRAP(regs) == 0xc00); } static inline bool trap_norestart(struct pt_regs *regs) diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index 65676e2325b8..9efbddee2bca 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -30,12 +30,12 @@ void setup_panic(void); #define ARCH_PANIC_TIMEOUT 180 #ifdef CONFIG_PPC_PSERIES -extern void pseries_enable_reloc_on_exc(void); +extern bool pseries_enable_reloc_on_exc(void); extern void pseries_disable_reloc_on_exc(void); extern void pseries_big_endian_exceptions(void); extern void pseries_little_endian_exceptions(void); #else 
-static inline void pseries_enable_reloc_on_exc(void) {} +static inline bool pseries_enable_reloc_on_exc(void) { return false; } static inline void pseries_disable_reloc_on_exc(void) {} static inline void pseries_big_endian_exceptions(void) {} static inline void pseries_little_endian_exceptions(void) {} diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h index 3b01c69a44aa..eaa4fb6c8960 100644 --- a/arch/powerpc/include/asm/sstep.h +++ b/arch/powerpc/include/asm/sstep.h @@ -40,6 +40,7 @@ enum instruction_type { CACHEOP, BARRIER, SYSCALL, + SYSCALL_VECTORED_0, MFMSR, MTMSR, RFI, diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index efdcfa714106..86527d19348c 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -98,7 +98,7 @@ _GLOBAL(__setup_cpu_power10) _GLOBAL(__setup_cpu_power9) mflr r11 - bl __init_FSCR + bl __init_FSCR_power9 1: bl __init_PMU bl __init_hvmode_206 mtlr r11 @@ -128,7 +128,7 @@ _GLOBAL(__restore_cpu_power10) _GLOBAL(__restore_cpu_power9) mflr r11 - bl __init_FSCR + bl __init_FSCR_power9 1: bl __init_PMU mfmsr r3 rldicl. r0,r3,4,63 @@ -198,6 +198,12 @@ __init_FSCR_power10: mtspr SPRN_FSCR, r3 // fall through +__init_FSCR_power9: + mfspr r3, SPRN_FSCR + ori r3, r3, FSCR_SCV + mtspr SPRN_FSCR, r3 + // fall through + __init_FSCR: mfspr r3,SPRN_FSCR ori r3,r3,FSCR_TAR|FSCR_EBB diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index b4066354f073..3d406a9626e8 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -120,7 +120,8 @@ extern void __restore_cpu_e6500(void); #define COMMON_USER2_POWER9 (COMMON_USER2_POWER8 | \ PPC_FEATURE2_ARCH_3_00 | \ PPC_FEATURE2_HAS_IEEE128 | \ - PPC_FEATURE2_DARN ) + PPC_FEATURE2_DARN | \ + PPC_FEATURE2_SCV) #define COMMON_USER_POWER10 COMMON_USER_POWER9 #define COMMON_USER2_POWER10 (COMMON_USER2_POWER9 | \ PPC_FEATURE2_ARCH_3_1 | \ diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 3a409517c031..50b2d544361e 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -587,6 +587,7 @@ static struct dt_cpu_feature_match __initdata {"little-endian", feat_enable_le, CPU_FTR_REAL_LE}, {"smt", feat_enable_smt, 0}, {"interrupt-facilities", feat_enable, 0}, + {"system-call-vectored", feat_enable, 0}, {"timer-facilities", feat_enable, 0}, {"timer-facilities-v3", feat_enable, 0}, {"debug-facilities", feat_enable, 0}, diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 9d49338e0c85..223c4f008e63 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -64,15 +64,173 @@ exception_marker: .section ".text" .align 7 +#ifdef CONFIG_PPC_BOOK3S +.macro system_call_vectored name trapnr + .globl system_call_vectored_\name +system_call_vectored_\name: +_ASM_NOKPROBE_SYMBOL(system_call_vectored_\name) +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */ + bne .Ltabort_syscall +END_FTR_SECTION_IFSET(CPU_FTR_TM) +#endif + INTERRUPT_TO_KERNEL + mr r10,r1 + ld r1,PACAKSAVE(r13) + std r10,0(r1) + std r11,_NIP(r1) + std r12,_MSR(r1) + std r0,GPR0(r1) + std r10,GPR1(r1) + std r2,GPR2(r1) + ld r2,PACATOC(r13) + mfcr r12 + li r11,0 + /* Can we avoid saving r3-r8 in common case? 
*/ + std r3,GPR3(r1) + std r4,GPR4(r1) + std r5,GPR5(r1) + std r6,GPR6(r1) + std r7,GPR7(r1) + std r8,GPR8(r1) + /* Zero r9-r12, this should only be required when restoring all GPRs */ + std r11,GPR9(r1) + std r11,GPR10(r1) + std r11,GPR11(r1) + std r11,GPR12(r1) + std r9,GPR13(r1) + SAVE_NVGPRS(r1) + std r11,_XER(r1) + std r11,_LINK(r1) + std r11,_CTR(r1) + + li r11,\trapnr + std r11,_TRAP(r1) + std r12,_CCR(r1) + std r3,ORIG_GPR3(r1) + addi r10,r1,STACK_FRAME_OVERHEAD + ld r11,exception_marker@toc(r2) + std r11,-16(r10) /* "regshere" marker */ + + /* + * RECONCILE_IRQ_STATE without calling trace_hardirqs_off(), which + * would clobber syscall parameters. Also we always enter with IRQs + * enabled and nothing pending. system_call_exception() will call + * trace_hardirqs_off(). + * + * scv enters with MSR[EE]=1, so don't set PACA_IRQ_HARD_DIS. The + * entry vector already sets PACAIRQSOFTMASK to IRQS_ALL_DISABLED. + */ + + /* Calling convention has r9 = orig r0, r10 = regs */ + mr r9,r0 + bl system_call_exception + +.Lsyscall_vectored_\name\()_exit: + addi r4,r1,STACK_FRAME_OVERHEAD + li r5,1 /* scv */ + bl syscall_exit_prepare + + ld r2,_CCR(r1) + ld r4,_NIP(r1) + ld r5,_MSR(r1) + +BEGIN_FTR_SECTION + stdcx. r0,0,r1 /* to clear the reservation */ +END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) + +BEGIN_FTR_SECTION + HMT_MEDIUM_LOW +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + + cmpdi r3,0 + bne .Lsyscall_vectored_\name\()_restore_regs + + /* rfscv returns with LR->NIA and CTR->MSR */ + mtlr r4 + mtctr r5 + + /* Could zero these as per ABI, but we may consider a stricter ABI + * which preserves these if libc implementations can benefit, so + * restore them for now until further measurement is done. */ + ld r0,GPR0(r1) + ld r4,GPR4(r1) + ld r5,GPR5(r1) + ld r6,GPR6(r1) + ld r7,GPR7(r1) + ld r8,GPR8(r1) + /* Zero volatile regs that may contain sensitive kernel data */ + li r9,0 + li r10,0 + li r11,0 + li r12,0 + mtspr SPRN_XER,r0 + + /* + * We don't need to restore AMR on the way back to userspace for KUAP. + * The value of AMR only matters while we're in the kernel. + */ + mtcr r2 + ld r2,GPR2(r1) + ld r3,GPR3(r1) + ld r13,GPR13(r1) + ld r1,GPR1(r1) + RFSCV_TO_USER + b . /* prevent speculative execution */ + +.Lsyscall_vectored_\name\()_restore_regs: + li r3,0 + mtmsrd r3,1 + mtspr SPRN_SRR0,r4 + mtspr SPRN_SRR1,r5 + + ld r3,_CTR(r1) + ld r4,_LINK(r1) + ld r5,_XER(r1) + + REST_NVGPRS(r1) + ld r0,GPR0(r1) + mtcr r2 + mtctr r3 + mtlr r4 + mtspr SPRN_XER,r5 + REST_10GPRS(2, r1) + REST_2GPRS(12, r1) + ld r1,GPR1(r1) + RFI_TO_USER +.endm + +system_call_vectored common 0x3000 +/* + * We instantiate another entry copy for the SIGILL variant, with TRAP=0x7ff0 + * which is tested by system_call_exception when r0 is -1 (as set by vector + * entry code). + */ +system_call_vectored sigill 0x7ff0 + + +/* + * Entered via kernel return set up by kernel/sstep.c, must match entry regs + */ + .globl system_call_vectored_emulate +system_call_vectored_emulate: +_ASM_NOKPROBE_SYMBOL(system_call_vectored_emulate) + li r10,IRQS_ALL_DISABLED + stb r10,PACAIRQSOFTMASK(r13) + b system_call_vectored_common +#endif + + .balign IFETCH_ALIGN_BYTES .globl system_call_common system_call_common: +_ASM_NOKPROBE_SYMBOL(system_call_common) #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? 
*/ bne .Ltabort_syscall END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif -_ASM_NOKPROBE_SYMBOL(system_call_common) mr r10,r1 ld r1,PACAKSAVE(r13) std r10,0(r1) @@ -138,6 +296,7 @@ END_BTB_FLUSH_SECTION .Lsyscall_exit: addi r4,r1,STACK_FRAME_OVERHEAD + li r5,0 /* !scv */ bl syscall_exit_prepare ld r2,_CCR(r1) @@ -224,10 +383,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) b . /* prevent speculative execution */ #endif +#ifdef CONFIG_PPC_BOOK3S +_GLOBAL(ret_from_fork_scv) + bl schedule_tail + REST_NVGPRS(r1) + li r3,0 /* fork() return value */ + b .Lsyscall_vectored_common_exit +#endif + _GLOBAL(ret_from_fork) bl schedule_tail REST_NVGPRS(r1) - li r3,0 + li r3,0 /* fork() return value */ b .Lsyscall_exit _GLOBAL(ret_from_kernel_thread) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index f15307c50bc1..8afc0e03d7d4 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -756,6 +756,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) * guarantee they will be delivered virtually. Some conditions (see the ISA) * cause exceptions to be delivered in real mode. * + * The scv instructions are a special case. They get a 0x3000 offset applied. + * scv exceptions have unique reentrancy properties, see below. + * * It's impossible to receive interrupts below 0x300 via AIL. * * KVM: None of the virtual exceptions are from the guest. Anything that @@ -765,8 +768,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) * We layout physical memory as follows: * 0x0000 - 0x00ff : Secondary processor spin code * 0x0100 - 0x18ff : Real mode pSeries interrupt vectors - * 0x1900 - 0x3fff : Real mode trampolines - * 0x4000 - 0x58ff : Relon (IR=1,DR=1) mode pSeries interrupt vectors + * 0x1900 - 0x2fff : Real mode trampolines + * 0x3000 - 0x58ff : Relon (IR=1,DR=1) mode pSeries interrupt vectors * 0x5900 - 0x6fff : Relon mode trampolines * 0x7000 - 0x7fff : FWNMI data area * 0x8000 - .... : Common interrupt handlers, remaining early @@ -777,8 +780,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP) * vectors there. */ OPEN_FIXED_SECTION(real_vectors, 0x0100, 0x1900) -OPEN_FIXED_SECTION(real_trampolines, 0x1900, 0x4000) -OPEN_FIXED_SECTION(virt_vectors, 0x4000, 0x5900) +OPEN_FIXED_SECTION(real_trampolines, 0x1900, 0x3000) +OPEN_FIXED_SECTION(virt_vectors, 0x3000, 0x5900) OPEN_FIXED_SECTION(virt_trampolines, 0x5900, 0x7000) #ifdef CONFIG_PPC_POWERNV @@ -814,6 +817,77 @@ USE_FIXED_SECTION(real_vectors) .globl __start_interrupts __start_interrupts: +/** + * Interrupt 0x3000 - System Call Vectored Interrupt (syscall). + * This is a synchronous interrupt invoked with the "scv" instruction. The + * system call does not alter the HV bit, so it is directed to the OS. + * + * Handling: + * scv instructions enter the kernel without changing EE, RI, ME, or HV. + * In particular, this means we can take a maskable interrupt at any point + * in the scv handler, which is unlike any other interrupt. This is solved + * by treating the instruction addresses below __end_interrupts as being + * soft-masked. + * + * AIL-0 mode scv exceptions go to 0x17000-0x17fff, but we set AIL-3 and + * ensure scv is never executed with relocation off, which means AIL-0 + * should never happen. 
+ * + * Before leaving the below __end_interrupts text, at least of the following + * must be true: + * - MSR[PR]=1 (i.e., return to userspace) + * - MSR_EE|MSR_RI is set (no reentrant exceptions) + * - Standard kernel environment is set up (stack, paca, etc) + * + * Call convention: + * + * syscall register convention is in Documentation/powerpc/syscall64-abi.rst + */ +EXC_VIRT_BEGIN(system_call_vectored, 0x3000, 0x1000) + /* SCV 0 */ + mr r9,r13 + GET_PACA(r13) + mflr r11 + mfctr r12 + li r10,IRQS_ALL_DISABLED + stb r10,PACAIRQSOFTMASK(r13) +#ifdef CONFIG_RELOCATABLE + b system_call_vectored_tramp +#else + b system_call_vectored_common +#endif + nop + + /* SCV 1 - 127 */ + .rept 127 + mr r9,r13 + GET_PACA(r13) + mflr r11 + mfctr r12 + li r10,IRQS_ALL_DISABLED + stb r10,PACAIRQSOFTMASK(r13) + li r0,-1 /* cause failure */ +#ifdef CONFIG_RELOCATABLE + b system_call_vectored_sigill_tramp +#else + b system_call_vectored_sigill +#endif + .endr +EXC_VIRT_END(system_call_vectored, 0x3000, 0x1000) + +#ifdef CONFIG_RELOCATABLE +TRAMP_VIRT_BEGIN(system_call_vectored_tramp) + __LOAD_HANDLER(r10, system_call_vectored_common) + mtctr r10 + bctr + +TRAMP_VIRT_BEGIN(system_call_vectored_sigill_tramp) + __LOAD_HANDLER(r10, system_call_vectored_sigill) + mtctr r10 + bctr +#endif + + /* No virt vectors corresponding with 0x0..0x100 */ EXC_VIRT_NONE(0x4000, 0x100) @@ -2963,6 +3037,47 @@ TRAMP_REAL_BEGIN(hrfi_flush_fallback) GET_SCRATCH0(r13); hrfid +TRAMP_REAL_BEGIN(rfscv_flush_fallback) + /* system call volatile */ + mr r7,r13 + GET_PACA(r13); + mr r8,r1 + ld r1,PACAKSAVE(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SIZE(r13) + srdi r11,r11,(7 + 3) /* 128 byte lines, unrolled 8x */ + mtctr r11 + DCBT_BOOK3S_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + sync + + /* + * The load adresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). 
+ */ +1: + ld r11,(0x80 + 8)*0(r10) + ld r11,(0x80 + 8)*1(r10) + ld r11,(0x80 + 8)*2(r10) + ld r11,(0x80 + 8)*3(r10) + ld r11,(0x80 + 8)*4(r10) + ld r11,(0x80 + 8)*5(r10) + ld r11,(0x80 + 8)*6(r10) + ld r11,(0x80 + 8)*7(r10) + addi r10,r10,0x80*8 + bdnz 1b + + mtctr r9 + li r9,0 + li r10,0 + li r11,0 + mr r1,r8 + mr r13,r7 + RFSCV + USE_TEXT_SECTION() MASKED_INTERRUPT MASKED_INTERRUPT hsrr=1 diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 4650b9bb217f..939935613cf1 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1599,6 +1599,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long usp, { struct pt_regs *childregs, *kregs; extern void ret_from_fork(void); + extern void ret_from_fork_scv(void); extern void ret_from_kernel_thread(void); void (*f)(void); unsigned long sp = (unsigned long)task_stack_page(p) + THREAD_SIZE; @@ -1635,7 +1636,9 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long usp, if (usp) childregs->gpr[1] = usp; p->thread.regs = childregs; - childregs->gpr[3] = 0; /* Result from fork() */ + /* 64s sets this in ret_from_fork */ + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + childregs->gpr[3] = 0; /* Result from fork() */ if (clone_flags & CLONE_SETTLS) { if (!is_32bit_task()) childregs->gpr[13] = tls; @@ -1643,7 +1646,10 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long usp, childregs->gpr[2] = tls; } - f = ret_from_fork; + if (trap_is_scv(regs)) + f = ret_from_fork_scv; + else + f = ret_from_fork; } childregs->msr &= ~(MSR_FP|MSR_VEC|MSR_VSX); sp -= STACK_FRAME_OVERHEAD; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 0ba1ed77dc68..6be430107c6f 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -196,7 +196,10 @@ static void __init configure_exceptions(void) /* Under a PAPR hypervisor, we need hypercalls */ if (firmware_has_feature(FW_FEATURE_SET_MODE)) { /* Enable AIL if possible */ - pseries_enable_reloc_on_exc(); + if (!pseries_enable_reloc_on_exc()) { + init_task.thread.fscr &= ~FSCR_SCV; + cur_cpu_spec->cpu_user_features2 &= ~PPC_FEATURE2_SCV; + } /* * Tell the hypervisor that we want our exceptions to diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index b4143b6ff093..d15a98c758b8 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -205,8 +205,14 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka, return; /* error signalled ? */ - if (!(regs->ccr & 0x10000000)) + if (trap_is_scv(regs)) { + /* 32-bit compat mode sign extend? 
*/ + if (!IS_ERR_VALUE(ret)) + return; + ret = -ret; + } else if (!(regs->ccr & 0x10000000)) { return; + } switch (ret) { case ERESTART_RESTARTBLOCK: @@ -239,9 +245,14 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka, regs->nip -= 4; regs->result = 0; } else { - regs->result = -EINTR; - regs->gpr[3] = EINTR; - regs->ccr |= 0x10000000; + if (trap_is_scv(regs)) { + regs->result = -EINTR; + regs->gpr[3] = -EINTR; + } else { + regs->result = -EINTR; + regs->gpr[3] = EINTR; + regs->ccr |= 0x10000000; + } } } diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c index 79edba3ab312..a783fd324cb0 100644 --- a/arch/powerpc/kernel/syscall_64.c +++ b/arch/powerpc/kernel/syscall_64.c @@ -60,6 +60,11 @@ notrace long system_call_exception(long r3, long r4, long r5, local_irq_enable(); if (unlikely(current_thread_info()->flags & _TIF_SYSCALL_DOTRACE)) { + if (unlikely(regs->trap == 0x7ff0)) { + /* Unsupported scv vector */ + _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); + return regs->gpr[3]; + } /* * We use the return value of do_syscall_trace_enter() as the * syscall number. If the syscall was rejected for any reason @@ -78,6 +83,11 @@ notrace long system_call_exception(long r3, long r4, long r5, r8 = regs->gpr[8]; } else if (unlikely(r0 >= NR_syscalls)) { + if (unlikely(regs->trap == 0x7ff0)) { + /* Unsupported scv vector */ + _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); + return regs->gpr[3]; + } return -ENOSYS; } @@ -105,16 +115,20 @@ notrace long system_call_exception(long r3, long r4, long r5, * local irqs must be disabled. Returns false if the caller must re-enable * them, check for new work, and try again. */ -static notrace inline bool prep_irq_for_enabled_exit(void) +static notrace inline bool prep_irq_for_enabled_exit(bool clear_ri) { /* This must be done with RI=1 because tracing may touch vmaps */ trace_hardirqs_on(); /* This pattern matches prep_irq_for_idle */ - __hard_EE_RI_disable(); + if (clear_ri) + __hard_EE_RI_disable(); + else + __hard_irq_disable(); if (unlikely(lazy_irq_pending_nocheck())) { /* Took an interrupt, may have more exit work to do. */ - __hard_RI_enable(); + if (clear_ri) + __hard_RI_enable(); trace_hardirqs_off(); local_paca->irq_happened |= PACA_IRQ_HARD_DIS; @@ -136,7 +150,8 @@ static notrace inline bool prep_irq_for_enabled_exit(void) * because RI=0 and soft mask state is "unreconciled", so it is marked notrace. 
*/
notrace unsigned long syscall_exit_prepare(unsigned long r3,
- struct pt_regs *regs)
+ struct pt_regs *regs,
+ long scv)
{
 unsigned long *ti_flagsp = &current_thread_info()->flags;
 unsigned long ti_flags;

@@ -151,7 +166,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3,

 ti_flags = *ti_flagsp;

- if (unlikely(r3 >= (unsigned long)-MAX_ERRNO)) {
+ if (unlikely(r3 >= (unsigned long)-MAX_ERRNO) && !scv) {
 if (likely(!(ti_flags & (_TIF_NOERROR | _TIF_RESTOREALL)))) {
 r3 = -r3;
 regs->ccr |= 0x10000000; /* Set SO bit in CR */

@@ -211,7 +226,8 @@ again:
 }
 }

- if (unlikely(!prep_irq_for_enabled_exit())) {
+ /* scv need not set RI=0 because SRRs are not used */
+ if (unlikely(!prep_irq_for_enabled_exit(!scv))) {
 local_irq_enable();
 goto again;
 }

@@ -282,7 +298,7 @@ again:
 }
 }

- if (unlikely(!prep_irq_for_enabled_exit())) {
+ if (unlikely(!prep_irq_for_enabled_exit(true))) {
 local_irq_enable();
 local_irq_disable();
 goto again;

@@ -345,7 +361,7 @@ again:
 }
 }

- if (unlikely(!prep_irq_for_enabled_exit())) {
+ if (unlikely(!prep_irq_for_enabled_exit(true))) {
 /*
 * Can't local_irq_restore to replay if we were in
 * interrupt context. Must replay directly.
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 5abe98216dc2..ab6a137c3862 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -16,6 +16,7 @@
 #include

 extern char system_call_common[];
+extern char system_call_vectored_emulate[];

 #ifdef CONFIG_PPC64
 /* Bits in SRR1 that are copied from MSR */
@@ -1236,6 +1237,9 @@ int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 case 17: /* sc */
 if ((word & 0xfe2) == 2)
 op->type = SYSCALL;
+ else if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) &&
+ (word & 0xfe3) == 1)
+ op->type = SYSCALL_VECTORED_0;
 else
 op->type = UNKNOWN;
 return 0;
@@ -3378,6 +3382,18 @@ int emulate_step(struct pt_regs *regs, struct ppc_inst instr)
 regs->msr = MSR_KERNEL;
 return 1;

+#ifdef CONFIG_PPC_BOOK3S_64
+ case SYSCALL_VECTORED_0: /* scv 0 */
+ regs->gpr[9] = regs->gpr[13];
+ regs->gpr[10] = MSR_KERNEL;
+ regs->gpr[11] = regs->nip + 4;
+ regs->gpr[12] = regs->msr & MSR_MASK;
+ regs->gpr[13] = (unsigned long) get_paca();
+ regs->nip = (unsigned long) &system_call_vectored_emulate;
+ regs->msr = MSR_KERNEL;
+ return 1;
+#endif
+
 case RFI:
 return -1;
#endif
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 2db8469e475f..8c85466e0dd8 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -358,7 +358,7 @@ static void pseries_lpar_idle(void)
 * to ever be a problem in practice we can move this into a kernel thread to
 * finish off the process later in boot.
*/ -void pseries_enable_reloc_on_exc(void) +bool pseries_enable_reloc_on_exc(void) { long rc; unsigned int delay, total_delay = 0; @@ -369,11 +369,13 @@ void pseries_enable_reloc_on_exc(void) if (rc == H_P2) { pr_info("Relocation on exceptions not" " supported\n"); + return false; } else if (rc != H_SUCCESS) { pr_warn("Unable to enable relocation" " on exceptions: %ld\n", rc); + return false; } - break; + return true; } delay = get_longbusy_msecs(rc); @@ -382,7 +384,7 @@ void pseries_enable_reloc_on_exc(void) pr_warn("Warning: Giving up waiting to enable " "relocation on exceptions (%u msec)!\n", total_delay); - return; + return false; } mdelay(delay); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 7efe4bc3ccf6..3203c3606737 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1593,6 +1593,7 @@ const char *getvecname(unsigned long vec) case 0x1300: ret = "(Instruction Breakpoint)"; break; case 0x1500: ret = "(Denormalisation)"; break; case 0x1700: ret = "(Altivec Assist)"; break; + case 0x3000: ret = "(System Call Vectored)"; break; default: ret = ""; } return ret; -- cgit v1.2.3 From 2384b36f9156c3b815a5ce5f694edc5054ab7625 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 16 Jul 2020 11:35:22 +1000 Subject: powerpc: Select ARCH_HAS_MEMBARRIER_SYNC_CORE powerpc return from interrupt and return from system call sequences are context synchronising. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20200716013522.338318-1-npiggin@gmail.com --- .../features/sched/membarrier-sync-core/arch-support.txt | 4 ++-- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/exception-64e.h | 6 +++++- arch/powerpc/include/asm/exception-64s.h | 8 ++++++++ arch/powerpc/kernel/entry_32.S | 6 ++++++ 5 files changed, 22 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt index 8a521a622966..52ad74a25f54 100644 --- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt +++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt @@ -5,7 +5,7 @@ # # Architecture requirements # -# * arm/arm64 +# * arm/arm64/powerpc # # Rely on implicit context synchronization as a result of exception return # when returning from IPI handler, and when returning to user-space. 
@@ -45,7 +45,7 @@
 | nios2: | TODO |
 | openrisc: | TODO |
 | parisc: | TODO |
- | powerpc: | TODO |
+ | powerpc: | ok |
 | riscv: | TODO |
 | s390: | TODO |
 | sh: | TODO |
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 70adf69f8b1a..81c0dee1cbff 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -131,6 +131,7 @@ config PPC
 select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64
 select ARCH_HAS_PTE_SPECIAL
 select ARCH_HAS_MEMBARRIER_CALLBACKS
+ select ARCH_HAS_MEMBARRIER_SYNC_CORE
 select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64
 select ARCH_HAS_STRICT_KERNEL_RWX if (PPC32 && !HIBERNATION)
 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h
index 54a98ef7d7fe..72b6657acd2d 100644
--- a/arch/powerpc/include/asm/exception-64e.h
+++ b/arch/powerpc/include/asm/exception-64e.h
@@ -204,7 +204,11 @@ exc_##label##_book3e:
 LOAD_REG_ADDR(r3,interrupt_base_book3e);\
 ori r3,r3,vector_offset@l; \
 mtspr SPRN_IVOR##vector_number,r3;
-
+/*
+ * powerpc relies on return from interrupt/syscall being context synchronising
+ * (which rfi is) to support ARCH_HAS_MEMBARRIER_SYNC_CORE without additional
+ * synchronisation instructions.
+ */
 #define RFI_TO_KERNEL \
 rfi
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index 47bd4ea0837d..d7a1a427a690 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -68,6 +68,14 @@
 *
 * The nop instructions allow us to insert one or more instructions to flush the
 * L1-D cache when returning to userspace or a guest.
+ *
+ * powerpc relies on return from interrupt/syscall being context synchronising
+ * (which hrfid, rfid, and rfscv are) to support ARCH_HAS_MEMBARRIER_SYNC_CORE
+ * without additional synchronisation instructions.
+ *
+ * soft-masked interrupt replay does not include a context-synchronising rfid,
+ * but those always return to kernel, the sync is only required when returning
+ * to user.
 */
 #define RFI_FLUSH_SLOT \
 RFI_FLUSH_FIXUP_SECTION; \
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 217ebdf5b00b..f4d0af8e1136 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -35,6 +35,12 @@
 #include "head_32.h"

+/*
+ * powerpc relies on return from interrupt/syscall being context synchronising
+ * (which rfi is) to support ARCH_HAS_MEMBARRIER_SYNC_CORE without additional
+ * synchronisation instructions.
+ */
+
 /*
 * Align to 4k in order to ensure that all functions modifying srr0/srr1
 * fit into one page in order to not encounter a TLB miss between the
--
cgit v1.2.3

From bf6b7661f41615c0815fce0a3f27acb5fc005470 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V"
Date: Mon, 27 Jul 2020 14:29:08 +0530
Subject: powerpc/book3s64/radix: Add kernel command line option to disable radix GTSE

This adds a kernel command line option that can be used to disable GTSE
support. Disabling GTSE implies the kernel will make hcalls to invalidate
TLB entries.

This was done so that we can do VM migration between configs that
enable/disable GTSE support via the hypervisor. To migrate a VM from a system
that supports GTSE to a system that doesn't, we can boot the guest with
radix_hcall_invalidate=on, thereby forcing the guest to use hcalls for TLB
invalidates.

The check for hcall availability is done in pSeries_setup_arch so that
the panic message appears on the console.
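In outline, that check amounts to something like the sketch below. This is a
simplified rendering for illustration only, not a literal copy of the patch:
the helper name check_radix_invalidate_support() is made up, and the real test
lives inline in pSeries_setup_arch() (see the setup.c hunk further down). It
uses only the feature tests this series relies on:

	#include <linux/kernel.h>	/* panic() */
	#include <asm/firmware.h>	/* firmware_has_feature() */
	#include <asm/mmu.h>		/* mmu_has_feature(), radix_enabled() */

	static void __init check_radix_invalidate_support(void)
	{
		/*
		 * Radix with GTSE disabled can only invalidate translations
		 * through the hypervisor, so the hcall must be available.
		 */
		if (radix_enabled() && !mmu_has_feature(MMU_FTR_GTSE) &&
		    !firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
			panic("BUG: Radix support requires either GTSE or RPT_INVALIDATE\n");
	}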
This should only happen on a hypervisor that doesn't force the guest to hash
translation even though it can't handle the radix GTSE=0 request via CAS.
With radix_hcall_invalidate=on, if the hypervisor doesn't support the
hcall_rpt_invalidate hcall, it should force the LPAR to hash translation.

Signed-off-by: Aneesh Kumar K.V
Tested-by: Bharata B Rao
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20200727085908.420806-1-aneesh.kumar@linux.ibm.com
---
 Documentation/admin-guide/kernel-parameters.txt | 4 ++++
 arch/powerpc/include/asm/firmware.h | 4 +++-
 arch/powerpc/kernel/prom_init.c | 13 +++++++++----
 arch/powerpc/mm/init_64.c | 1 -
 arch/powerpc/platforms/pseries/firmware.c | 1 +
 arch/powerpc/platforms/pseries/setup.c | 5 +++++
 6 files changed, 22 insertions(+), 6 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb95fad81c79..3ab61cd0f89c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -896,6 +896,10 @@
 disable_radix [PPC]
 Disable RADIX MMU mode on POWER9

+ radix_hcall_invalidate=on [PPC/PSERIES]
+ Disable RADIX GTSE feature and use hcall for TLB
+ invalidate.
+
 disable_tlbie [PPC]
 Disable TLBIE instruction. Currently does not work
 with KVM, with HASH MMU, or with coherent accelerators.
diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h
index f67efbaba17f..0b295bdb201e 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -52,6 +52,7 @@
 #define FW_FEATURE_PAPR_SCM ASM_CONST(0x0000002000000000)
 #define FW_FEATURE_ULTRAVISOR ASM_CONST(0x0000004000000000)
 #define FW_FEATURE_STUFF_TCE ASM_CONST(0x0000008000000000)
+#define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000)

 #ifndef __ASSEMBLY__

@@ -71,7 +72,8 @@ enum {
 FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
 FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 |
 FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE |
- FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR,
+ FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR |
+ FW_FEATURE_RPT_INVALIDATE,
 FW_FEATURE_PSERIES_ALWAYS = 0,
 FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR,
 FW_FEATURE_POWERNV_ALWAYS = 0,
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index cbc605cfdec0..f279a1f58fa7 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -169,6 +169,7 @@ static unsigned long __prombss prom_tce_alloc_end;

 #ifdef CONFIG_PPC_PSERIES
 static bool __prombss prom_radix_disable;
+static bool __prombss prom_radix_gtse_disable;
 static bool __prombss prom_xive_disable;
 #endif

@@ -823,6 +824,12 @@ static void __init early_cmdline_parse(void)
 if (prom_radix_disable)
 prom_debug("Radix disabled from cmdline\n");

+ opt = prom_strstr(prom_cmd_line, "radix_hcall_invalidate=on");
+ if (opt) {
+ prom_radix_gtse_disable = true;
+ prom_debug("Radix GTSE disabled from cmdline\n");
+ }
+
 opt = prom_strstr(prom_cmd_line, "xive=off");
 if (opt) {
 prom_xive_disable = true;
@@ -1285,10 +1292,8 @@ static void __init prom_parse_platform_support(u8 index, u8 val,
 prom_parse_mmu_model(val & OV5_FEAT(OV5_MMU_SUPPORT), support);
 break;
 case OV5_INDX(OV5_RADIX_GTSE): /* Radix Extensions */
- if (val & OV5_FEAT(OV5_RADIX_GTSE)) {
- prom_debug("Radix - GTSE supported\n");
- support->radix_gtse = true;
- }
+ if (val & OV5_FEAT(OV5_RADIX_GTSE))
+ support->radix_gtse = !prom_radix_gtse_disable;
 break;
case OV5_INDX(OV5_XIVE_SUPPORT): /* Interrupt mode */
 prom_parse_xive_model(val & OV5_FEAT(OV5_XIVE_SUPPORT),
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 152aa0200cef..4ae5fc0ceb30 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -406,7 +406,6 @@ static void __init early_check_vec5(void)
 }
 if (!(vec5[OV5_INDX(OV5_RADIX_GTSE)] &
 OV5_FEAT(OV5_RADIX_GTSE))) {
- pr_warn("WARNING: Hypervisor doesn't support RADIX with GTSE\n");
 cur_cpu_spec->mmu_features &= ~MMU_FTR_GTSE;
 } else
 cur_cpu_spec->mmu_features |= MMU_FTR_GTSE;
diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c
index 3e49cc23a97a..4c7b7f5a2ebc 100644
--- a/arch/powerpc/platforms/pseries/firmware.c
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -65,6 +65,7 @@ hypertas_fw_features_table[] = {
 {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"},
 {FW_FEATURE_BLOCK_REMOVE, "hcall-block-remove"},
 {FW_FEATURE_PAPR_SCM, "hcall-scm"},
+ {FW_FEATURE_RPT_INVALIDATE, "hcall-rpt-invalidate"},
 };

 /* Build up the firmware features bitmask using the contents of
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index e29c9bf0a3b9..b1a8c72a8c12 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -747,6 +747,11 @@ static void __init pSeries_setup_arch(void)

 smp_init_pseries();

+ if (radix_enabled() && !mmu_has_feature(MMU_FTR_GTSE))
+ if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
+ panic("BUG: Radix support requires either GTSE or RPT_INVALIDATE\n");
+
+
 /* openpic global configuration register (64-bit format). */
 /* openpic Interrupt Source Unit pointer (64-bit format). */
 /* python0 facility area (mmio) (64-bit format) REAL address. */
--
cgit v1.2.3

From 2d02bf835e5731de632c8a13567905fa7c0da01c Mon Sep 17 00:00:00 2001
From: Vaibhav Jain
Date: Fri, 31 Jul 2020 12:11:52 +0530
Subject: powerpc/papr_scm: Fetch nvdimm performance stats from PHYP

Update papr_scm.c to query dimm performance statistics from PHYP via the
H_SCM_PERFORMANCE_STATS hcall and export them to user-space as a PAPR-specific
NVDIMM attribute 'perf_stats' in sysfs. The patch also provides sysfs ABI
documentation for the stats being reported and their meanings.

At NVDIMM probe time in papr_scm_nvdimm_init() a special variant of the
H_SCM_PERFORMANCE_STATS hcall is issued to check whether collection of
performance statistics is supported. If successful, PHYP returns the maximum
possible buffer length needed to read all performance stats. This returned
value is stored in a per-nvdimm attribute 'stat_buffer_len'.

The layout of the request buffer for reading NVDIMM performance stats from
PHYP is defined in 'struct papr_scm_perf_stats' and
'struct papr_scm_perf_stat'. These structs are used in the newly introduced
drc_pmem_query_stats() that issues the H_SCM_PERFORMANCE_STATS hcall.

The sysfs access function perf_stats_show() uses the value 'stat_buffer_len'
to allocate a buffer large enough to hold all possible NVDIMM performance
stats and passes it to drc_pmem_query_stats() to populate. Finally, the
statistics reported in the buffer are formatted into the sysfs access
function's output buffer.
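As a rough illustration of that buffer layout, the walk below assumes only the
struct papr_scm_perf_stats / struct papr_scm_perf_stat definitions introduced
by this patch (see the papr_scm.c hunk that follows); dump_perf_stats() is a
hypothetical helper, not part of the patch:

	/* Sketch: walk a buffer filled in by H_SCM_PERFORMANCE_STATS. */
	static void dump_perf_stats(const struct papr_scm_perf_stats *stats)
	{
		const struct papr_scm_perf_stat *stat = stats->scm_statistic;
		u32 i, n = be32_to_cpu(stats->num_statistics);

		for (i = 0; i < n; i++, stat++)
			/* stat_id is a fixed 8-byte, space-padded tag */
			pr_debug("%.8s = 0x%016llX\n", stat->stat_id,
				 be64_to_cpu(stat->stat_val));
	}

This mirrors what perf_stats_show() below does with a seq_buf, and is why each
line of the sysfs attribute has the form "<stat-id> = 0x<value>".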
Signed-off-by: Vaibhav Jain
Signed-off-by: Michael Ellerman
Link: https://lore.kernel.org/r/20200731064153.182203-2-vaibhav@linux.ibm.com
---
 Documentation/ABI/testing/sysfs-bus-papr-pmem | 27 +++++
 arch/powerpc/platforms/pseries/papr_scm.c | 150 ++++++++++++++++++++++++++
 2 files changed, 177 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/ABI/testing/sysfs-bus-papr-pmem b/Documentation/ABI/testing/sysfs-bus-papr-pmem
index 5b10d036a8d4..c1a67275c43f 100644
--- a/Documentation/ABI/testing/sysfs-bus-papr-pmem
+++ b/Documentation/ABI/testing/sysfs-bus-papr-pmem
@@ -25,3 +25,30 @@ Description:
 NVDIMM have been scrubbed.
 * "locked" : Indicating that NVDIMM contents cant
 be modified until next power cycle.
+
+What: /sys/bus/nd/devices/nmemX/papr/perf_stats
+Date: May, 2020
+KernelVersion: v5.9
+Contact: linuxppc-dev , linux-nvdimm@lists.01.org,
+Description:
+ (RO) Report various performance stats related to papr-scm NVDIMM
+ device. Each stat is reported on a new line with each line
+ composed of a stat-identifier followed by its value. Below are
+ currently known dimm performance stats which are reported:
+
+ * "CtlResCt" : Controller Reset Count
+ * "CtlResTm" : Controller Reset Elapsed Time
+ * "PonSecs " : Power-on Seconds
+ * "MemLife " : Life Remaining
+ * "CritRscU" : Critical Resource Utilization
+ * "HostLCnt" : Host Load Count
+ * "HostSCnt" : Host Store Count
+ * "HostSDur" : Host Store Duration
+ * "HostLDur" : Host Load Duration
+ * "MedRCnt " : Media Read Count
+ * "MedWCnt " : Media Write Count
+ * "MedRDur " : Media Read Duration
+ * "MedWDur " : Media Write Duration
+ * "CchRHCnt" : Cache Read Hit Count
+ * "CchWHCnt" : Cache Write Hit Count
+ * "FastWCnt" : Fast Write Count
\ No newline at end of file
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index 3d1235a76ba9..f37f3f70007d 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -64,6 +64,26 @@
 PAPR_PMEM_HEALTH_FATAL | \
 PAPR_PMEM_HEALTH_UNHEALTHY)

+#define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS)
+#define PAPR_SCM_PERF_STATS_VERSION 0x1
+
+/* Struct holding a single performance metric */
+struct papr_scm_perf_stat {
+ u8 stat_id[8];
+ __be64 stat_val;
+} __packed;
+
+/* Struct exchanged between kernel and PHYP for fetching drc perf stats */
+struct papr_scm_perf_stats {
+ u8 eye_catcher[8];
+ /* Should be PAPR_SCM_PERF_STATS_VERSION */
+ __be32 stats_version;
+ /* Number of stats following */
+ __be32 num_statistics;
+ /* zero or more performance metrics */
+ struct papr_scm_perf_stat scm_statistic[];
+} __packed;
+
 /* private struct associated with each region */
 struct papr_scm_priv {
 struct platform_device *pdev;
@@ -92,6 +112,9 @@ struct papr_scm_priv {

 /* Health information for the dimm */
 u64 health_bitmap;
+
+ /* length of the stat buffer as expected by phyp */
+ size_t stat_buffer_len;
 };

 static LIST_HEAD(papr_nd_regions);
@@ -200,6 +223,79 @@ err_out:
 return drc_pmem_bind(p);
 }

+/*
+ * Query the Dimm performance stats from PHYP and copy them (if returned) to
+ * provided struct papr_scm_perf_stats instance 'stats' that can hold at least
+ * (num_stats + header) bytes.
+ * - If buff_stats == NULL the return value is the size in bytes of the buffer
+ * needed to hold all supported performance-statistics.
+ * - If buff_stats != NULL and num_stats == 0 then we copy all known
+ * performance-statistics to 'buff_stats' and expect it to be large enough to
+ * hold them.
+ * - if buff_stats != NULL and num_stats > 0 then copy the requested + * performance-statistics to buff_stats. + */ +static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p, + struct papr_scm_perf_stats *buff_stats, + unsigned int num_stats) +{ + unsigned long ret[PLPAR_HCALL_BUFSIZE]; + size_t size; + s64 rc; + + /* Setup the out buffer */ + if (buff_stats) { + memcpy(buff_stats->eye_catcher, + PAPR_SCM_PERF_STATS_EYECATCHER, 8); + buff_stats->stats_version = + cpu_to_be32(PAPR_SCM_PERF_STATS_VERSION); + buff_stats->num_statistics = + cpu_to_be32(num_stats); + + /* + * Calculate the buffer size based on num-stats provided + * or use the prefetched max buffer length + */ + if (num_stats) + /* Calculate size from the num_stats */ + size = sizeof(struct papr_scm_perf_stats) + + num_stats * sizeof(struct papr_scm_perf_stat); + else + size = p->stat_buffer_len; + } else { + /* In case of no out buffer ignore the size */ + size = 0; + } + + /* Do the HCALL asking PHYP for info */ + rc = plpar_hcall(H_SCM_PERFORMANCE_STATS, ret, p->drc_index, + buff_stats ? virt_to_phys(buff_stats) : 0, + size); + + /* Check if the error was due to an unknown stat-id */ + if (rc == H_PARTIAL) { + dev_err(&p->pdev->dev, + "Unknown performance stats, Err:0x%016lX\n", ret[0]); + return -ENOENT; + } else if (rc != H_SUCCESS) { + dev_err(&p->pdev->dev, + "Failed to query performance stats, Err:%lld\n", rc); + return -EIO; + + } else if (!size) { + /* Handle case where stat buffer size was requested */ + dev_dbg(&p->pdev->dev, + "Performance stats size %ld\n", ret[0]); + return ret[0]; + } + + /* Successfully fetched the requested stats from phyp */ + dev_dbg(&p->pdev->dev, + "Performance stats returned %d stats\n", + be32_to_cpu(buff_stats->num_statistics)); + return 0; +} + /* * Issue hcall to retrieve dimm health info and populate papr_scm_priv with the * health information. @@ -637,6 +733,48 @@ static int papr_scm_ndctl(struct nvdimm_bus_descriptor *nd_desc, return 0; } +static ssize_t perf_stats_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int index, rc; + struct seq_buf s; + struct papr_scm_perf_stat *stat; + struct papr_scm_perf_stats *stats; + struct nvdimm *dimm = to_nvdimm(dev); + struct papr_scm_priv *p = nvdimm_provider_data(dimm); + + if (!p->stat_buffer_len) + return -ENOENT; + + /* Allocate the buffer for phyp where stats are written */ + stats = kzalloc(p->stat_buffer_len, GFP_KERNEL); + if (!stats) + return -ENOMEM; + + /* Ask phyp to return all dimm perf stats */ + rc = drc_pmem_query_stats(p, stats, 0); + if (rc) + goto free_stats; + /* + * Go through the returned output buffer and print stats and + * values. Since stat_id is essentially a char string of + * 8 bytes, simply use the string format specifier to print it. + */ + seq_buf_init(&s, buf, PAGE_SIZE); + for (index = 0, stat = stats->scm_statistic; + index < be32_to_cpu(stats->num_statistics); + ++index, ++stat) { + seq_buf_printf(&s, "%.8s = 0x%016llX\n", + stat->stat_id, + be64_to_cpu(stat->stat_val)); + } + +free_stats: + kfree(stats); + return rc ? 
rc : seq_buf_used(&s);
+}
+DEVICE_ATTR_RO(perf_stats);
+
 static ssize_t flags_show(struct device *dev,
 struct device_attribute *attr, char *buf)
 {
@@ -682,6 +820,7 @@ DEVICE_ATTR_RO(flags);
 /* papr_scm specific dimm attributes */
 static struct attribute *papr_nd_attributes[] = {
 &dev_attr_flags.attr,
+ &dev_attr_perf_stats.attr,
 NULL,
 };

@@ -702,6 +841,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 struct nd_region_desc ndr_desc;
 unsigned long dimm_flags;
 int target_nid, online_nid;
+ ssize_t stat_size;

 p->bus_desc.ndctl = papr_scm_ndctl;
 p->bus_desc.module = THIS_MODULE;
@@ -769,6 +909,16 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
 list_add_tail(&p->region_list, &papr_nd_regions);
 mutex_unlock(&papr_ndr_lock);

+ /* Try retrieving the stat buffer and see if it's supported */
+ stat_size = drc_pmem_query_stats(p, NULL, 0);
+ if (stat_size > 0) {
+ p->stat_buffer_len = stat_size;
+ dev_dbg(&p->pdev->dev, "Max perf-stat size %lu-bytes\n",
+ p->stat_buffer_len);
+ } else {
+ dev_info(&p->pdev->dev, "Dimm performance stats unavailable\n");
+ }
+
 return 0;

err:
 nvdimm_bus_unregister(p->bus);
--
cgit v1.2.3
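For completeness, the new attribute needs nothing more than a file read from
user space. A minimal sketch follows; "nmem0" is a placeholder device name,
and the attribute is only present when the hypervisor supports the stats
hcall:

	#include <stdio.h>

	int main(void)
	{
		char line[128];
		FILE *f = fopen("/sys/bus/nd/devices/nmem0/papr/perf_stats", "r");

		if (!f) {
			perror("perf_stats");
			return 1;
		}
		/* One stat per line, e.g. "MemLife  = 0x00000000000000FA" */
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}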