From 4a5b69464e51f4a8dd432e8c2a1468630df1a53c Mon Sep 17 00:00:00 2001 From: Julien Grall Date: Tue, 28 Jul 2015 10:10:42 +0100 Subject: xen/events: Support event channel rebind on ARM Currently, the event channel rebind code is gated on the presence of the vector callback. The virtual interrupt controller on ARM has the concept of per-CPU interrupts (PPIs), which allow us to support per-VCPU event channels. Therefore there is no need for a vector callback on ARM. Xen is already using a free PPI to notify the guest VCPU of an event. Furthermore, the Xen initialization code in Linux (see arch/arm/xen/enlighten.c) correctly requests a per-CPU IRQ. Introduce a new helper, xen_support_evtchn_rebind, to let each architecture decide whether rebinding an event channel is supported. It will always return true on ARM and keep the same behavior on x86. This also allows us to drop the usage of xen_have_vector_callback entirely in the ARM code. Signed-off-by: Julien Grall Signed-off-by: David Vrabel --- include/xen/events.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/xen/events.h b/include/xen/events.h index 7d95fdf9cf3e..88da2abaf535 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -92,7 +92,6 @@ void xen_hvm_callback_vector(void); #ifdef CONFIG_TRACING #define trace_xen_hvm_callback_vector xen_hvm_callback_vector #endif -extern int xen_have_vector_callback; int xen_set_callback_via(uint64_t via); void xen_evtchn_do_upcall(struct pt_regs *regs); void xen_hvm_evtchn_do_upcall(void); -- cgit v1.2.3 From 17fb46b1190b677a37cdd636e2aa30052109f51b Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:22 +0200 Subject: xen: sync with xen headers Use the newest headers from the xen tree to get some new structure layouts. 
Signed-off-by: Juergen Gross Reviewed-by: David Vrabel Acked-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- arch/x86/include/asm/xen/interface.h | 96 ++++++++++++++++++++++++++++++++---- include/xen/interface/xen.h | 35 +++++++------ 2 files changed, 107 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 3400dbaec3c3..3b88eeacdbda 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -3,12 +3,38 @@ * * Guest OS interface to x86 Xen. * - * Copyright (c) 2004, K A Fraser + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser */ #ifndef _ASM_X86_XEN_INTERFACE_H #define _ASM_X86_XEN_INTERFACE_H +/* + * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field + * in a struct in memory. 
+ * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an + * hypercall argument. + * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but + * they might not be on other architectures. + */ #ifdef __XEN__ #define __DEFINE_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name @@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t); * start of the GDT because some stupid OSes export hard-coded selector values * in their ABI. These hard-coded values are always near the start of the GDT, * so Xen places itself out of the way, at the far end of the GDT. + * + * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op */ #define FIRST_RESERVED_GDT_PAGE 14 #define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) /* - * Send an array of these to HYPERVISOR_set_trap_table() + * Send an array of these to HYPERVISOR_set_trap_table(). + * Terminate the array with a sentinel entry, with traps[].address==0. * The privilege level specifies which modes may enter a trap via a software * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate * privilege levels as follows: @@ -118,10 +147,41 @@ struct trap_info { DEFINE_GUEST_HANDLE_STRUCT(trap_info); struct arch_shared_info { - unsigned long max_pfn; /* max pfn that appears in table */ - /* Frame containing list of mfns containing list of mfns containing p2m. */ - unsigned long pfn_to_mfn_frame_list_list; - unsigned long nmi_reason; + /* + * Number of valid entries in the p2m table(s) anchored at + * pfn_to_mfn_frame_list_list and/or p2m_vaddr. + */ + unsigned long max_pfn; + /* + * Frame containing list of mfns containing list of mfns containing p2m. + * A value of 0 indicates it has not yet been set up, ~0 indicates it + * has been set to invalid e.g. due to the p2m being too large for the + * 3-level p2m tree. 
In this case the linear mapper p2m list anchored + * at p2m_vaddr is to be used. + */ + xen_pfn_t pfn_to_mfn_frame_list_list; + unsigned long nmi_reason; + /* + * Following three fields are valid if p2m_cr3 contains a value + * different from 0. + * p2m_cr3 is the root of the address space where p2m_vaddr is valid. + * p2m_cr3 is in the same format as a cr3 value in the vcpu register + * state and holds the folded machine frame number (via xen_pfn_to_cr3) + * of a L3 or L4 page table. + * p2m_vaddr holds the virtual address of the linear p2m list. All + * entries in the range [0...max_pfn[ are accessible via this pointer. + * p2m_generation will be incremented by the guest before and after each + * change of the mappings of the p2m list. p2m_generation starts at 0 + * and a value with the least significant bit set indicates that a + * mapping update is in progress. This allows guest external software + * (e.g. in Dom0) to verify that read mappings are consistent and + * whether they have changed since the last check. + * Modifying a p2m element in the linear p2m list is allowed via an + * atomic write only. + */ + unsigned long p2m_cr3; /* cr3 value of the p2m address space */ + unsigned long p2m_vaddr; /* virtual address of the p2m list */ + unsigned long p2m_generation; /* generation count of p2m mapping */ }; #endif /* !__ASSEMBLY__ */ @@ -137,13 +197,31 @@ struct arch_shared_info { /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + * + * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise + * for HVM and PVH guests, not all information in this structure is updated: + * + * - For HVM guests, the structures read include: fpu_ctxt (if + * VGCT_I387_VALID is set), flags, user_regs, debugreg[*] + * + * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to + * set cr3. All other fields not used should be set to 0. 
*/ struct vcpu_guest_context { /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ -#define VGCF_I387_VALID (1<<0) -#define VGCF_HVM_GUEST (1<<1) -#define VGCF_IN_KERNEL (1<<2) +#define VGCF_I387_VALID (1<<0) +#define VGCF_IN_KERNEL (1<<2) +#define _VGCF_i387_valid 0 +#define VGCF_i387_valid (1<<_VGCF_i387_valid) +#define _VGCF_in_kernel 2 +#define VGCF_in_kernel (1<<_VGCF_in_kernel) +#define _VGCF_failsafe_disables_events 3 +#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events) +#define _VGCF_syscall_disables_events 4 +#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events) +#define _VGCF_online 5 +#define VGCF_online (1<<_VGCF_online) unsigned long flags; /* VGCF_* flags */ struct cpu_user_regs user_regs; /* User-level CPU registers */ struct trap_info trap_ctxt[256]; /* Virtual IDT */ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index a48378958062..8194270edcf0 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -585,26 +585,29 @@ struct shared_info { }; /* - * Start-of-day memory layout for the initial domain (DOM0): + * Start-of-day memory layout + * * 1. The domain is started within contiguous virtual-memory region. * 2. The contiguous region begins and ends on an aligned 4MB boundary. - * 3. The region start corresponds to the load address of the OS image. - * If the load address is not 4MB aligned then the address is rounded down. - * 4. This the order of bootstrap elements in the initial virtual region: + * 3. This the order of bootstrap elements in the initial virtual region: * a. relocated kernel image * b. initial ram disk [mod_start, mod_len] + * (may be omitted) * c. list of allocated page frames [mfn_list, nr_pages] + * (unless relocated due to XEN_ELFNOTE_INIT_P2M) * d. start_info_t structure [register ESI (x86)] - * e. bootstrap page tables [pt_base, CR3 (x86)] - * f. 
bootstrap stack [register ESP (x86)] - * 5. Bootstrap elements are packed together, but each is 4kB-aligned. - * 6. The initial ram disk may be omitted. - * 7. The list of page frames forms a contiguous 'pseudo-physical' memory + * in case of dom0 this page contains the console info, too + * e. unless dom0: xenstore ring page + * f. unless dom0: console ring page + * g. bootstrap page tables [pt_base, CR3 (x86)] + * h. bootstrap stack [register ESP (x86)] + * 4. Bootstrap elements are packed together, but each is 4kB-aligned. + * 5. The list of page frames forms a contiguous 'pseudo-physical' memory * layout for the domain. In particular, the bootstrap virtual-memory * region is a 1:1 mapping to the first section of the pseudo-physical map. - * 8. All bootstrap elements are mapped read-writable for the guest OS. The + * 6. All bootstrap elements are mapped read-writable for the guest OS. The * only exception is the bootstrap page table, which is mapped read-only. - * 9. There is guaranteed to be at least 512kB padding after the final + * 7. There is guaranteed to be at least 512kB padding after the final * bootstrap element. If necessary, the bootstrap virtual region is * extended by an extra 4MB to ensure this. */ @@ -641,10 +644,12 @@ struct start_info { }; /* These flags are passed in the 'flags' field of start_info_t. */ -#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ -#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ -#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ -#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ +#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ +#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ +#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ +#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ +#define SIF_VIRT_P2M_4TOOLS (1<<4) /* Do Xen tools understand a virt. 
mapped */ + /* P->M making the 3 level tree obsolete? */ #define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ /* -- cgit v1.2.3 From 2592dbbbf4c67501c2bd2dcf89c2b8924d592a9f Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Jul 2015 06:51:33 +0200 Subject: mm: provide early_memremap_ro to establish read-only mapping During early boot as Xen pv domain the kernel needs to map some page tables supplied by the hypervisor read only. This is needed to be able to relocate some data structures conflicting with the physical memory map especially on systems with huge RAM (above 512GB). Provide the function early_memremap_ro() to provide this read only mapping. Signed-off-by: Juergen Gross Acked-by: Konrad Rzeszutek Wilk Acked-by: Vlastimil Babka Signed-off-by: David Vrabel --- include/asm-generic/early_ioremap.h | 2 ++ include/asm-generic/fixmap.h | 3 +++ mm/early_ioremap.c | 12 ++++++++++++ 3 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/asm-generic/early_ioremap.h b/include/asm-generic/early_ioremap.h index a5de55c04fb2..316bd043319e 100644 --- a/include/asm-generic/early_ioremap.h +++ b/include/asm-generic/early_ioremap.h @@ -11,6 +11,8 @@ extern void __iomem *early_ioremap(resource_size_t phys_addr, unsigned long size); extern void *early_memremap(resource_size_t phys_addr, unsigned long size); +extern void *early_memremap_ro(resource_size_t phys_addr, + unsigned long size); extern void early_iounmap(void __iomem *addr, unsigned long size); extern void early_memunmap(void *addr, unsigned long size); diff --git a/include/asm-generic/fixmap.h b/include/asm-generic/fixmap.h index f23174fb9ec4..1cbb8338edf3 100644 --- a/include/asm-generic/fixmap.h +++ b/include/asm-generic/fixmap.h @@ -46,6 +46,9 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr) #ifndef FIXMAP_PAGE_NORMAL #define FIXMAP_PAGE_NORMAL PAGE_KERNEL #endif +#if !defined(FIXMAP_PAGE_RO) && defined(PAGE_KERNEL_RO) +#define FIXMAP_PAGE_RO 
PAGE_KERNEL_RO +#endif #ifndef FIXMAP_PAGE_NOCACHE #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_NOCACHE #endif diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c index e10ccd299d66..0cfadafb3fb0 100644 --- a/mm/early_ioremap.c +++ b/mm/early_ioremap.c @@ -217,6 +217,13 @@ early_memremap(resource_size_t phys_addr, unsigned long size) return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_NORMAL); } +#ifdef FIXMAP_PAGE_RO +void __init * +early_memremap_ro(resource_size_t phys_addr, unsigned long size) +{ + return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); +} +#endif #else /* CONFIG_MMU */ void __init __iomem * @@ -231,6 +238,11 @@ early_memremap(resource_size_t phys_addr, unsigned long size) { return (void *)phys_addr; } +void __init * +early_memremap_ro(resource_size_t phys_addr, unsigned long size) +{ + return (void *)phys_addr; +} void __init early_iounmap(void __iomem *addr, unsigned long size) { -- cgit v1.2.3 From a11f4f0a4e18b4bdc7d5e36438711e038b7a1f74 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 10 Aug 2015 16:34:32 -0400 Subject: xen: xensyms support Export Xen symbols to dom0 via /proc/xen/xensyms (similar to /proc/kallsyms). 
Signed-off-by: Boris Ostrovsky Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- drivers/xen/Kconfig | 8 +++ drivers/xen/xenfs/Makefile | 1 + drivers/xen/xenfs/super.c | 3 + drivers/xen/xenfs/xenfs.h | 1 + drivers/xen/xenfs/xensyms.c | 152 +++++++++++++++++++++++++++++++++++++++ include/xen/interface/platform.h | 18 +++++ 6 files changed, 183 insertions(+) create mode 100644 drivers/xen/xenfs/xensyms.c (limited to 'include') diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 7cd226da15fe..936760455dd5 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -280,4 +280,12 @@ config XEN_ACPI def_bool y depends on X86 && ACPI +config XEN_SYMS + bool "Xen symbols" + depends on X86 && XEN_DOM0 && XENFS + default y if KALLSYMS + help + Exports hypervisor symbols (along with their types and addresses) via + /proc/xen/xensyms file, similar to /proc/kallsyms + endmenu diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile index b019865fcc56..1a83010ddffa 100644 --- a/drivers/xen/xenfs/Makefile +++ b/drivers/xen/xenfs/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_XENFS) += xenfs.o xenfs-y = super.o xenfs-$(CONFIG_XEN_DOM0) += xenstored.o +xenfs-$(CONFIG_XEN_SYMS) += xensyms.o diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 06092e0fe8ce..8559a71f36b1 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -57,6 +57,9 @@ static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR}, { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR}, +#ifdef CONFIG_XEN_SYMS + { "xensyms", &xensyms_ops, S_IRUSR}, +#endif {""}, }; diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h index 6b80c7779c02..2c5934ea9b1e 100644 --- a/drivers/xen/xenfs/xenfs.h +++ b/drivers/xen/xenfs/xenfs.h @@ -3,5 +3,6 @@ extern const struct file_operations xsd_kva_file_ops; extern const struct file_operations 
xsd_port_file_ops; +extern const struct file_operations xensyms_ops; #endif /* _XENFS_XENBUS_H */ diff --git a/drivers/xen/xenfs/xensyms.c b/drivers/xen/xenfs/xensyms.c new file mode 100644 index 000000000000..f8b12856753f --- /dev/null +++ b/drivers/xen/xenfs/xensyms.c @@ -0,0 +1,152 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "xenfs.h" + + +#define XEN_KSYM_NAME_LEN 127 /* Hypervisor may have different name length */ + +struct xensyms { + struct xen_platform_op op; + char *name; + uint32_t namelen; +}; + +/* Grab next output page from the hypervisor */ +static int xensyms_next_sym(struct xensyms *xs) +{ + int ret; + struct xenpf_symdata *symdata = &xs->op.u.symdata; + uint64_t symnum; + + memset(xs->name, 0, xs->namelen); + symdata->namelen = xs->namelen; + + symnum = symdata->symnum; + + ret = HYPERVISOR_dom0_op(&xs->op); + if (ret < 0) + return ret; + + /* + * If hypervisor's symbol didn't fit into the buffer then allocate + * a larger buffer and try again. 
+ */ + if (unlikely(symdata->namelen > xs->namelen)) { + kfree(xs->name); + + xs->namelen = symdata->namelen; + xs->name = kzalloc(xs->namelen, GFP_KERNEL); + if (!xs->name) + return -ENOMEM; + + set_xen_guest_handle(symdata->name, xs->name); + symdata->symnum--; /* Rewind */ + + ret = HYPERVISOR_dom0_op(&xs->op); + if (ret < 0) + return ret; + } + + if (symdata->symnum == symnum) + /* End of symbols */ + return 1; + + return 0; +} + +static void *xensyms_start(struct seq_file *m, loff_t *pos) +{ + struct xensyms *xs = (struct xensyms *)m->private; + + xs->op.u.symdata.symnum = *pos; + + if (xensyms_next_sym(xs)) + return NULL; + + return m->private; +} + +static void *xensyms_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct xensyms *xs = (struct xensyms *)m->private; + + xs->op.u.symdata.symnum = ++(*pos); + + if (xensyms_next_sym(xs)) + return NULL; + + return p; +} + +static int xensyms_show(struct seq_file *m, void *p) +{ + struct xensyms *xs = (struct xensyms *)m->private; + struct xenpf_symdata *symdata = &xs->op.u.symdata; + + seq_printf(m, "%016llx %c %s\n", symdata->address, + symdata->type, xs->name); + + return 0; +} + +static void xensyms_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations xensyms_seq_ops = { + .start = xensyms_start, + .next = xensyms_next, + .show = xensyms_show, + .stop = xensyms_stop, +}; + +static int xensyms_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + struct xensyms *xs; + int ret; + + ret = seq_open_private(file, &xensyms_seq_ops, + sizeof(struct xensyms)); + if (ret) + return ret; + + m = file->private_data; + xs = (struct xensyms *)m->private; + + xs->namelen = XEN_KSYM_NAME_LEN + 1; + xs->name = kzalloc(xs->namelen, GFP_KERNEL); + if (!xs->name) { + seq_release_private(inode, file); + return -ENOMEM; + } + set_xen_guest_handle(xs->op.u.symdata.name, xs->name); + xs->op.cmd = XENPF_get_symbol; + xs->op.u.symdata.namelen = xs->namelen; + + return 0; +} + +static int 
xensyms_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct xensyms *xs = (struct xensyms *)m->private; + + kfree(xs->name); + return seq_release_private(inode, file); +} + +const struct file_operations xensyms_ops = { + .open = xensyms_open, + .read = seq_read, + .llseek = seq_lseek, + .release = xensyms_release +}; diff --git a/include/xen/interface/platform.h b/include/xen/interface/platform.h index 5cc49ea8d840..8e035871360e 100644 --- a/include/xen/interface/platform.h +++ b/include/xen/interface/platform.h @@ -474,6 +474,23 @@ struct xenpf_core_parking { }; DEFINE_GUEST_HANDLE_STRUCT(xenpf_core_parking); +#define XENPF_get_symbol 63 +struct xenpf_symdata { + /* IN/OUT variables */ + uint32_t namelen; /* size of 'name' buffer */ + + /* IN/OUT variables */ + uint32_t symnum; /* IN: Symbol to read */ + /* OUT: Next available symbol. If same as IN */ + /* then we reached the end */ + + /* OUT variables */ + GUEST_HANDLE(char) name; + uint64_t address; + char type; +}; +DEFINE_GUEST_HANDLE_STRUCT(xenpf_symdata); + struct xen_platform_op { uint32_t cmd; uint32_t interface_version; /* XENPF_INTERFACE_VERSION */ @@ -495,6 +512,7 @@ struct xen_platform_op { struct xenpf_cpu_hotadd cpu_add; struct xenpf_mem_hotadd mem_add; struct xenpf_core_parking core_parking; + struct xenpf_symdata symdata; uint8_t pad[128]; } u; }; -- cgit v1.2.3 From 5f141548824cebbff2e838ff401c34e667797467 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 10 Aug 2015 16:34:33 -0400 Subject: xen/PMU: Sysfs interface for setting Xen PMU mode Set Xen's PMU mode via /sys/hypervisor/pmu/pmu_mode. Add XENPMU hypercall. 
Signed-off-by: Boris Ostrovsky Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: David Vrabel --- Documentation/ABI/testing/sysfs-hypervisor-pmu | 23 +++++ arch/x86/include/asm/xen/hypercall.h | 6 ++ arch/x86/xen/Kconfig | 1 + drivers/xen/Kconfig | 3 + drivers/xen/sys-hypervisor.c | 136 ++++++++++++++++++++++++- include/xen/interface/xen.h | 1 + include/xen/interface/xenpmu.h | 59 +++++++++++ 7 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 Documentation/ABI/testing/sysfs-hypervisor-pmu create mode 100644 include/xen/interface/xenpmu.h (limited to 'include') diff --git a/Documentation/ABI/testing/sysfs-hypervisor-pmu b/Documentation/ABI/testing/sysfs-hypervisor-pmu new file mode 100644 index 000000000000..224faa105e18 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-hypervisor-pmu @@ -0,0 +1,23 @@ +What: /sys/hypervisor/pmu/pmu_mode +Date: August 2015 +KernelVersion: 4.3 +Contact: Boris Ostrovsky +Description: + Describes mode that Xen's performance-monitoring unit (PMU) + uses. Accepted values are + "off" -- PMU is disabled + "self" -- The guest can profile itself + "hv" -- The guest can profile itself and, if it is + privileged (e.g. dom0), the hypervisor + "all" -- The guest can profile itself, the hypervisor + and all other guests. Only available to + privileged guests. + +What: /sys/hypervisor/pmu/pmu_features +Date: August 2015 +KernelVersion: 4.3 +Contact: Boris Ostrovsky +Description: + Describes Xen PMU features (as an integer). A set bit indicates + that the corresponding feature is enabled. 
See + include/xen/interface/xenpmu.h for available features diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index ca08a27b90b3..83aea8055119 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -465,6 +465,12 @@ HYPERVISOR_tmem_op( return _hypercall1(int, tmem_op, op); } +static inline int +HYPERVISOR_xenpmu_op(unsigned int op, void *arg) +{ + return _hypercall2(int, xenpmu_op, op, arg); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 7bcf21b865d1..a8ffdb85656f 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -7,6 +7,7 @@ config XEN depends on PARAVIRT select PARAVIRT_CLOCK select XEN_HAVE_PVMMU + select XEN_HAVE_VPMU depends on X86_64 || (X86_32 && X86_PAE) depends on X86_TSC help diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 936760455dd5..73708acce3ca 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -288,4 +288,7 @@ config XEN_SYMS Exports hypervisor symbols (along with their types and addresses) via /proc/xen/xensyms file, similar to /proc/kallsyms +config XEN_HAVE_VPMU + bool + endmenu diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 96453f8a85c5..b5a7342e0ba5 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -20,6 +20,9 @@ #include #include #include +#ifdef CONFIG_XEN_HAVE_VPMU +#include +#endif #define HYPERVISOR_ATTR_RO(_name) \ static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) @@ -368,6 +371,126 @@ static void xen_properties_destroy(void) sysfs_remove_group(hypervisor_kobj, &xen_properties_group); } +#ifdef CONFIG_XEN_HAVE_VPMU +struct pmu_mode { + const char *name; + uint32_t mode; +}; + +static struct pmu_mode pmu_modes[] = { + {"off", XENPMU_MODE_OFF}, + {"self", XENPMU_MODE_SELF}, + {"hv", XENPMU_MODE_HV}, + {"all", XENPMU_MODE_ALL} +}; + +static ssize_t 
pmu_mode_store(struct hyp_sysfs_attr *attr, + const char *buffer, size_t len) +{ + int ret; + struct xen_pmu_params xp; + int i; + + for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) { + if (strncmp(buffer, pmu_modes[i].name, len - 1) == 0) { + xp.val = pmu_modes[i].mode; + break; + } + } + + if (i == ARRAY_SIZE(pmu_modes)) + return -EINVAL; + + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_mode_set, &xp); + if (ret) + return ret; + + return len; +} + +static ssize_t pmu_mode_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret; + struct xen_pmu_params xp; + int i; + uint32_t mode; + + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_mode_get, &xp); + if (ret) + return ret; + + mode = (uint32_t)xp.val; + for (i = 0; i < ARRAY_SIZE(pmu_modes); i++) { + if (mode == pmu_modes[i].mode) + return sprintf(buffer, "%s\n", pmu_modes[i].name); + } + + return -EINVAL; +} +HYPERVISOR_ATTR_RW(pmu_mode); + +static ssize_t pmu_features_store(struct hyp_sysfs_attr *attr, + const char *buffer, size_t len) +{ + int ret; + uint32_t features; + struct xen_pmu_params xp; + + ret = kstrtou32(buffer, 0, &features); + if (ret) + return ret; + + xp.val = features; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_feature_set, &xp); + if (ret) + return ret; + + return len; +} + +static ssize_t pmu_features_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret; + struct xen_pmu_params xp; + + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + ret = HYPERVISOR_xenpmu_op(XENPMU_feature_get, &xp); + if (ret) + return ret; + + return sprintf(buffer, "0x%x\n", (uint32_t)xp.val); +} +HYPERVISOR_ATTR_RW(pmu_features); + +static struct attribute *xen_pmu_attrs[] = { + &pmu_mode_attr.attr, + &pmu_features_attr.attr, + NULL +}; + +static const struct attribute_group xen_pmu_group = { + .name = "pmu", + 
.attrs = xen_pmu_attrs, +}; + +static int __init xen_pmu_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &xen_pmu_group); +} + +static void xen_pmu_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &xen_pmu_group); +} +#endif + static int __init hyper_sysfs_init(void) { int ret; @@ -390,7 +513,15 @@ static int __init hyper_sysfs_init(void) ret = xen_properties_init(); if (ret) goto prop_out; - +#ifdef CONFIG_XEN_HAVE_VPMU + if (xen_initial_domain()) { + ret = xen_pmu_init(); + if (ret) { + xen_properties_destroy(); + goto prop_out; + } + } +#endif goto out; prop_out: @@ -407,6 +538,9 @@ out: static void __exit hyper_sysfs_exit(void) { +#ifdef CONFIG_XEN_HAVE_VPMU + xen_pmu_destroy(); +#endif xen_properties_destroy(); xen_compilation_destroy(); xen_sysfs_uuid_destroy(); diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index 8194270edcf0..e9d4501d1f5e 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -80,6 +80,7 @@ #define __HYPERVISOR_kexec_op 37 #define __HYPERVISOR_tmem_op 38 #define __HYPERVISOR_xc_reserved_op 39 /* reserved for XenClient */ +#define __HYPERVISOR_xenpmu_op 40 /* Architecture-specific hypercall definitions. 
*/ #define __HYPERVISOR_arch_0 48 diff --git a/include/xen/interface/xenpmu.h b/include/xen/interface/xenpmu.h new file mode 100644 index 000000000000..eac1b498b89f --- /dev/null +++ b/include/xen/interface/xenpmu.h @@ -0,0 +1,59 @@ +#ifndef __XEN_PUBLIC_XENPMU_H__ +#define __XEN_PUBLIC_XENPMU_H__ + +#include "xen.h" + +#define XENPMU_VER_MAJ 0 +#define XENPMU_VER_MIN 1 + +/* + * ` enum neg_errnoval + * ` HYPERVISOR_xenpmu_op(enum xenpmu_op cmd, struct xenpmu_params *args); + * + * @cmd == XENPMU_* (PMU operation) + * @args == struct xenpmu_params + */ +/* ` enum xenpmu_op { */ +#define XENPMU_mode_get 0 /* Also used for getting PMU version */ +#define XENPMU_mode_set 1 +#define XENPMU_feature_get 2 +#define XENPMU_feature_set 3 +#define XENPMU_init 4 +#define XENPMU_finish 5 + +/* ` } */ + +/* Parameters structure for HYPERVISOR_xenpmu_op call */ +struct xen_pmu_params { + /* IN/OUT parameters */ + struct { + uint32_t maj; + uint32_t min; + } version; + uint64_t val; + + /* IN parameters */ + uint32_t vcpu; + uint32_t pad; +}; + +/* PMU modes: + * - XENPMU_MODE_OFF: No PMU virtualization + * - XENPMU_MODE_SELF: Guests can profile themselves + * - XENPMU_MODE_HV: Guests can profile themselves, dom0 profiles + * itself and Xen + * - XENPMU_MODE_ALL: Only dom0 has access to VPMU and it profiles + * everyone: itself, the hypervisor and the guests. + */ +#define XENPMU_MODE_OFF 0 +#define XENPMU_MODE_SELF (1<<0) +#define XENPMU_MODE_HV (1<<1) +#define XENPMU_MODE_ALL (1<<2) + +/* + * PMU features: + * - XENPMU_FEATURE_INTEL_BTS: Intel BTS support (ignored on AMD) + */ +#define XENPMU_FEATURE_INTEL_BTS 1 + +#endif /* __XEN_PUBLIC_XENPMU_H__ */ -- cgit v1.2.3 From 65d0cf0be79feebeb19e7626fd3ed41ae73f642d Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 10 Aug 2015 16:34:34 -0400 Subject: xen/PMU: Initialization code for Xen PMU Map shared data structure that will hold CPU registers, VPMU context, V/PCPU IDs of the CPU interrupted by PMU interrupt. 
Hypervisor fills this information in its handler and passes it to the guest for further processing. Set up PMU VIRQ. Now that perf infrastructure will assume that PMU is available on a PV guest we need to be careful and make sure that accesses via RDPMC instruction don't cause fatal traps by the hypervisor. Provide a nop RDPMC handler. For the same reason avoid issuing a warning on a write to APIC's LVTPC. Both of these will be made functional in later patches. Signed-off-by: Boris Ostrovsky Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- arch/x86/include/asm/xen/interface.h | 123 +++++++++++++++++++++++++ arch/x86/xen/Makefile | 2 +- arch/x86/xen/apic.c | 3 + arch/x86/xen/enlighten.c | 12 ++- arch/x86/xen/pmu.c | 170 +++++++++++++++++++++++++++++++++++ arch/x86/xen/pmu.h | 11 +++ arch/x86/xen/smp.c | 29 +++++- arch/x86/xen/suspend.c | 23 +++-- include/xen/interface/xen.h | 1 + include/xen/interface/xenpmu.h | 33 +++++++ 10 files changed, 398 insertions(+), 9 deletions(-) create mode 100644 arch/x86/xen/pmu.c create mode 100644 arch/x86/xen/pmu.h (limited to 'include') diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 3b88eeacdbda..62ca03ef5c65 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -250,6 +250,129 @@ struct vcpu_guest_context { #endif }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); + +/* AMD PMU registers and structures */ +struct xen_pmu_amd_ctxt { + /* + * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd). + * For PV(H) guests these fields are RO. 
+ */ + uint32_t counters; + uint32_t ctrls; + + /* Counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Intel PMU registers and structures */ +struct xen_pmu_cntr_pair { + uint64_t counter; + uint64_t control; +}; + +struct xen_pmu_intel_ctxt { + /* + * Offsets to fixed and architectural counter MSRs (relative to + * xen_pmu_arch.c.intel). + * For PV(H) guests these fields are RO. + */ + uint32_t fixed_counters; + uint32_t arch_counters; + + /* PMU registers */ + uint64_t global_ctrl; + uint64_t global_ovf_ctrl; + uint64_t global_status; + uint64_t fixed_ctrl; + uint64_t ds_area; + uint64_t pebs_enable; + uint64_t debugctl; + + /* Fixed and architectural counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Sampled domain's registers */ +struct xen_pmu_regs { + uint64_t ip; + uint64_t sp; + uint64_t flags; + uint16_t cs; + uint16_t ss; + uint8_t cpl; + uint8_t pad[3]; +}; + +/* PMU flags */ +#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */ +#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */ +#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */ +#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */ + +/* + * Architecture-specific information describing state of the processor at + * the time of PMU interrupt. + * Fields of this structure marked as RW for guest should only be written by + * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the + * hypervisor during PMU interrupt). Hypervisor will read updated data in + * XENPMU_flush hypercall and clear PMU_CACHED bit. + */ +struct xen_pmu_arch { + union { + /* + * Processor's registers at the time of interrupt. + * WO for hypervisor, RO for guests. 
+ */ + struct xen_pmu_regs regs; + /* + * Padding for adding new registers to xen_pmu_regs in + * the future + */ +#define XENPMU_REGS_PAD_SZ 64 + uint8_t pad[XENPMU_REGS_PAD_SZ]; + } r; + + /* WO for hypervisor, RO for guest */ + uint64_t pmu_flags; + + /* + * APIC LVTPC register. + * RW for both hypervisor and guest. + * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware + * during XENPMU_flush or XENPMU_lvtpc_set. + */ + union { + uint32_t lapic_lvtpc; + uint64_t pad; + } l; + + /* + * Vendor-specific PMU registers. + * RW for both hypervisor and guest (see exceptions above). + * Guest's updates to this field are verified and then loaded by the + * hypervisor into hardware during XENPMU_flush + */ + union { + struct xen_pmu_amd_ctxt amd; + struct xen_pmu_intel_ctxt intel; + + /* + * Padding for contexts (fixed parts only, does not include + * MSR banks that are specified by offsets) + */ +#define XENPMU_CTXT_PAD_SZ 128 + uint8_t pad[XENPMU_CTXT_PAD_SZ]; + } c; +}; + #endif /* !__ASSEMBLY__ */ /* diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 4b6e29ac0968..e47e52787d32 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o apic.o + p2m.o apic.o pmu.o obj-$(CONFIG_EVENT_TRACING) += trace.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 70e060ad879a..d03ebfa89b9f 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -72,6 +72,9 @@ static u32 xen_apic_read(u32 reg) static void xen_apic_write(u32 reg, u32 val) { + if (reg == APIC_LVTPC) + return; + /* Warn to see if there's any stray references */ WARN(1,"register: %x, value: %x\n", reg, val); } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 373dbc9810d1..19072f91a8e2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ 
-84,6 +84,7 @@ #include "mmu.h" #include "smp.h" #include "multicalls.h" +#include "pmu.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -1082,6 +1083,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) return ret; } +unsigned long long xen_read_pmc(int counter) +{ + return 0; +} + void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { @@ -1216,7 +1222,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .write_msr = xen_write_msr_safe, .read_tsc = native_read_tsc, - .read_pmc = native_read_pmc, + .read_pmc = xen_read_pmc, .read_tscp = native_read_tscp, @@ -1267,6 +1273,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = { static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c new file mode 100644 index 000000000000..1d1ae1b874ea --- /dev/null +++ b/arch/x86/xen/pmu.c @@ -0,0 +1,170 @@ +#include +#include + +#include +#include +#include +#include +#include + +#include "xen-ops.h" +#include "pmu.h" + +/* x86_pmu.handle_irq definition */ +#include "../kernel/cpu/perf_event.h" + + +/* Shared page between hypervisor and domain */ +static DEFINE_PER_CPU(struct xen_pmu_data *, xenpmu_shared); +#define get_xenpmu_data() per_cpu(xenpmu_shared, smp_processor_id()) + +/* perf callbacks */ +static int xen_is_in_guest(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF)) + return 0; + + return 1; +} + +static int xen_is_user_mode(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if 
(xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) + return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER); + else + return !!(xenpmu_data->pmu.r.regs.cpl & 3); +} + +static unsigned long xen_get_guest_ip(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + return xenpmu_data->pmu.r.regs.ip; +} + +static struct perf_guest_info_callbacks xen_guest_cbs = { + .is_in_guest = xen_is_in_guest, + .is_user_mode = xen_is_user_mode, + .get_guest_ip = xen_get_guest_ip, +}; + +/* Convert registers from Xen's format to Linux' */ +static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, + struct pt_regs *regs, uint64_t pmu_flags) +{ + regs->ip = xen_regs->ip; + regs->cs = xen_regs->cs; + regs->sp = xen_regs->sp; + + if (pmu_flags & PMU_SAMPLE_PV) { + if (pmu_flags & PMU_SAMPLE_USER) + regs->cs |= 3; + else + regs->cs &= ~3; + } else { + if (xen_regs->cpl) + regs->cs |= 3; + else + regs->cs &= ~3; + } +} + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) +{ + int ret = IRQ_NONE; + struct pt_regs regs; + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return ret; + } + + xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs, + xenpmu_data->pmu.pmu_flags); + if (x86_pmu.handle_irq(&regs)) + ret = IRQ_HANDLED; + + return ret; +} + +bool is_xen_pmu(int cpu) +{ + return (per_cpu(xenpmu_shared, cpu) != NULL); +} + +void xen_pmu_init(int cpu) +{ + int err; + struct xen_pmu_params xp; + unsigned long pfn; + struct xen_pmu_data *xenpmu_data; + + BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE); + + if (xen_hvm_domain()) + return; + + xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL); + if (!xenpmu_data) { + pr_err("VPMU init: No memory\n"); + return; + } + pfn = virt_to_pfn(xenpmu_data); + + xp.val = pfn_to_mfn(pfn); + xp.vcpu = cpu; + xp.version.maj =
XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp); + if (err) + goto fail; + + per_cpu(xenpmu_shared, cpu) = xenpmu_data; + + if (cpu == 0) + perf_register_guest_info_callbacks(&xen_guest_cbs); + + return; + +fail: + pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n", + cpu, err); + free_pages((unsigned long)xenpmu_data, 0); +} + +void xen_pmu_finish(int cpu) +{ + struct xen_pmu_params xp; + + if (xen_hvm_domain()) + return; + + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + + (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp); + + free_pages((unsigned long)per_cpu(xenpmu_shared, cpu), 0); + per_cpu(xenpmu_shared, cpu) = NULL; +} diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h new file mode 100644 index 000000000000..a76d2cf83581 --- /dev/null +++ b/arch/x86/xen/pmu.h @@ -0,0 +1,11 @@ +#ifndef __XEN_PMU_H +#define __XEN_PMU_H + +#include + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); +void xen_pmu_init(int cpu); +void xen_pmu_finish(int cpu); +bool is_xen_pmu(int cpu); + +#endif /* __XEN_PMU_H */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 86484384492e..2a9ff7342791 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -38,6 +39,7 @@ #include "xen-ops.h" #include "mmu.h" #include "smp.h" +#include "pmu.h" cpumask_var_t xen_cpu_initialized_map; @@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t 
xen_call_function_single_interrupt(int irq, void *dev_id); @@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu) kfree(per_cpu(xen_irq_work, cpu).name); per_cpu(xen_irq_work, cpu).name = NULL; } + + if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); + per_cpu(xen_pmu_irq, cpu).irq = -1; + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; + } }; static int xen_smp_intr_init(unsigned int cpu) { int rc; - char *resched_name, *callfunc_name, *debug_name; + char *resched_name, *callfunc_name, *debug_name, *pmu_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu) per_cpu(xen_irq_work, cpu).irq = rc; per_cpu(xen_irq_work, cpu).name = callfunc_name; + if (is_xen_pmu(cpu)) { + pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, + xen_pmu_irq_handler, + IRQF_PERCPU|IRQF_NOBALANCING, + pmu_name, NULL); + if (rc < 0) + goto fail; + per_cpu(xen_pmu_irq, cpu).irq = rc; + per_cpu(xen_pmu_irq, cpu).name = pmu_name; + } + return 0; fail: @@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); + xen_pmu_init(0); + if (xen_smp_intr_init(0)) BUG(); @@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) if (rc) return rc; + xen_pmu_init(cpu); + rc = xen_smp_intr_init(cpu); if (rc) return rc; @@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu) xen_smp_intr_free(cpu); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); + xen_pmu_finish(cpu); } } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 53b4c0811f4f..feddabdab448 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -11,6 +11,7 @@ #include "xen-ops.h" #include "mmu.h" +#include "pmu.h" static void xen_pv_pre_suspend(void) { @@ -67,16 +68,26 
@@ static void xen_pv_post_suspend(int suspend_cancelled) void xen_arch_pre_suspend(void) { - if (xen_pv_domain()) - xen_pv_pre_suspend(); + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); + + if (xen_pv_domain()) + xen_pv_pre_suspend(); } void xen_arch_post_suspend(int cancelled) { - if (xen_pv_domain()) - xen_pv_post_suspend(cancelled); - else - xen_hvm_post_suspend(cancelled); + int cpu; + + if (xen_pv_domain()) + xen_pv_post_suspend(cancelled); + else + xen_hvm_post_suspend(cancelled); + + for_each_online_cpu(cpu) + xen_pmu_init(cpu); } static void xen_vcpu_notify_restore(void *data) diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index e9d4501d1f5e..167071c290b3 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -113,6 +113,7 @@ #define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */ #define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */ #define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */ +#define VIRQ_XENPMU 13 /* PMC interrupt */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16 diff --git a/include/xen/interface/xenpmu.h b/include/xen/interface/xenpmu.h index eac1b498b89f..ca42301949b5 100644 --- a/include/xen/interface/xenpmu.h +++ b/include/xen/interface/xenpmu.h @@ -56,4 +56,37 @@ struct xen_pmu_params { */ #define XENPMU_FEATURE_INTEL_BTS 1 +/* + * Shared PMU data between hypervisor and PV(H) domains. + * + * The hypervisor fills out this structure during PMU interrupt and sends an + * interrupt to appropriate VCPU. + * Architecture-independent fields of xen_pmu_data are WO for the hypervisor + * and RO for the guest but some fields in xen_pmu_arch can be writable + * by both the hypervisor and the guest (see arch-$arch/pmu.h). + */ +struct xen_pmu_data { + /* Interrupted VCPU */ + uint32_t vcpu_id; + + /* + * Physical processor on which the interrupt occurred. 
On non-privileged + * guests set to vcpu_id; + */ + uint32_t pcpu_id; + + /* + * Domain that was interrupted. On non-privileged guests set to + * DOMID_SELF. + * On privileged guests can be DOMID_SELF, DOMID_XEN, or, when in + * XENPMU_MODE_ALL mode, domain ID of another domain. + */ + domid_t domain_id; + + uint8_t pad[6]; + + /* Architecture-specific information */ + struct xen_pmu_arch pmu; +}; + #endif /* __XEN_PUBLIC_XENPMU_H__ */ -- cgit v1.2.3 From 6b08cd6328c58a2ae190c5ee03a2ffcab5ef828e Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 10 Aug 2015 16:34:36 -0400 Subject: xen/PMU: Intercept PMU-related MSR and APIC accesses Provide interfaces for recognizing accesses to PMU-related MSRs and LVTPC APIC and process these accesses in Xen PMU code. (The interrupt handler performs XENPMU_flush right away in the beginning since no PMU emulation is available. It will be added with a later patch). Signed-off-by: Boris Ostrovsky Reviewed-by: David Vrabel Signed-off-by: David Vrabel --- arch/x86/xen/apic.c | 5 ++- arch/x86/xen/enlighten.c | 11 +++-- arch/x86/xen/pmu.c | 95 +++++++++++++++++++++++++++++++++++++++++- arch/x86/xen/pmu.h | 4 ++ include/xen/interface/xenpmu.h | 2 + 5 files changed, 109 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index d03ebfa89b9f..acda713ab5be 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -7,6 +7,7 @@ #include #include #include "xen-ops.h" +#include "pmu.h" #include "smp.h" static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) @@ -72,8 +73,10 @@ static u32 xen_apic_read(u32 reg) static void xen_apic_write(u32 reg, u32 val) { - if (reg == APIC_LVTPC) + if (reg == APIC_LVTPC) { + (void)pmu_apic_update(reg); return; + } /* Warn to see if there's any stray references */ WARN(1,"register: %x, value: %x\n", reg, val); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 19072f91a8e2..fdaba49f6759 100644 --- 
a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1031,6 +1031,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) { u64 val; + if (pmu_msr_read(msr, &val, err)) + return val; + val = native_read_msr_safe(msr, err); switch (msr) { case MSR_IA32_APICBASE: @@ -1077,17 +1080,13 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) Xen console noise. */ default: - ret = native_write_msr_safe(msr, low, high); + if (!pmu_msr_write(msr, low, high, &ret)) + ret = native_write_msr_safe(msr, low, high); } return ret; } -unsigned long long xen_read_pmc(int counter) -{ - return 0; -} - void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c index a4a6e4f04f37..f92b908e005f 100644 --- a/arch/x86/xen/pmu.c +++ b/arch/x86/xen/pmu.c @@ -51,6 +51,8 @@ static __read_mostly int amd_num_counters; /* Alias registers (0x4c1) for full-width writes to PMCs */ #define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0)) +#define INTEL_PMC_TYPE_SHIFT 30 + static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters; @@ -167,6 +169,91 @@ static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) } } +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) +{ + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + *val = native_read_msr_safe(msr, err); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + *val = native_read_msr_safe(msr, err); + return true; + } + } + + return false; +} + +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + *err = native_write_msr_safe(msr, low, high); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + *err = native_write_msr_safe(msr, low, high); + return true; + } + } 
+ + return false; +} + +static unsigned long long xen_amd_read_pmc(int counter) +{ + uint32_t msr; + int err; + + msr = amd_counters_base + (counter * amd_msr_step); + return native_read_msr_safe(msr, &err); +} + +static unsigned long long xen_intel_read_pmc(int counter) +{ + int err; + uint32_t msr; + + if (counter & (1<<INTEL_PMC_TYPE_SHIFT)) + msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff); + else + msr = MSR_IA32_PERFCTR0 + counter; + + return native_read_msr_safe(msr, &err); +} + +unsigned long long xen_read_pmc(int counter) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return xen_amd_read_pmc(counter); + else + return xen_intel_read_pmc(counter); +} + +int pmu_apic_update(uint32_t val) +{ + int ret; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return -EINVAL; + } + + xenpmu_data->pmu.l.lapic_lvtpc = val; + ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL); + + return ret; +} + /* perf callbacks */ static int xen_is_in_guest(void) { @@ -239,7 +326,7 @@ static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) { - int ret = IRQ_NONE; + int err, ret = IRQ_NONE; struct pt_regs regs; const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); @@ -248,6 +335,12 @@ irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) return ret; } + err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL); + if (err) { + pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err); + return ret; + } + xen_convert_regs(&xenpmu_data->pmu.r.regs, &regs, xenpmu_data->pmu.pmu_flags); if (x86_pmu.handle_irq(&regs)) diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h index a76d2cf83581..af5f0ad94078 100644 --- a/arch/x86/xen/pmu.h +++ b/arch/x86/xen/pmu.h @@ -7,5 +7,9 @@ irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); void xen_pmu_init(int cpu); void xen_pmu_finish(int cpu); bool is_xen_pmu(int cpu); +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); +int pmu_apic_update(uint32_t reg); +unsigned long long xen_read_pmc(int counter); #endif /* __XEN_PMU_H */ diff --git a/include/xen/interface/xenpmu.h b/include/xen/interface/xenpmu.h index ca42301949b5..139efc91bceb 100644 --- a/include/xen/interface/xenpmu.h +++ b/include/xen/interface/xenpmu.h @@ -20,6 +20,8 @@ #define XENPMU_feature_set 3 #define XENPMU_init 4 #define XENPMU_finish 5 +#define XENPMU_lvtpc_set 6 +#define XENPMU_flush 7 /* ` } */ -- cgit
v1.2.3 From 626d7508664c4bc8e67f496da4387ecd0c410b8c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 4 Sep 2015 14:05:51 +0200 Subject: xen: switch extra memory accounting to use pfns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using physical addresses for accounting of extra memory areas available for ballooning switch to pfns as this is much less error prone regarding partial pages. Reported-by: Roger Pau Monné Tested-by: Roger Pau Monné Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 80 ++++++++++++++++++++++++++++----------------------- drivers/xen/balloon.c | 6 ++-- include/xen/page.h | 4 +-- 3 files changed, 49 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 70de4c8b8f27..f5ef6746d47a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -90,62 +90,69 @@ static void __init xen_parse_512gb(void) xen_512gb_limit = val; } -static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size) +static void __init xen_add_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; + /* + * No need to check for zero size, should happen rarely and will only + * write a new entry regarded to be unused due to zero size. + */ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { /* Add new region. */ - if (xen_extra_mem[i].size == 0) { - xen_extra_mem[i].start = start; - xen_extra_mem[i].size = size; + if (xen_extra_mem[i].n_pfns == 0) { + xen_extra_mem[i].start_pfn = start_pfn; + xen_extra_mem[i].n_pfns = n_pfns; break; } /* Append to existing region.
*/ - if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { - xen_extra_mem[i].size += size; + if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == + start_pfn) { + xen_extra_mem[i].n_pfns += n_pfns; break; } } if (i == XEN_EXTRA_MEM_MAX_REGIONS) printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - memblock_reserve(start, size); + memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } -static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) +static void __init xen_del_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; - phys_addr_t start_r, size_r; + unsigned long start_r, size_r; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - start_r = xen_extra_mem[i].start; - size_r = xen_extra_mem[i].size; + start_r = xen_extra_mem[i].start_pfn; + size_r = xen_extra_mem[i].n_pfns; /* Start of region. */ - if (start_r == start) { - BUG_ON(size > size_r); - xen_extra_mem[i].start += size; - xen_extra_mem[i].size -= size; + if (start_r == start_pfn) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].start_pfn += n_pfns; + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* End of region. */ - if (start_r + size_r == start + size) { - BUG_ON(size > size_r); - xen_extra_mem[i].size -= size; + if (start_r + size_r == start_pfn + n_pfns) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* Mid of region. */ - if (start > start_r && start < start_r + size_r) { - BUG_ON(start + size > start_r + size_r); - xen_extra_mem[i].size = start - start_r; + if (start_pfn > start_r && start_pfn < start_r + size_r) { + BUG_ON(start_pfn + n_pfns > start_r + size_r); + xen_extra_mem[i].n_pfns = start_pfn - start_r; /* Calling memblock_reserve() again is okay. 
*/ - xen_add_extra_mem(start + size, start_r + size_r - - (start + size)); + xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r - + (start_pfn + n_pfns)); break; } } - memblock_free(start, size); + memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } /* @@ -156,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) unsigned long __ref xen_chk_extra_mem(unsigned long pfn) { int i; - phys_addr_t addr = PFN_PHYS(pfn); for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (addr >= xen_extra_mem[i].start && - addr < xen_extra_mem[i].start + xen_extra_mem[i].size) + if (pfn >= xen_extra_mem[i].start_pfn && + pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns) return INVALID_P2M_ENTRY; } @@ -176,10 +182,10 @@ void __init xen_inv_extra_mem(void) int i; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (!xen_extra_mem[i].size) + if (!xen_extra_mem[i].n_pfns) continue; - pfn_s = PFN_DOWN(xen_extra_mem[i].start); - pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); + pfn_s = xen_extra_mem[i].start_pfn; + pfn_e = pfn_s + xen_extra_mem[i].n_pfns; for (pfn = pfn_s; pfn < pfn_e; pfn++) set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } @@ -507,7 +513,7 @@ void __init xen_remap_memory(void) } else if (pfn_s + len == xen_remap_buf.target_pfn) { len += xen_remap_buf.size; } else { - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); pfn_s = xen_remap_buf.target_pfn; len = xen_remap_buf.size; } @@ -517,7 +523,7 @@ void __init xen_remap_memory(void) } if (pfn_s != ~0UL && len) - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); set_pte_mfn(buf, mfn_save, PAGE_KERNEL); @@ -744,7 +750,7 @@ static void __init xen_reserve_xen_mfnlist(void) **/ char * __init xen_memory_setup(void) { - unsigned long max_pfn; + unsigned long max_pfn, pfn_s, n_pfns; phys_addr_t mem_end, addr, size, chunk_size; u32 type; int rc; @@ -831,9 +837,11 @@ char * __init 
xen_memory_setup(void) chunk_size = min(size, mem_end - addr); } else if (extra_pages) { chunk_size = min(size, PFN_PHYS(extra_pages)); - extra_pages -= PFN_DOWN(chunk_size); - xen_add_extra_mem(addr, chunk_size); - xen_max_p2m_pfn = PFN_DOWN(addr + chunk_size); + pfn_s = PFN_UP(addr); + n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s; + extra_pages -= n_pfns; + xen_add_extra_mem(pfn_s, n_pfns); + xen_max_p2m_pfn = pfn_s + n_pfns; } else type = E820_UNUSABLE; } diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index bf4a23c7c591..1fa633b2d556 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -638,9 +638,9 @@ static int __init balloon_init(void) * regions (see arch/x86/xen/setup.c). */ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) - if (xen_extra_mem[i].size) - balloon_add_region(PFN_UP(xen_extra_mem[i].start), - PFN_DOWN(xen_extra_mem[i].size)); + if (xen_extra_mem[i].n_pfns) + balloon_add_region(xen_extra_mem[i].start_pfn, + xen_extra_mem[i].n_pfns); return 0; } diff --git a/include/xen/page.h b/include/xen/page.h index c5ed20bb3fe9..a5983da2f5cd 100644 --- a/include/xen/page.h +++ b/include/xen/page.h @@ -9,8 +9,8 @@ static inline unsigned long page_to_mfn(struct page *page) } struct xen_memory_region { - phys_addr_t start; - phys_addr_t size; + unsigned long start_pfn; + unsigned long n_pfns; }; #define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820MAX */ -- cgit v1.2.3