Diffstat (limited to 'arch')
79 files changed, 1994 insertions(+), 704 deletions(-)
diff --git a/arch/arm/boot/dts/am4372.dtsi b/arch/arm/boot/dts/am4372.dtsi index 564900b9fcce..0447c04a40cc 100644 --- a/arch/arm/boot/dts/am4372.dtsi +++ b/arch/arm/boot/dts/am4372.dtsi @@ -358,6 +358,8 @@ interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>; ti,hwmods = "rtc"; + clocks = <&clk_32768_ck>; + clock-names = "int-clk"; status = "disabled"; }; diff --git a/arch/arm/boot/dts/am437x-gp-evm.dts b/arch/arm/boot/dts/am437x-gp-evm.dts index 215775dc6948..22038f21f228 100644 --- a/arch/arm/boot/dts/am437x-gp-evm.dts +++ b/arch/arm/boot/dts/am437x-gp-evm.dts @@ -112,6 +112,13 @@ clock-frequency = <12000000>; }; + /* fixed 32k external oscillator clock */ + clk_32k_rtc: clk_32k_rtc { + #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <32768>; + }; + sound0: sound@0 { compatible = "simple-audio-card"; simple-audio-card,name = "AM437x-GP-EVM"; @@ -941,3 +948,9 @@ tx-num-evt = <32>; rx-num-evt = <32>; }; + +&rtc { + clocks = <&clk_32k_rtc>, <&clk_32768_ck>; + clock-names = "ext-clk", "int-clk"; + status = "okay"; +}; diff --git a/arch/arm/boot/dts/am437x-idk-evm.dts b/arch/arm/boot/dts/am437x-idk-evm.dts index 378344271746..af25801418b4 100644 --- a/arch/arm/boot/dts/am437x-idk-evm.dts +++ b/arch/arm/boot/dts/am437x-idk-evm.dts @@ -110,6 +110,13 @@ gpios = <&gpio4 2 GPIO_ACTIVE_LOW>; }; }; + + /* fixed 32k external oscillator clock */ + clk_32k_rtc: clk_32k_rtc { + #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <32768>; + }; }; &am43xx_pinmux { @@ -394,6 +401,8 @@ }; &rtc { + clocks = <&clk_32k_rtc>, <&clk_32768_ck>; + clock-names = "ext-clk", "int-clk"; status = "okay"; }; diff --git a/arch/arm/boot/dts/am437x-sk-evm.dts b/arch/arm/boot/dts/am437x-sk-evm.dts index 22af44894c66..7da7c2da4af1 100644 --- a/arch/arm/boot/dts/am437x-sk-evm.dts +++ b/arch/arm/boot/dts/am437x-sk-evm.dts @@ -24,6 +24,13 @@ display0 = &lcd0; }; + /* fixed 32k external oscillator clock */ + clk_32k_rtc: clk_32k_rtc { + #clock-cells = <0>; + compatible = "fixed-clock"; + clock-frequency = <32768>; + }; + backlight { compatible = "pwm-backlight"; pwms = <&ecap0 0 50000 PWM_POLARITY_INVERTED>; @@ -697,6 +704,8 @@ }; &rtc { + clocks = <&clk_32k_rtc>, <&clk_32768_ck>; + clock-names = "ext-clk", "int-clk"; status = "okay"; }; diff --git a/arch/arm/configs/cm_x2xx_defconfig b/arch/arm/configs/cm_x2xx_defconfig index dc01c049a520..3b32d5fd9326 100644 --- a/arch/arm/configs/cm_x2xx_defconfig +++ b/arch/arm/configs/cm_x2xx_defconfig @@ -157,7 +157,7 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_V3020=y -CONFIG_RTC_DRV_SA1100=y +CONFIG_RTC_DRV_PXA=y CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y CONFIG_INOTIFY=y diff --git a/arch/arm/configs/em_x270_defconfig b/arch/arm/configs/em_x270_defconfig index 4560c9ca6636..8e10df7ba1b4 100644 --- a/arch/arm/configs/em_x270_defconfig +++ b/arch/arm/configs/em_x270_defconfig @@ -157,7 +157,7 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_V3020=y -CONFIG_RTC_DRV_SA1100=y +CONFIG_RTC_DRV_PXA=y CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y CONFIG_INOTIFY=y diff --git a/arch/arm/configs/magician_defconfig b/arch/arm/configs/magician_defconfig index 557dd291288b..a5b4920cd6d4 100644 --- a/arch/arm/configs/magician_defconfig +++ b/arch/arm/configs/magician_defconfig @@ -150,7 +150,7 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_BACKLIGHT=y CONFIG_RTC_CLASS=y CONFIG_RTC_DEBUG=y -CONFIG_RTC_DRV_SA1100=y +CONFIG_RTC_DRV_PXA=y CONFIG_EXT2_FS=y CONFIG_INOTIFY=y 
CONFIG_MSDOS_FS=m diff --git a/arch/arm/configs/palmz72_defconfig b/arch/arm/configs/palmz72_defconfig index 4baa83c1c577..83c135e19aba 100644 --- a/arch/arm/configs/palmz72_defconfig +++ b/arch/arm/configs/palmz72_defconfig @@ -67,7 +67,7 @@ CONFIG_MMC=y CONFIG_MMC_DEBUG=y CONFIG_MMC_PXA=y CONFIG_RTC_CLASS=y -CONFIG_RTC_DRV_SA1100=y +CONFIG_RTC_DRV_PXA=y CONFIG_EXT2_FS=y CONFIG_EXT3_FS=y # CONFIG_DNOTIFY is not set diff --git a/arch/arm/configs/pcm027_defconfig b/arch/arm/configs/pcm027_defconfig index 0a847d04ddc1..b5624e325817 100644 --- a/arch/arm/configs/pcm027_defconfig +++ b/arch/arm/configs/pcm027_defconfig @@ -82,7 +82,7 @@ CONFIG_MMC=y CONFIG_MMC_PXA=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_PCF8563=m -CONFIG_RTC_DRV_SA1100=m +CONFIG_RTC_DRV_PXA=m CONFIG_EXT2_FS=m CONFIG_EXT3_FS=m # CONFIG_DNOTIFY is not set diff --git a/arch/arm/configs/trizeps4_defconfig b/arch/arm/configs/trizeps4_defconfig index 932ee4e4a13a..4bc870028035 100644 --- a/arch/arm/configs/trizeps4_defconfig +++ b/arch/arm/configs/trizeps4_defconfig @@ -177,7 +177,7 @@ CONFIG_NEW_LEDS=y CONFIG_RTC_CLASS=y # CONFIG_RTC_HCTOSYS is not set CONFIG_RTC_DRV_PCF8583=m -CONFIG_RTC_DRV_SA1100=y +CONFIG_RTC_DRV_PXA=y CONFIG_EXT2_FS=y CONFIG_EXT2_FS_XATTR=y CONFIG_EXT2_FS_POSIX_ACL=y diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h index 53c15dec7af6..be1d07d59ee9 100644 --- a/arch/arm/include/asm/irq.h +++ b/arch/arm/include/asm/irq.h @@ -35,6 +35,11 @@ extern void (*handle_arch_irq)(struct pt_regs *); extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); #endif +#ifdef CONFIG_SMP +extern void arch_trigger_all_cpu_backtrace(bool); +#define arch_trigger_all_cpu_backtrace(x) arch_trigger_all_cpu_backtrace(x) +#endif + #endif #endif diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index b7f6fb462ea0..98d58bb04ac5 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -119,12 +119,6 @@ #endif /* - * Convert a physical address to a Page Frame Number and back - */ -#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT)) -#define __pfn_to_phys(pfn) ((phys_addr_t)(pfn) << PAGE_SHIFT) - -/* * Convert a page to/from a physical address */ #define page_to_phys(page) (__pfn_to_phys(page_to_pfn(page))) diff --git a/arch/arm/include/asm/xen/events.h b/arch/arm/include/asm/xen/events.h index 8b1f37bfeeec..71e473d05fcc 100644 --- a/arch/arm/include/asm/xen/events.h +++ b/arch/arm/include/asm/xen/events.h @@ -20,4 +20,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) atomic64_t, \ counter), (val)) +/* Rebind event channel is supported by default */ +static inline bool xen_support_evtchn_rebind(void) +{ + return true; +} + #endif /* _ASM_ARM_XEN_EVENTS_H */ diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index 1bee8ca12494..98b1084f8282 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -54,26 +54,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn) #define mfn_to_local_pfn(mfn) mfn_to_pfn(mfn) -static inline xmaddr_t phys_to_machine(xpaddr_t phys) -{ - unsigned offset = phys.paddr & ~PAGE_MASK; - return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); -} - -static inline xpaddr_t machine_to_phys(xmaddr_t machine) -{ - unsigned offset = machine.maddr & ~PAGE_MASK; - return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); -} /* VIRT <-> MACHINE conversion */ -#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) #define 
virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) +/* Only used in PV code. But ARM guests are always HVM. */ static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr) { - /* TODO: assuming it is mapped in the kernel 1:1 */ - return virt_to_machine(vaddr); + BUG(); } /* TODO: this shouldn't be here but it is because the frontend drivers diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index ba0063c539c3..48185a773852 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -21,6 +21,7 @@ #include <linux/cpu.h> #include <linux/seq_file.h> #include <linux/irq.h> +#include <linux/nmi.h> #include <linux/percpu.h> #include <linux/clockchips.h> #include <linux/completion.h> @@ -72,6 +73,7 @@ enum ipi_msg_type { IPI_CPU_STOP, IPI_IRQ_WORK, IPI_COMPLETION, + IPI_CPU_BACKTRACE = 15, }; static DECLARE_COMPLETION(cpu_running); @@ -643,6 +645,12 @@ void handle_IPI(int ipinr, struct pt_regs *regs) irq_exit(); break; + case IPI_CPU_BACKTRACE: + irq_enter(); + nmi_cpu_backtrace(regs); + irq_exit(); + break; + default: pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr); @@ -737,3 +745,13 @@ static int __init register_cpufreq_notifier(void) core_initcall(register_cpufreq_notifier); #endif + +static void raise_nmi(cpumask_t *mask) +{ + smp_cross_call(mask, IPI_CPU_BACKTRACE); +} + +void arch_trigger_all_cpu_backtrace(bool include_self) +{ + nmi_trigger_all_cpu_backtrace(include_self, raise_nmi); +} diff --git a/arch/arm/mach-clps711x/board-cdb89712.c b/arch/arm/mach-clps711x/board-cdb89712.c index 1ec378c334e5..972abdb10028 100644 --- a/arch/arm/mach-clps711x/board-cdb89712.c +++ b/arch/arm/mach-clps711x/board-cdb89712.c @@ -95,7 +95,7 @@ static struct physmap_flash_data cdb89712_bootrom_pdata __initdata = { static struct resource cdb89712_bootrom_resources[] __initdata = { DEFINE_RES_NAMED(CS7_PHYS_BASE, SZ_128, "BOOTROM", IORESOURCE_MEM | - IORESOURCE_CACHEABLE | IORESOURCE_READONLY), + IORESOURCE_READONLY), }; static struct platform_device cdb89712_bootrom_pdev __initdata = { diff --git a/arch/arm/mach-mmp/include/mach/regs-rtc.h b/arch/arm/mach-mmp/include/mach/regs-rtc.h deleted file mode 100644 index 5bff886a3941..000000000000 --- a/arch/arm/mach-mmp/include/mach/regs-rtc.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __ASM_MACH_REGS_RTC_H -#define __ASM_MACH_REGS_RTC_H - -#include <mach/addr-map.h> - -#define RTC_VIRT_BASE (APB_VIRT_BASE + 0x10000) -#define RTC_REG(x) (*((volatile u32 __iomem *)(RTC_VIRT_BASE + (x)))) - -/* - * Real Time Clock - */ - -#define RCNR RTC_REG(0x00) /* RTC Count Register */ -#define RTAR RTC_REG(0x04) /* RTC Alarm Register */ -#define RTSR RTC_REG(0x08) /* RTC Status Register */ -#define RTTR RTC_REG(0x0C) /* RTC Timer Trim Register */ - -#define RTSR_HZE (1 << 3) /* HZ interrupt enable */ -#define RTSR_ALE (1 << 2) /* RTC alarm interrupt enable */ -#define RTSR_HZ (1 << 1) /* HZ rising-edge detected */ -#define RTSR_AL (1 << 0) /* RTC alarm detected */ - -#endif /* __ASM_MACH_REGS_RTC_H */ diff --git a/arch/arm/mach-pxa/devices.c b/arch/arm/mach-pxa/devices.c index e6ce669b54af..c62473235a13 100644 --- a/arch/arm/mach-pxa/devices.c +++ b/arch/arm/mach-pxa/devices.c @@ -440,25 +440,11 @@ struct platform_device pxa_device_rtc = { .resource = pxa_rtc_resources, }; -static struct resource sa1100_rtc_resources[] = { - { - .start = IRQ_RTC1Hz, - .end = IRQ_RTC1Hz, - .name = "rtc 1Hz", - .flags = IORESOURCE_IRQ, - }, { - .start = IRQ_RTCAlrm, - .end = IRQ_RTCAlrm, - .name = "rtc alarm", - .flags = 
IORESOURCE_IRQ, - }, -}; - struct platform_device sa1100_device_rtc = { .name = "sa1100-rtc", .id = -1, - .num_resources = ARRAY_SIZE(sa1100_rtc_resources), - .resource = sa1100_rtc_resources, + .num_resources = ARRAY_SIZE(pxa_rtc_resources), + .resource = pxa_rtc_resources, }; static struct resource pxa_ac97_resources[] = { diff --git a/arch/arm/mach-pxa/pxa27x.c b/arch/arm/mach-pxa/pxa27x.c index e6aae9e8adfb..221260d5d109 100644 --- a/arch/arm/mach-pxa/pxa27x.c +++ b/arch/arm/mach-pxa/pxa27x.c @@ -282,7 +282,6 @@ static struct platform_device *devices[] __initdata = { &pxa_device_asoc_ssp2, &pxa_device_asoc_ssp3, &pxa_device_asoc_platform, - &sa1100_device_rtc, &pxa_device_rtc, &pxa27x_device_ssp1, &pxa27x_device_ssp2, diff --git a/arch/arm/mach-pxa/pxa3xx.c b/arch/arm/mach-pxa/pxa3xx.c index 165638462a2f..ce0f8d6242e2 100644 --- a/arch/arm/mach-pxa/pxa3xx.c +++ b/arch/arm/mach-pxa/pxa3xx.c @@ -394,7 +394,6 @@ static struct platform_device *devices[] __initdata = { &pxa_device_asoc_ssp3, &pxa_device_asoc_ssp4, &pxa_device_asoc_platform, - &sa1100_device_rtc, &pxa_device_rtc, &pxa3xx_device_ssp1, &pxa3xx_device_ssp2, diff --git a/arch/arm/mach-sa1100/include/mach/SA-1100.h b/arch/arm/mach-sa1100/include/mach/SA-1100.h index 0ac6cc08a19c..7972617cca64 100644 --- a/arch/arm/mach-sa1100/include/mach/SA-1100.h +++ b/arch/arm/mach-sa1100/include/mach/SA-1100.h @@ -858,40 +858,6 @@ /* - * Real-Time Clock (RTC) control registers - * - * Registers - * RTAR Real-Time Clock (RTC) Alarm Register (read/write). - * RCNR Real-Time Clock (RTC) CouNt Register (read/write). - * RTTR Real-Time Clock (RTC) Trim Register (read/write). - * RTSR Real-Time Clock (RTC) Status Register (read/write). - * - * Clocks - * frtx, Trtx Frequency, period of the real-time clock crystal - * (32.768 kHz nominal). - * frtc, Trtc Frequency, period of the real-time clock counter - * (1 Hz nominal). - */ - -#define RTAR __REG(0x90010000) /* RTC Alarm Reg. */ -#define RCNR __REG(0x90010004) /* RTC CouNt Reg. */ -#define RTTR __REG(0x90010008) /* RTC Trim Reg. */ -#define RTSR __REG(0x90010010) /* RTC Status Reg. 
*/ - -#define RTTR_C Fld (16, 0) /* clock divider Count - 1 */ -#define RTTR_D Fld (10, 16) /* trim Delete count */ - /* frtc = (1023*(C + 1) - D)*frtx/ */ - /* (1023*(C + 1)^2) */ - /* Trtc = (1023*(C + 1)^2)*Trtx/ */ - /* (1023*(C + 1) - D) */ - -#define RTSR_AL 0x00000001 /* ALarm detected */ -#define RTSR_HZ 0x00000002 /* 1 Hz clock detected */ -#define RTSR_ALE 0x00000004 /* ALarm interrupt Enable */ -#define RTSR_HZE 0x00000008 /* 1 Hz clock interrupt Enable */ - - -/* * Power Manager (PM) control registers * * Registers diff --git a/arch/arm/mach-shmobile/pm-rcar.c b/arch/arm/mach-shmobile/pm-rcar.c index 4092ad16e0a4..0af05d288b09 100644 --- a/arch/arm/mach-shmobile/pm-rcar.c +++ b/arch/arm/mach-shmobile/pm-rcar.c @@ -12,7 +12,7 @@ #include <linux/err.h> #include <linux/mm.h> #include <linux/spinlock.h> -#include <asm/io.h> +#include <linux/io.h> #include "pm-rcar.h" /* SYSC Common */ diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 6c09cc440a2b..c50c8d33f874 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -45,13 +45,6 @@ static struct vcpu_info __percpu *xen_vcpu_info; unsigned long xen_released_pages; struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; -/* TODO: to be removed */ -__read_mostly int xen_have_vector_callback; -EXPORT_SYMBOL_GPL(xen_have_vector_callback); - -int xen_platform_pci_unplug = XEN_UNPLUG_ALL; -EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); - static __read_mostly unsigned int xen_events_irq; static __initdata struct device_node *xen_node; diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 44a59c20e773..6b4c3ad75a2a 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -81,12 +81,6 @@ #define __phys_to_virt(x) ((unsigned long)((x) - PHYS_OFFSET + PAGE_OFFSET)) /* - * Convert a physical address to a Page Frame Number and back - */ -#define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT)) -#define __pfn_to_phys(pfn) ((phys_addr_t)(pfn) << PAGE_SHIFT) - -/* * Convert a page to/from a physical address */ #define page_to_phys(page) (__pfn_to_phys(page_to_pfn(page))) diff --git a/arch/arm64/include/asm/xen/events.h b/arch/arm64/include/asm/xen/events.h index 86553213c132..4318866d053c 100644 --- a/arch/arm64/include/asm/xen/events.h +++ b/arch/arm64/include/asm/xen/events.h @@ -18,4 +18,10 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +/* Rebind event channel is supported by default */ +static inline bool xen_support_evtchn_rebind(void) +{ + return true; +} + #endif /* _ASM_ARM64_XEN_EVENTS_H */ diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index 80a7e34be009..9041bbe2b7b4 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -435,6 +435,7 @@ static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned lo { return ioremap(phys_addr, size); } +#define ioremap_cache ioremap_cache /* diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c index 4826ff957a3d..5fa3848ba224 100644 --- a/arch/ia64/kernel/cyclone.c +++ b/arch/ia64/kernel/cyclone.c @@ -4,7 +4,7 @@ #include <linux/errno.h> #include <linux/timex.h> #include <linux/clocksource.h> -#include <asm/io.h> +#include <linux/io.h> /* IBM Summit (EXA) Cyclone counter code*/ #define CYCLONE_CBAR_ADDR 0xFEB00CD0 diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 97e48b0eefc7..1841ef69183d 100644 --- a/arch/ia64/mm/init.c +++ 
b/arch/ia64/mm/init.c @@ -645,7 +645,7 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { pg_data_t *pgdat; struct zone *zone; @@ -656,7 +656,7 @@ int arch_add_memory(int nid, u64 start, u64 size) pgdat = NODE_DATA(nid); zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL); + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); ret = __add_pages(nid, zone, start_pfn, nr_pages); if (ret) diff --git a/arch/m68k/coldfire/m54xx.c b/arch/m68k/coldfire/m54xx.c index 075aaabd1360..f7836c6a6b60 100644 --- a/arch/m68k/coldfire/m54xx.c +++ b/arch/m68k/coldfire/m54xx.c @@ -25,6 +25,7 @@ #include <asm/m54xxgpt.h> #ifdef CONFIG_MMU #include <asm/mmu_context.h> +#include <linux/pfn.h> #endif /***************************************************************************/ @@ -91,13 +92,13 @@ static void __init mcf54xx_bootmem_alloc(void) m68k_memory[0].size = _ramend - _rambase; /* compute total pages in system */ - num_pages = (_ramend - _rambase) >> PAGE_SHIFT; + num_pages = PFN_DOWN(_ramend - _rambase); /* page numbers */ memstart = PAGE_ALIGN(_ramstart); - min_low_pfn = _rambase >> PAGE_SHIFT; - start_pfn = memstart >> PAGE_SHIFT; - max_low_pfn = _ramend >> PAGE_SHIFT; + min_low_pfn = PFN_DOWN(_rambase); + start_pfn = PFN_DOWN(memstart); + max_low_pfn = PFN_DOWN(_ramend); high_memory = (void *)_ramend; m68k_virt_to_node_shift = fls(_ramend - _rambase - 1) - 6; diff --git a/arch/m68k/coldfire/pit.c b/arch/m68k/coldfire/pit.c index 493b3111d4c1..d86a9ffb3f13 100644 --- a/arch/m68k/coldfire/pit.c +++ b/arch/m68k/coldfire/pit.c @@ -42,37 +42,28 @@ static u32 pit_cnt; * This is also called after resume to bring the PIT into operation again. 
*/ -static void init_cf_pit_timer(enum clock_event_mode mode, - struct clock_event_device *evt) +static int cf_pit_set_periodic(struct clock_event_device *evt) { - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - - __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR)); - __raw_writew(PIT_CYCLES_PER_JIFFY, TA(MCFPIT_PMR)); - __raw_writew(MCFPIT_PCSR_EN | MCFPIT_PCSR_PIE | \ - MCFPIT_PCSR_OVW | MCFPIT_PCSR_RLD | \ - MCFPIT_PCSR_CLK64, TA(MCFPIT_PCSR)); - break; - - case CLOCK_EVT_MODE_SHUTDOWN: - case CLOCK_EVT_MODE_UNUSED: - - __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR)); - break; - - case CLOCK_EVT_MODE_ONESHOT: - - __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR)); - __raw_writew(MCFPIT_PCSR_EN | MCFPIT_PCSR_PIE | \ - MCFPIT_PCSR_OVW | MCFPIT_PCSR_CLK64, \ - TA(MCFPIT_PCSR)); - break; - - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here */ - break; - } + __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR)); + __raw_writew(PIT_CYCLES_PER_JIFFY, TA(MCFPIT_PMR)); + __raw_writew(MCFPIT_PCSR_EN | MCFPIT_PCSR_PIE | + MCFPIT_PCSR_OVW | MCFPIT_PCSR_RLD | + MCFPIT_PCSR_CLK64, TA(MCFPIT_PCSR)); + return 0; +} + +static int cf_pit_set_oneshot(struct clock_event_device *evt) +{ + __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR)); + __raw_writew(MCFPIT_PCSR_EN | MCFPIT_PCSR_PIE | + MCFPIT_PCSR_OVW | MCFPIT_PCSR_CLK64, TA(MCFPIT_PCSR)); + return 0; +} + +static int cf_pit_shutdown(struct clock_event_device *evt) +{ + __raw_writew(MCFPIT_PCSR_DISABLE, TA(MCFPIT_PCSR)); + return 0; } /* @@ -88,12 +79,15 @@ static int cf_pit_next_event(unsigned long delta, } struct clock_event_device cf_pit_clockevent = { - .name = "pit", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = init_cf_pit_timer, - .set_next_event = cf_pit_next_event, - .shift = 32, - .irq = MCF_IRQ_PIT1, + .name = "pit", + .features = CLOCK_EVT_FEAT_PERIODIC | + CLOCK_EVT_FEAT_ONESHOT, + .set_state_shutdown = cf_pit_shutdown, + .set_state_periodic = cf_pit_set_periodic, + .set_state_oneshot = cf_pit_set_oneshot, + .set_next_event = cf_pit_next_event, + .shift = 32, + .irq = MCF_IRQ_PIT1, }; diff --git a/arch/metag/include/asm/ftrace.h b/arch/metag/include/asm/ftrace.h index 2901f0f7d944..a2269d60a945 100644 --- a/arch/metag/include/asm/ftrace.h +++ b/arch/metag/include/asm/ftrace.h @@ -6,7 +6,7 @@ #ifndef __ASSEMBLY__ extern void mcount_wrapper(void); -#define MCOUNT_ADDR ((long)(mcount_wrapper)) +#define MCOUNT_ADDR ((unsigned long)(mcount_wrapper)) static inline unsigned long ftrace_call_adjust(unsigned long addr) { diff --git a/arch/microblaze/include/asm/ftrace.h b/arch/microblaze/include/asm/ftrace.h index fd2fa2eca62f..da0144f40d99 100644 --- a/arch/microblaze/include/asm/ftrace.h +++ b/arch/microblaze/include/asm/ftrace.h @@ -3,7 +3,7 @@ #ifdef CONFIG_FUNCTION_TRACER -#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_ADDR ((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 8 /* sizeof mcount call */ #ifndef __ASSEMBLY__ diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig index 1646cce032c3..642b50946943 100644 --- a/arch/mips/configs/pistachio_defconfig +++ b/arch/mips/configs/pistachio_defconfig @@ -320,7 +320,6 @@ CONFIG_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_YAMA=y -CONFIG_SECURITY_YAMA_STACKED=y CONFIG_DEFAULT_SECURITY_DAC=y CONFIG_CRYPTO_AUTHENC=y CONFIG_CRYPTO_HMAC=y diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 4ca54fdd8768..b9b4af2af9a5 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -67,7 
+67,7 @@ UTS_MACHINE := $(OLDARCH) ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y) override CC += -mlittle-endian -ifneq ($(COMPILER),clang) +ifneq ($(cc-name),clang) override CC += -mno-strict-align endif override AS += -mlittle-endian @@ -353,7 +353,7 @@ TOUT := .tmp_gas_check # - Require gcc 4.0 or above on 64-bit # - gcc-4.2.0 has issues compiling modules on 64-bit checkbin: - @if test "${COMPILER}" != "clang" \ + @if test "$(cc-name)" != "clang" \ && test "$(cc-version)" = "0304" ; then \ if ! /bin/echo mftb 5 | $(AS) -v -mppc -many -o $(TOUT) >/dev/null 2>&1 ; then \ echo -n '*** ${VERSION}.${PATCHLEVEL} kernels no longer build '; \ @@ -362,14 +362,14 @@ checkbin: false; \ fi ; \ fi - @if test "${COMPILER}" != "clang" \ + @if test "$(cc-name)" != "clang" \ && test "$(cc-version)" -lt "0400" \ && test "x${CONFIG_PPC64}" = "xy" ; then \ echo -n "Sorry, GCC v4.0 or above is required to build " ; \ echo "the 64-bit powerpc kernel." ; \ false ; \ fi - @if test "${COMPILER}" != "clang" \ + @if test "$(cc-name)" != "clang" \ && test "$(cc-fullversion)" = "040200" \ && test "x${CONFIG_MODULES}${CONFIG_PPC64}" = "xyy" ; then \ echo -n '*** GCC-4.2.0 cannot compile the 64-bit powerpc ' ; \ diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index e3661872fbea..ef89b1465573 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -2,7 +2,7 @@ #define _ASM_POWERPC_FTRACE #ifdef CONFIG_FUNCTION_TRACER -#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_ADDR ((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ #ifdef __ASSEMBLY__ diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index c8c62c7fc31c..2e710c15893f 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -102,7 +102,7 @@ static void of_pci_parse_addrs(struct device_node *node, struct pci_dev *dev) res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2]; } else if (i == dev->rom_base_reg) { res = &dev->resource[PCI_ROM_RESOURCE]; - flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE; + flags |= IORESOURCE_READONLY; } else { printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); continue; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index e1fe333da946..22d94c3e6fc4 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -113,7 +113,7 @@ int memory_add_physaddr_to_nid(u64 start) } #endif -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdata; struct zone *zone; @@ -128,7 +128,7 @@ int arch_add_memory(int nid, u64 start, u64 size) /* this should work for most non-highmem platforms */ zone = pgdata->node_zones + - zone_for_memory(nid, start, size, 0); + zone_for_memory(nid, start, size, 0, for_device); return __add_pages(nid, zone, start_pfn, nr_pages); } diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index f86250c48b53..d2b79bc336c1 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -141,13 +141,14 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) */ static long axon_ram_direct_access(struct block_device *device, sector_t sector, - void **kaddr, unsigned long *pfn, long size) + void __pmem **kaddr, unsigned long *pfn) { struct axon_ram_bank *bank = device->bd_disk->private_data; loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; + void *addr = (void *)(bank->ph_addr + offset); - *kaddr = (void 
*)(bank->ph_addr + offset); - *pfn = virt_to_phys(*kaddr) >> PAGE_SHIFT; + *kaddr = (void __pmem *)addr; + *pfn = virt_to_phys(addr) >> PAGE_SHIFT; return bank->size - offset; } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 2963b563621c..c3c07d3505ba 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -169,7 +169,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { unsigned long normal_end_pfn = PFN_DOWN(memblock_end_of_DRAM()); unsigned long dma_end_pfn = PFN_DOWN(MAX_DMA_ADDRESS); diff --git a/arch/sh/include/asm/ftrace.h b/arch/sh/include/asm/ftrace.h index e79fb6ebaa42..1f157b86eaa7 100644 --- a/arch/sh/include/asm/ftrace.h +++ b/arch/sh/include/asm/ftrace.h @@ -9,7 +9,7 @@ #ifndef __ASSEMBLY__ extern void mcount(void); -#define MCOUNT_ADDR ((long)(mcount)) +#define MCOUNT_ADDR ((unsigned long)(mcount)) #ifdef CONFIG_DYNAMIC_FTRACE #define CALL_ADDR ((long)(ftrace_call)) diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index 93ec9066dbef..3280a6bfa503 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -342,6 +342,7 @@ ioremap_cache(phys_addr_t offset, unsigned long size) { return __ioremap_mode(offset, size, PAGE_KERNEL); } +#define ioremap_cache ioremap_cache #ifdef CONFIG_HAVE_IOREMAP_PROT static inline void __iomem * diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 17f486233db0..75491862d900 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -485,7 +485,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { pg_data_t *pgdat; unsigned long start_pfn = PFN_DOWN(start); @@ -496,7 +496,8 @@ int arch_add_memory(int nid, u64 start, u64 size) /* We only have ZONE_NORMAL, so this is easy.. */ ret = __add_pages(nid, pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL), + zone_for_memory(nid, start, size, ZONE_NORMAL, + for_device), start_pfn, nr_pages); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index 9ec94ad116fb..3192a8e42fd6 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -2,7 +2,7 @@ #define _ASM_SPARC64_FTRACE #ifdef CONFIG_MCOUNT -#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_ADDR ((unsigned long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ #ifndef __ASSEMBLY__ diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c index 3a14a35592fe..b91d7f146175 100644 --- a/arch/sparc/kernel/pci.c +++ b/arch/sparc/kernel/pci.c @@ -231,8 +231,7 @@ static void pci_parse_of_addrs(struct platform_device *op, res = &dev->resource[(i - PCI_BASE_ADDRESS_0) >> 2]; } else if (i == dev->rom_base_reg) { res = &dev->resource[PCI_ROM_RESOURCE]; - flags |= IORESOURCE_READONLY | IORESOURCE_CACHEABLE - | IORESOURCE_SIZEALIGN; + flags |= IORESOURCE_READONLY | IORESOURCE_SIZEALIGN; } else { printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); continue; diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index 5bd252e3fdc5..d4e1fc41d06d 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -863,7 +863,7 @@ void __init mem_init(void) * memory to the highmem for now. 
*/ #ifndef CONFIG_NEED_MULTIPLE_NODES -int arch_add_memory(u64 start, u64 size) +int arch_add_memory(u64 start, u64 size, bool for_device) { struct pglist_data *pgdata = &contig_page_data; struct zone *zone = pgdata->node_zones + MAX_NR_ZONES-1; diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h index debafc40200a..3bb0a29fd2d7 100644 --- a/arch/unicore32/include/asm/memory.h +++ b/arch/unicore32/include/asm/memory.h @@ -61,12 +61,6 @@ #endif /* - * Convert a physical address to a Page Frame Number and back - */ -#define __phys_to_pfn(paddr) ((paddr) >> PAGE_SHIFT) -#define __pfn_to_phys(pfn) ((pfn) << PAGE_SHIFT) - -/* * Convert a page to/from a physical address */ #define page_to_phys(page) (__pfn_to_phys(page_to_pfn(page))) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 117e2f373e50..cc0d73eac047 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -27,7 +27,8 @@ config X86 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_PMEM_API + select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_MMIO_FLUSH select ARCH_HAS_SG_CHAIN select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI @@ -1450,10 +1451,14 @@ config ILLEGAL_POINTER_VALUE source "mm/Kconfig" +config X86_PMEM_LEGACY_DEVICE + bool + config X86_PMEM_LEGACY - bool "Support non-standard NVDIMMs and ADR protected memory" + tristate "Support non-standard NVDIMMs and ADR protected memory" depends on PHYS_ADDR_T_64BIT depends on BLK_DEV + select X86_PMEM_LEGACY_DEVICE select LIBNVDIMM help Treat memory marked using the non-standard e820 type of 12 as used diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 64d7cf1b50e1..440df0c7a2ee 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -294,6 +294,7 @@ static struct ahash_alg ghash_async_alg = { .cra_name = "ghash", .cra_driver_name = "ghash-clmulni", .cra_priority = 400, + .cra_ctxsize = sizeof(struct ghash_async_ctx), .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC, .cra_blocksize = GHASH_BLOCK_SIZE, .cra_type = &crypto_ahash_type, diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 9bf3ea14b9f0..e63aa38e85fb 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -89,6 +89,8 @@ int set_pages_rw(struct page *page, int numpages); void clflush_cache_range(void *addr, unsigned int size); +#define mmio_flush_range(addr, size) clflush_cache_range(addr, size) + #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; @@ -109,75 +111,4 @@ static inline int rodata_test(void) } #endif -#ifdef ARCH_HAS_NOCACHE_UACCESS - -/** - * arch_memcpy_to_pmem - copy data to persistent memory - * @dst: destination buffer for the copy - * @src: source buffer for the copy - * @n: length of the copy in bytes - * - * Copy data to persistent memory media via non-temporal stores so that - * a subsequent arch_wmb_pmem() can flush cpu and memory controller - * write buffers to guarantee durability. - */ -static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, - size_t n) -{ - int unwritten; - - /* - * We are copying between two kernel buffers, if - * __copy_from_user_inatomic_nocache() returns an error (page - * fault) we would have already reported a general protection fault - * before the WARN+BUG. 
- */ - unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, - (void __user *) src, n); - if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", - __func__, dst, src, unwritten)) - BUG(); -} - -/** - * arch_wmb_pmem - synchronize writes to persistent memory - * - * After a series of arch_memcpy_to_pmem() operations this drains data - * from cpu write buffers and any platform (memory controller) buffers - * to ensure that written data is durable on persistent memory media. - */ -static inline void arch_wmb_pmem(void) -{ - /* - * wmb() to 'sfence' all previous writes such that they are - * architecturally visible to 'pcommit'. Note, that we've - * already arranged for pmem writes to avoid the cache via - * arch_memcpy_to_pmem(). - */ - wmb(); - pcommit_sfence(); -} - -static inline bool __arch_has_wmb_pmem(void) -{ -#ifdef CONFIG_X86_64 - /* - * We require that wmb() be an 'sfence', that is only guaranteed on - * 64-bit builds - */ - return static_cpu_has(X86_FEATURE_PCOMMIT); -#else - return false; -#endif -} -#else /* ARCH_HAS_NOCACHE_UACCESS i.e. ARCH=um */ -extern void arch_memcpy_to_pmem(void __pmem *dst, const void *src, size_t n); -extern void arch_wmb_pmem(void); - -static inline bool __arch_has_wmb_pmem(void) -{ - return false; -} -#endif - #endif /* _ASM_X86_CACHEFLUSH_H */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f45acad3c4b6..24938852db30 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -3,9 +3,9 @@ #ifdef CONFIG_FUNCTION_TRACER #ifdef CC_USING_FENTRY -# define MCOUNT_ADDR ((long)(__fentry__)) +# define MCOUNT_ADDR ((unsigned long)(__fentry__)) #else -# define MCOUNT_ADDR ((long)(mcount)) +# define MCOUNT_ADDR ((unsigned long)(mcount)) #endif #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 7cfc085b6879..de25aad07853 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -250,12 +250,6 @@ static inline void flush_write_buffers(void) #endif } -static inline void __pmem *arch_memremap_pmem(resource_size_t offset, - unsigned long size) -{ - return (void __force __pmem *) ioremap_cache(offset, size); -} - #endif /* __KERNEL__ */ extern void native_io_delay(void); diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h new file mode 100644 index 000000000000..d8ce3ec816ab --- /dev/null +++ b/arch/x86/include/asm/pmem.h @@ -0,0 +1,153 @@ +/* + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +#ifndef __ASM_X86_PMEM_H__ +#define __ASM_X86_PMEM_H__ + +#include <linux/uaccess.h> +#include <asm/cacheflush.h> +#include <asm/cpufeature.h> +#include <asm/special_insns.h> + +#ifdef CONFIG_ARCH_HAS_PMEM_API +/** + * arch_memcpy_to_pmem - copy data to persistent memory + * @dst: destination buffer for the copy + * @src: source buffer for the copy + * @n: length of the copy in bytes + * + * Copy data to persistent memory media via non-temporal stores so that + * a subsequent arch_wmb_pmem() can flush cpu and memory controller + * write buffers to guarantee durability. + */ +static inline void arch_memcpy_to_pmem(void __pmem *dst, const void *src, + size_t n) +{ + int unwritten; + + /* + * We are copying between two kernel buffers, if + * __copy_from_user_inatomic_nocache() returns an error (page + * fault) we would have already reported a general protection fault + * before the WARN+BUG. + */ + unwritten = __copy_from_user_inatomic_nocache((void __force *) dst, + (void __user *) src, n); + if (WARN(unwritten, "%s: fault copying %p <- %p unwritten: %d\n", + __func__, dst, src, unwritten)) + BUG(); +} + +/** + * arch_wmb_pmem - synchronize writes to persistent memory + * + * After a series of arch_memcpy_to_pmem() operations this drains data + * from cpu write buffers and any platform (memory controller) buffers + * to ensure that written data is durable on persistent memory media. + */ +static inline void arch_wmb_pmem(void) +{ + /* + * wmb() to 'sfence' all previous writes such that they are + * architecturally visible to 'pcommit'. Note, that we've + * already arranged for pmem writes to avoid the cache via + * arch_memcpy_to_pmem(). + */ + wmb(); + pcommit_sfence(); +} + +/** + * __arch_wb_cache_pmem - write back a cache range with CLWB + * @vaddr: virtual start address + * @size: number of bytes to write back + * + * Write back a cache range using the CLWB (cache line write back) + * instruction. This function requires explicit ordering with an + * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. + */ +static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) +{ + u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; + unsigned long clflush_mask = x86_clflush_size - 1; + void *vend = vaddr + size; + void *p; + + for (p = (void *)((unsigned long)vaddr & ~clflush_mask); + p < vend; p += x86_clflush_size) + clwb(p); +} + +/* + * copy_from_iter_nocache() on x86 only uses non-temporal stores for iovec + * iterators, so for other types (bvec & kvec) we must do a cache write-back. + */ +static inline bool __iter_needs_pmem_wb(struct iov_iter *i) +{ + return iter_is_iovec(i) == false; +} + +/** + * arch_copy_from_iter_pmem - copy data from an iterator to PMEM + * @addr: PMEM destination address + * @bytes: number of bytes to copy + * @i: iterator with source data + * + * Copy data from the iterator 'i' to the PMEM buffer starting at 'addr'. + * This function requires explicit ordering with an arch_wmb_pmem() call. 
+ */ +static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, + struct iov_iter *i) +{ + void *vaddr = (void __force *)addr; + size_t len; + + /* TODO: skip the write-back by always using non-temporal stores */ + len = copy_from_iter_nocache(vaddr, bytes, i); + + if (__iter_needs_pmem_wb(i)) + __arch_wb_cache_pmem(vaddr, bytes); + + return len; +} + +/** + * arch_clear_pmem - zero a PMEM memory range + * @addr: virtual start address + * @size: number of bytes to zero + * + * Write zeros into the memory range starting at 'addr' for 'size' bytes. + * This function requires explicit ordering with an arch_wmb_pmem() call. + */ +static inline void arch_clear_pmem(void __pmem *addr, size_t size) +{ + void *vaddr = (void __force *)addr; + + /* TODO: implement the zeroing via non-temporal writes */ + if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) + clear_page(vaddr); + else + memset(vaddr, 0, size); + + __arch_wb_cache_pmem(vaddr, size); +} + +static inline bool __arch_has_wmb_pmem(void) +{ + /* + * We require that wmb() be an 'sfence', that is only guaranteed on + * 64-bit builds + */ + return static_cpu_has(X86_FEATURE_PCOMMIT); +} +#endif /* CONFIG_ARCH_HAS_PMEM_API */ +#endif /* __ASM_X86_PMEM_H__ */ diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h index 608a79d5a466..e6911caf5bbf 100644 --- a/arch/x86/include/asm/xen/events.h +++ b/arch/x86/include/asm/xen/events.h @@ -20,4 +20,15 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) /* No need for a barrier -- XCHG is a barrier on x86. */ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +extern int xen_have_vector_callback; + +/* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ +static inline bool xen_support_evtchn_rebind(void) +{ + return (!xen_hvm_domain() || xen_have_vector_callback); +} + #endif /* _ASM_X86_XEN_EVENTS_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index ca08a27b90b3..83aea8055119 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -465,6 +465,12 @@ HYPERVISOR_tmem_op( return _hypercall1(int, tmem_op, op); } +static inline int +HYPERVISOR_xenpmu_op(unsigned int op, void *arg) +{ + return _hypercall2(int, xenpmu_op, op, arg); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 3400dbaec3c3..62ca03ef5c65 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -3,12 +3,38 @@ * * Guest OS interface to x86 Xen. * - * Copyright (c) 2004, K A Fraser + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (c) 2004-2006, K A Fraser */ #ifndef _ASM_X86_XEN_INTERFACE_H #define _ASM_X86_XEN_INTERFACE_H +/* + * XEN_GUEST_HANDLE represents a guest pointer, when passed as a field + * in a struct in memory. + * XEN_GUEST_HANDLE_PARAM represent a guest pointer, when passed as an + * hypercall argument. + * XEN_GUEST_HANDLE_PARAM and XEN_GUEST_HANDLE are the same on X86 but + * they might not be on other architectures. + */ #ifdef __XEN__ #define __DEFINE_GUEST_HANDLE(name, type) \ typedef struct { type *p; } __guest_handle_ ## name @@ -88,13 +114,16 @@ DEFINE_GUEST_HANDLE(xen_ulong_t); * start of the GDT because some stupid OSes export hard-coded selector values * in their ABI. These hard-coded values are always near the start of the GDT, * so Xen places itself out of the way, at the far end of the GDT. + * + * NB The LDT is set using the MMUEXT_SET_LDT op of HYPERVISOR_mmuext_op */ #define FIRST_RESERVED_GDT_PAGE 14 #define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096) #define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8) /* - * Send an array of these to HYPERVISOR_set_trap_table() + * Send an array of these to HYPERVISOR_set_trap_table(). + * Terminate the array with a sentinel entry, with traps[].address==0. * The privilege level specifies which modes may enter a trap via a software * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate * privilege levels as follows: @@ -118,10 +147,41 @@ struct trap_info { DEFINE_GUEST_HANDLE_STRUCT(trap_info); struct arch_shared_info { - unsigned long max_pfn; /* max pfn that appears in table */ - /* Frame containing list of mfns containing list of mfns containing p2m. */ - unsigned long pfn_to_mfn_frame_list_list; - unsigned long nmi_reason; + /* + * Number of valid entries in the p2m table(s) anchored at + * pfn_to_mfn_frame_list_list and/or p2m_vaddr. + */ + unsigned long max_pfn; + /* + * Frame containing list of mfns containing list of mfns containing p2m. + * A value of 0 indicates it has not yet been set up, ~0 indicates it + * has been set to invalid e.g. due to the p2m being too large for the + * 3-level p2m tree. In this case the linear mapper p2m list anchored + * at p2m_vaddr is to be used. + */ + xen_pfn_t pfn_to_mfn_frame_list_list; + unsigned long nmi_reason; + /* + * Following three fields are valid if p2m_cr3 contains a value + * different from 0. + * p2m_cr3 is the root of the address space where p2m_vaddr is valid. + * p2m_cr3 is in the same format as a cr3 value in the vcpu register + * state and holds the folded machine frame number (via xen_pfn_to_cr3) + * of a L3 or L4 page table. + * p2m_vaddr holds the virtual address of the linear p2m list. All + * entries in the range [0...max_pfn[ are accessible via this pointer. + * p2m_generation will be incremented by the guest before and after each + * change of the mappings of the p2m list. p2m_generation starts at 0 + * and a value with the least significant bit set indicates that a + * mapping update is in progress. 
This allows guest external software + * (e.g. in Dom0) to verify that read mappings are consistent and + * whether they have changed since the last check. + * Modifying a p2m element in the linear p2m list is allowed via an + * atomic write only. + */ + unsigned long p2m_cr3; /* cr3 value of the p2m address space */ + unsigned long p2m_vaddr; /* virtual address of the p2m list */ + unsigned long p2m_generation; /* generation count of p2m mapping */ }; #endif /* !__ASSEMBLY__ */ @@ -137,13 +197,31 @@ struct arch_shared_info { /* * The following is all CPU context. Note that the fpu_ctxt block is filled * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used. + * + * Also note that when calling DOMCTL_setvcpucontext and VCPU_initialise + * for HVM and PVH guests, not all information in this structure is updated: + * + * - For HVM guests, the structures read include: fpu_ctxt (if + * VGCT_I387_VALID is set), flags, user_regs, debugreg[*] + * + * - PVH guests are the same as HVM guests, but additionally use ctrlreg[3] to + * set cr3. All other fields not used should be set to 0. */ struct vcpu_guest_context { /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */ struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */ -#define VGCF_I387_VALID (1<<0) -#define VGCF_HVM_GUEST (1<<1) -#define VGCF_IN_KERNEL (1<<2) +#define VGCF_I387_VALID (1<<0) +#define VGCF_IN_KERNEL (1<<2) +#define _VGCF_i387_valid 0 +#define VGCF_i387_valid (1<<_VGCF_i387_valid) +#define _VGCF_in_kernel 2 +#define VGCF_in_kernel (1<<_VGCF_in_kernel) +#define _VGCF_failsafe_disables_events 3 +#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events) +#define _VGCF_syscall_disables_events 4 +#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events) +#define _VGCF_online 5 +#define VGCF_online (1<<_VGCF_online) unsigned long flags; /* VGCF_* flags */ struct cpu_user_regs user_regs; /* User-level CPU registers */ struct trap_info trap_ctxt[256]; /* Virtual IDT */ @@ -172,6 +250,129 @@ struct vcpu_guest_context { #endif }; DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context); + +/* AMD PMU registers and structures */ +struct xen_pmu_amd_ctxt { + /* + * Offsets to counter and control MSRs (relative to xen_pmu_arch.c.amd). + * For PV(H) guests these fields are RO. + */ + uint32_t counters; + uint32_t ctrls; + + /* Counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Intel PMU registers and structures */ +struct xen_pmu_cntr_pair { + uint64_t counter; + uint64_t control; +}; + +struct xen_pmu_intel_ctxt { + /* + * Offsets to fixed and architectural counter MSRs (relative to + * xen_pmu_arch.c.intel). + * For PV(H) guests these fields are RO. 
+ */ + uint32_t fixed_counters; + uint32_t arch_counters; + + /* PMU registers */ + uint64_t global_ctrl; + uint64_t global_ovf_ctrl; + uint64_t global_status; + uint64_t fixed_ctrl; + uint64_t ds_area; + uint64_t pebs_enable; + uint64_t debugctl; + + /* Fixed and architectural counter MSRs */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + uint64_t regs[]; +#elif defined(__GNUC__) + uint64_t regs[0]; +#endif +}; + +/* Sampled domain's registers */ +struct xen_pmu_regs { + uint64_t ip; + uint64_t sp; + uint64_t flags; + uint16_t cs; + uint16_t ss; + uint8_t cpl; + uint8_t pad[3]; +}; + +/* PMU flags */ +#define PMU_CACHED (1<<0) /* PMU MSRs are cached in the context */ +#define PMU_SAMPLE_USER (1<<1) /* Sample is from user or kernel mode */ +#define PMU_SAMPLE_REAL (1<<2) /* Sample is from realmode */ +#define PMU_SAMPLE_PV (1<<3) /* Sample from a PV guest */ + +/* + * Architecture-specific information describing state of the processor at + * the time of PMU interrupt. + * Fields of this structure marked as RW for guest should only be written by + * the guest when PMU_CACHED bit in pmu_flags is set (which is done by the + * hypervisor during PMU interrupt). Hypervisor will read updated data in + * XENPMU_flush hypercall and clear PMU_CACHED bit. + */ +struct xen_pmu_arch { + union { + /* + * Processor's registers at the time of interrupt. + * WO for hypervisor, RO for guests. + */ + struct xen_pmu_regs regs; + /* + * Padding for adding new registers to xen_pmu_regs in + * the future + */ +#define XENPMU_REGS_PAD_SZ 64 + uint8_t pad[XENPMU_REGS_PAD_SZ]; + } r; + + /* WO for hypervisor, RO for guest */ + uint64_t pmu_flags; + + /* + * APIC LVTPC register. + * RW for both hypervisor and guest. + * Only APIC_LVT_MASKED bit is loaded by the hypervisor into hardware + * during XENPMU_flush or XENPMU_lvtpc_set. + */ + union { + uint32_t lapic_lvtpc; + uint64_t pad; + } l; + + /* + * Vendor-specific PMU registers. + * RW for both hypervisor and guest (see exceptions above). 
+ * Guest's updates to this field are verified and then loaded by the + * hypervisor into hardware during XENPMU_flush + */ + union { + struct xen_pmu_amd_ctxt amd; + struct xen_pmu_intel_ctxt intel; + + /* + * Padding for contexts (fixed parts only, does not include + * MSR banks that are specified by offsets) + */ +#define XENPMU_CTXT_PAD_SZ 128 + uint8_t pad[XENPMU_CTXT_PAD_SZ]; + } c; +}; + #endif /* !__ASSEMBLY__ */ /* diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index c44a5d53e464..a3804fbe1f36 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -35,9 +35,7 @@ typedef struct xpaddr { #define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) #define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT) -/* Maximum amount of memory we can handle in a domain in pages */ -#define MAX_DOMAIN_PAGES \ - ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) extern unsigned long *machine_to_phys_mapping; extern unsigned long machine_to_phys_nr; @@ -48,8 +46,8 @@ extern unsigned long xen_max_p2m_pfn; extern unsigned long get_phys_to_machine(unsigned long pfn); extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -extern unsigned long set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); +extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, + unsigned long pfn_e); extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, struct gnttab_map_grant_ref *kmap_ops, diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index 0f457e6eab18..9dafe59cf6e2 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -37,7 +37,7 @@ /* * This is a non-standardized way to represent ADR or NVDIMM regions that * persist over a reboot. The kernel will ignore their special capabilities - * unless the CONFIG_X86_PMEM_LEGACY=y option is set. + * unless the CONFIG_X86_PMEM_LEGACY option is set. * * ( Note that older platforms also used 6 for the same type of memory, * but newer versions switched to 12 as 6 was assigned differently. Some diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3c3622176340..9ffdf25e5b86 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o -obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o +obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 6873ab925d00..045e424fb368 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -28,146 +28,21 @@ u64 hw_nmi_get_sample_period(int watchdog_thresh) #endif #ifdef arch_trigger_all_cpu_backtrace -/* For reliability, we're prepared to waste bits here. 
*/ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; -static cpumask_t printtrace_mask; - -#define NMI_BUF_SIZE 4096 - -struct nmi_seq_buf { - unsigned char buffer[NMI_BUF_SIZE]; - struct seq_buf seq; -}; - -/* Safe printing in NMI context */ -static DEFINE_PER_CPU(struct nmi_seq_buf, nmi_print_seq); - -/* "in progress" flag of arch_trigger_all_cpu_backtrace */ -static unsigned long backtrace_flag; - -static void print_seq_line(struct nmi_seq_buf *s, int start, int end) +static void nmi_raise_cpu_backtrace(cpumask_t *mask) { - const char *buf = s->buffer + start; - - printk("%.*s", (end - start) + 1, buf); + apic->send_IPI_mask(mask, NMI_VECTOR); } void arch_trigger_all_cpu_backtrace(bool include_self) { - struct nmi_seq_buf *s; - int len; - int cpu; - int i; - int this_cpu = get_cpu(); - - if (test_and_set_bit(0, &backtrace_flag)) { - /* - * If there is already a trigger_all_cpu_backtrace() in progress - * (backtrace_flag == 1), don't output double cpu dump infos. - */ - put_cpu(); - return; - } - - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - if (!include_self) - cpumask_clear_cpu(this_cpu, to_cpumask(backtrace_mask)); - - cpumask_copy(&printtrace_mask, to_cpumask(backtrace_mask)); - /* - * Set up per_cpu seq_buf buffers that the NMIs running on the other - * CPUs will write to. - */ - for_each_cpu(cpu, to_cpumask(backtrace_mask)) { - s = &per_cpu(nmi_print_seq, cpu); - seq_buf_init(&s->seq, s->buffer, NMI_BUF_SIZE); - } - - if (!cpumask_empty(to_cpumask(backtrace_mask))) { - pr_info("sending NMI to %s CPUs:\n", - (include_self ? "all" : "other")); - apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); - } - - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(to_cpumask(backtrace_mask))) - break; - mdelay(1); - touch_softlockup_watchdog(); - } - - /* - * Now that all the NMIs have triggered, we can dump out their - * back traces safely to the console. - */ - for_each_cpu(cpu, &printtrace_mask) { - int last_i = 0; - - s = &per_cpu(nmi_print_seq, cpu); - len = seq_buf_used(&s->seq); - if (!len) - continue; - - /* Print line by line. */ - for (i = 0; i < len; i++) { - if (s->buffer[i] == '\n') { - print_seq_line(s, last_i, i); - last_i = i + 1; - } - } - /* Check if there was a partial line. */ - if (last_i < len) { - print_seq_line(s, last_i, len - 1); - pr_cont("\n"); - } - } - - clear_bit(0, &backtrace_flag); - smp_mb__after_atomic(); - put_cpu(); -} - -/* - * It is not safe to call printk() directly from NMI handlers. - * It may be fine if the NMI detected a lock up and we have no choice - * but to do so, but doing a NMI on all other CPUs to get a back trace - * can be done with a sysrq-l. We don't want that to lock up, which - * can happen if the NMI interrupts a printk in progress. - * - * Instead, we redirect the vprintk() to this nmi_vprintk() that writes - * the content into a per cpu seq_buf buffer. Then when the NMIs are - * all done, we can safely dump the contents of the seq_buf to a printk() - * from a non NMI context. 
- */ -static int nmi_vprintk(const char *fmt, va_list args) -{ - struct nmi_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - unsigned int len = seq_buf_used(&s->seq); - - seq_buf_vprintf(&s->seq, fmt, args); - return seq_buf_used(&s->seq) - len; + nmi_trigger_all_cpu_backtrace(include_self, nmi_raise_cpu_backtrace); } static int arch_trigger_all_cpu_backtrace_handler(unsigned int cmd, struct pt_regs *regs) { - int cpu; - - cpu = smp_processor_id(); - - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - printk_func_t printk_func_save = this_cpu_read(printk_func); - - /* Replace printk to write into the NMI seq */ - this_cpu_write(printk_func, nmi_vprintk); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - this_cpu_write(printk_func, printk_func_save); - - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + if (nmi_cpu_backtrace(regs)) return NMI_HANDLED; - } return NMI_DONE; } diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 961e51e9c6f6..0f8a6bbaaa44 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -533,7 +533,9 @@ static int bzImage64_verify_sig(const char *kernel, unsigned long kernel_len) int ret; ret = verify_pefile_signature(kernel, kernel_len, - system_trusted_keyring, &trusted); + system_trusted_keyring, + VERIFYING_KEXEC_PE_SIGNATURE, + &trusted); if (ret < 0) return ret; if (!trusted) diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 64f90f53bb85..4f00b63d7ff3 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -3,80 +3,17 @@ * Copyright (c) 2015, Intel Corporation. */ #include <linux/platform_device.h> -#include <linux/libnvdimm.h> #include <linux/module.h> -#include <asm/e820.h> - -static void e820_pmem_release(struct device *dev) -{ - struct nvdimm_bus *nvdimm_bus = dev->platform_data; - - if (nvdimm_bus) - nvdimm_bus_unregister(nvdimm_bus); -} - -static struct platform_device e820_pmem = { - .name = "e820_pmem", - .id = -1, - .dev = { - .release = e820_pmem_release, - }, -}; - -static const struct attribute_group *e820_pmem_attribute_groups[] = { - &nvdimm_bus_attribute_group, - NULL, -}; - -static const struct attribute_group *e820_pmem_region_attribute_groups[] = { - &nd_region_attribute_group, - &nd_device_attribute_group, - NULL, -}; static __init int register_e820_pmem(void) { - static struct nvdimm_bus_descriptor nd_desc; - struct device *dev = &e820_pmem.dev; - struct nvdimm_bus *nvdimm_bus; - int rc, i; - - rc = platform_device_register(&e820_pmem); - if (rc) - return rc; - - nd_desc.attr_groups = e820_pmem_attribute_groups; - nd_desc.provider_name = "e820"; - nvdimm_bus = nvdimm_bus_register(dev, &nd_desc); - if (!nvdimm_bus) - goto err; - dev->platform_data = nvdimm_bus; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - struct resource res = { - .flags = IORESOURCE_MEM, - .start = ei->addr, - .end = ei->addr + ei->size - 1, - }; - struct nd_region_desc ndr_desc; - - if (ei->type != E820_PRAM) - continue; - - memset(&ndr_desc, 0, sizeof(ndr_desc)); - ndr_desc.res = &res; - ndr_desc.attr_groups = e820_pmem_region_attribute_groups; - ndr_desc.numa_node = NUMA_NO_NODE; - if (!nvdimm_pmem_region_create(nvdimm_bus, &ndr_desc)) - goto err; - } - - return 0; - - err: - dev_err(dev, "failed to register legacy persistent memory ranges\n"); - platform_device_unregister(&e820_pmem); - return -ENXIO; + struct platform_device *pdev; + + /* + * See drivers/nvdimm/e820.c for the implementation, this is + * 
simply here to trigger the module to load on demand. + */ + pdev = platform_device_alloc("e820_pmem", -1); + return platform_device_add(pdev); } device_initcall(register_e820_pmem); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 68aec42545c2..7562f42914b4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -823,11 +823,11 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdata = NODE_DATA(nid); struct zone *zone = pgdata->node_zones + - zone_for_memory(nid, start, size, ZONE_HIGHMEM); + zone_for_memory(nid, start, size, ZONE_HIGHMEM, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3fba623e3ba5..30564e2752d3 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -687,11 +687,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ -int arch_add_memory(int nid, u64 start, u64 size) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { struct pglist_data *pgdat = NODE_DATA(nid); struct zone *zone = pgdat->node_zones + - zone_for_memory(nid, start, size, ZONE_NORMAL); + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 484145368a24..c7b15f3e2cf3 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -7,6 +7,7 @@ config XEN depends on PARAVIRT select PARAVIRT_CLOCK select XEN_HAVE_PVMMU + select XEN_HAVE_VPMU depends on X86_64 || (X86_32 && X86_PAE) depends on X86_LOCAL_APIC && X86_TSC help @@ -23,14 +24,18 @@ config XEN_PVHVM def_bool y depends on XEN && PCI && X86_LOCAL_APIC -config XEN_MAX_DOMAIN_MEMORY - int - default 500 if X86_64 - default 64 if X86_32 - depends on XEN - help - This only affects the sizing of some bss arrays, the unused - portions of which are freed. +config XEN_512GB + bool "Limit Xen pv-domain memory to 512GB" + depends on XEN && X86_64 + default y + help + Limit paravirtualized user domains to 512GB of RAM. + + The Xen tools and crash dump analysis tools might not support + pv-domains with more than 512 GB of RAM. This option controls the + default setting of the kernel to use only up to 512 GB or more. + It is always possible to change the default via specifying the + boot parameter "xen_512gb_limit". 
config XEN_SAVE_RESTORE bool diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 4b6e29ac0968..e47e52787d32 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -13,7 +13,7 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ grant-table.o suspend.o platform-pci-unplug.o \ - p2m.o apic.o + p2m.o apic.o pmu.o obj-$(CONFIG_EVENT_TRACING) += trace.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 70e060ad879a..acda713ab5be 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -7,6 +7,7 @@ #include <xen/xen.h> #include <xen/interface/physdev.h> #include "xen-ops.h" +#include "pmu.h" #include "smp.h" static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) @@ -72,6 +73,11 @@ static u32 xen_apic_read(u32 reg) static void xen_apic_write(u32 reg, u32 val) { + if (reg == APIC_LVTPC) { + (void)pmu_apic_update(reg); + return; + } + /* Warn to see if there's any stray references */ WARN(1,"register: %x, value: %x\n", reg, val); } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d9cfa452da9d..30d12afe52ed 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -84,6 +84,7 @@ #include "mmu.h" #include "smp.h" #include "multicalls.h" +#include "pmu.h" EXPORT_SYMBOL_GPL(hypercall_page); @@ -1010,8 +1011,7 @@ static void xen_write_cr0(unsigned long cr0) static void xen_write_cr4(unsigned long cr4) { - cr4 &= ~X86_CR4_PGE; - cr4 &= ~X86_CR4_PSE; + cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE); native_write_cr4(cr4); } @@ -1030,6 +1030,9 @@ static u64 xen_read_msr_safe(unsigned int msr, int *err) { u64 val; + if (pmu_msr_read(msr, &val, err)) + return val; + val = native_read_msr_safe(msr, err); switch (msr) { case MSR_IA32_APICBASE: @@ -1076,7 +1079,8 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) Xen console noise. */ default: - ret = native_write_msr_safe(msr, low, high); + if (!pmu_msr_write(msr, low, high, &ret)) + ret = native_write_msr_safe(msr, low, high); } return ret; @@ -1215,7 +1219,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_msr = xen_read_msr_safe, .write_msr = xen_write_msr_safe, - .read_pmc = native_read_pmc, + .read_pmc = xen_read_pmc, .iret = xen_iret, #ifdef CONFIG_X86_64 @@ -1264,6 +1268,10 @@ static const struct pv_apic_ops xen_apic_ops __initconst = { static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); @@ -1607,7 +1615,9 @@ asmlinkage __visible void __init xen_start_kernel(void) early_boot_irqs_disabled = true; xen_raw_console_write("mapping kernel into physical memory\n"); - xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); + xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, + xen_start_info->nr_pages); + xen_reserve_special_pages(); /* * Modify the cache mode translation tables to match Xen's PAT diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index dd151b2045b0..2c50b445884e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -116,6 +116,7 @@ static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ +static phys_addr_t xen_pt_base, xen_pt_size __initdata; /* * Just beyond the highest usermode address. 
STACK_TOP_MAX has a @@ -1093,6 +1094,16 @@ static void xen_exit_mmap(struct mm_struct *mm) static void xen_post_allocator_init(void); +static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + #ifdef CONFIG_X86_64 static void __init xen_cleanhighmap(unsigned long vaddr, unsigned long vaddr_end) @@ -1114,6 +1125,83 @@ static void __init xen_cleanhighmap(unsigned long vaddr, xen_mc_flush(); } +/* + * Make a page range writeable and free it. + */ +static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) +{ + void *vaddr = __va(paddr); + void *vaddr_end = vaddr + size; + + for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) + make_lowmem_page_readwrite(vaddr); + + memblock_free(paddr, size); +} + +static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) +{ + unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; + + if (unpin) + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); + ClearPagePinned(virt_to_page(__va(pa))); + xen_free_ro_pages(pa, PAGE_SIZE); +} + +/* + * Since it is well isolated we can (and since it is perhaps large we should) + * also free the page tables mapping the initial P->M table. + */ +static void __init xen_cleanmfnmap(unsigned long vaddr) +{ + unsigned long va = vaddr & PMD_MASK; + unsigned long pa; + pgd_t *pgd = pgd_offset_k(va); + pud_t *pud_page = pud_offset(pgd, 0); + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned int i; + bool unpin; + + unpin = (vaddr == 2 * PGDIR_SIZE); + set_pgd(pgd, __pgd(0)); + do { + pud = pud_page + pud_index(va); + if (pud_none(*pud)) { + va += PUD_SIZE; + } else if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PUD_SIZE); + va += PUD_SIZE; + } else { + pmd = pmd_offset(pud, va); + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PMD_SIZE); + } else if (!pmd_none(*pmd)) { + pte = pte_offset_kernel(pmd, va); + set_pmd(pmd, __pmd(0)); + for (i = 0; i < PTRS_PER_PTE; ++i) { + if (pte_none(pte[i])) + break; + pa = pte_pfn(pte[i]) << PAGE_SHIFT; + xen_free_ro_pages(pa, PAGE_SIZE); + } + xen_cleanmfnmap_free_pgtbl(pte, unpin); + } + va += PMD_SIZE; + if (pmd_index(va)) + continue; + set_pud(pud, __pud(0)); + xen_cleanmfnmap_free_pgtbl(pmd, unpin); + } + + } while (pud_index(va) || pmd_index(va)); + xen_cleanmfnmap_free_pgtbl(pud_page, unpin); +} + static void __init xen_pagetable_p2m_free(void) { unsigned long size; @@ -1128,18 +1216,31 @@ static void __init xen_pagetable_p2m_free(void) /* using __ka address and sticking INVALID_P2M_ENTRY! */ memset((void *)xen_start_info->mfn_list, 0xff, size); - /* We should be in __ka space. */ - BUG_ON(xen_start_info->mfn_list < __START_KERNEL_map); addr = xen_start_info->mfn_list; - /* We roundup to the PMD, which means that if anybody at this stage is - * using the __ka address of xen_start_info or xen_start_info->shared_info - * they are in going to crash. Fortunatly we have already revectored - * in xen_setup_kernel_pagetable and in xen_setup_shared_info. */ + /* + * We could be in __ka space. + * We roundup to the PMD, which means that if anybody at this stage is + * using the __ka address of xen_start_info or + * xen_start_info->shared_info they are in going to crash. Fortunatly + * we have already revectored in xen_setup_kernel_pagetable and in + * xen_setup_shared_info. 
+ */ size = roundup(size, PMD_SIZE); - xen_cleanhighmap(addr, addr + size); - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); - memblock_free(__pa(xen_start_info->mfn_list), size); + if (addr >= __START_KERNEL_map) { + xen_cleanhighmap(addr, addr + size); + size = PAGE_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + memblock_free(__pa(addr), size); + } else { + xen_cleanmfnmap(addr); + } +} + +static void __init xen_pagetable_cleanhighmap(void) +{ + unsigned long size; + unsigned long addr; /* At this stage, cleanup_highmap has already cleaned __ka space * from _brk_limit way up to the max_pfn_mapped (which is the end of @@ -1172,6 +1273,8 @@ static void __init xen_pagetable_p2m_setup(void) #ifdef CONFIG_X86_64 xen_pagetable_p2m_free(); + + xen_pagetable_cleanhighmap(); #endif /* And revector! Bye bye old array */ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; @@ -1461,6 +1564,24 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) #else /* CONFIG_X86_64 */ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) { + unsigned long pfn; + + if (xen_feature(XENFEAT_writable_page_tables) || + xen_feature(XENFEAT_auto_translated_physmap) || + xen_start_info->mfn_list >= __START_KERNEL_map) + return pte; + + /* + * Pages belonging to the initial p2m list mapped outside the default + * address range must be mapped read-only. This region contains the + * page tables for mapping the p2m list, too, and page tables MUST be + * mapped read-only. + */ + pfn = pte_pfn(pte); + if (pfn >= xen_start_info->first_p2m_pfn && + pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) + pte = __pte_ma(pte_val_ma(pte) & ~_PAGE_RW); + return pte; } #endif /* CONFIG_X86_64 */ @@ -1489,15 +1610,6 @@ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) native_set_pte(ptep, pte); } -static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) -{ - struct mmuext_op op; - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); -} - /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) @@ -1815,7 +1927,10 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) * mappings. Considering that on Xen after the kernel mappings we * have the mappings of some pages that don't exist in pfn space, we * set max_pfn_mapped to the last real pfn mapped. */ - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + if (xen_start_info->mfn_list < __START_KERNEL_map) + max_pfn_mapped = xen_start_info->first_p2m_pfn; + else + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); pt_end = pt_base + xen_start_info->nr_pt_frames; @@ -1855,6 +1970,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) /* Graft it onto L4[511][510] */ copy_page(level2_kernel_pgt, l2); + /* Copy the initial P->M table mappings if necessary. 
*/ + i = pgd_index(xen_start_info->mfn_list); + if (i && i < pgd_index(__START_KERNEL_map)) + init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; + if (!xen_feature(XENFEAT_auto_translated_physmap)) { /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); @@ -1894,10 +2014,192 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) check_pt_base(&pt_base, &pt_end, addr[i]); /* Our (by three pages) smaller Xen pagetable that we are using */ - memblock_reserve(PFN_PHYS(pt_base), (pt_end - pt_base) * PAGE_SIZE); + xen_pt_base = PFN_PHYS(pt_base); + xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; + memblock_reserve(xen_pt_base, xen_pt_size); + /* Revector the xen_start_info */ xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); } + +/* + * Read a value from a physical address. + */ +static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) +{ + unsigned long *vaddr; + unsigned long val; + + vaddr = early_memremap_ro(addr, sizeof(val)); + val = *vaddr; + early_memunmap(vaddr, sizeof(val)); + return val; +} + +/* + * Translate a virtual address to a physical one without relying on mapped + * page tables. + */ +static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) +{ + phys_addr_t pa; + pgd_t pgd; + pud_t pud; + pmd_t pmd; + pte_t pte; + + pa = read_cr3(); + pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * + sizeof(pgd))); + if (!pgd_present(pgd)) + return 0; + + pa = pgd_val(pgd) & PTE_PFN_MASK; + pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) * + sizeof(pud))); + if (!pud_present(pud)) + return 0; + pa = pud_pfn(pud) << PAGE_SHIFT; + if (pud_large(pud)) + return pa + (vaddr & ~PUD_MASK); + + pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * + sizeof(pmd))); + if (!pmd_present(pmd)) + return 0; + pa = pmd_pfn(pmd) << PAGE_SHIFT; + if (pmd_large(pmd)) + return pa + (vaddr & ~PMD_MASK); + + pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * + sizeof(pte))); + if (!pte_present(pte)) + return 0; + pa = pte_pfn(pte) << PAGE_SHIFT; + + return pa | (vaddr & ~PAGE_MASK); +} + +/* + * Find a new area for the hypervisor supplied p2m list and relocate the p2m to + * this area. + */ +void __init xen_relocate_p2m(void) +{ + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; + unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; + int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; + pte_t *pt; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + unsigned long *new_p2m; + + size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); + n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; + n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; + n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; + n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; + n_frames = n_pte + n_pt + n_pmd + n_pud; + + new_area = xen_find_free_area(PFN_PHYS(n_frames)); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n"); + BUG(); + } + + /* + * Setup the page tables for addressing the new p2m list. + * We have asked the hypervisor to map the p2m list at the user address + * PUD_SIZE. It may have done so, or it may have used a kernel space + * address depending on the Xen version. + * To avoid any possible virtual address collision, just use + * 2 * PUD_SIZE for the new area. 
+ */ + pud_phys = new_area; + pmd_phys = pud_phys + PFN_PHYS(n_pud); + pt_phys = pmd_phys + PFN_PHYS(n_pmd); + p2m_pfn = PFN_DOWN(pt_phys) + n_pt; + + pgd = __va(read_cr3()); + new_p2m = (unsigned long *)(2 * PGDIR_SIZE); + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; + } + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; + } + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; + } + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; + } + + /* Now copy the old p2m info to the new area. */ + memcpy(new_p2m, xen_p2m_addr, size); + xen_p2m_addr = new_p2m; + + /* Release the old p2m list and set new list info. */ + p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list)); + BUG_ON(!p2m_pfn); + p2m_pfn_end = p2m_pfn + PFN_DOWN(size); + + if (xen_start_info->mfn_list < __START_KERNEL_map) { + pfn = xen_start_info->first_p2m_pfn; + pfn_end = xen_start_info->first_p2m_pfn + + xen_start_info->nr_p2m_frames; + set_pgd(pgd + 1, __pgd(0)); + } else { + pfn = p2m_pfn; + pfn_end = p2m_pfn_end; + } + + memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); + while (pfn < pfn_end) { + if (pfn == p2m_pfn) { + pfn = p2m_pfn_end; + continue; + } + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); + pfn++; + } + + xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; + xen_start_info->first_p2m_pfn = PFN_DOWN(new_area); + xen_start_info->nr_p2m_frames = n_frames; +} + #else /* !CONFIG_X86_64 */ static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); @@ -1938,18 +2240,41 @@ static void __init xen_write_cr3_init(unsigned long cr3) pv_mmu_ops.write_cr3 = &xen_write_cr3; } +/* + * For 32 bit domains xen_start_info->pt_base is the pgd address which might be + * not the first page table in the page table pool. + * Iterate through the initial page tables to find the real page table base. 
+ */ +static phys_addr_t xen_find_pt_base(pmd_t *pmd) +{ + phys_addr_t pt_base, paddr; + unsigned pmdidx; + + pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd)); + + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) + if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) { + paddr = m2p(pmd[pmdidx].pmd); + pt_base = min(pt_base, paddr); + } + + return pt_base; +} + void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + + xen_pt_base = xen_find_pt_base(kernel_pmd); + xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; + initial_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + - xen_start_info->nr_pt_frames * PAGE_SIZE + - 512*1024); + max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024); - kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); copy_page(initial_kernel_pmd, kernel_pmd); xen_map_identity_early(initial_kernel_pmd, max_pfn); @@ -1968,11 +2293,33 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) PFN_DOWN(__pa(initial_page_table))); xen_write_cr3(__pa(initial_page_table)); - memblock_reserve(__pa(xen_start_info->pt_base), - xen_start_info->nr_pt_frames * PAGE_SIZE); + memblock_reserve(xen_pt_base, xen_pt_size); } #endif /* CONFIG_X86_64 */ +void __init xen_reserve_special_pages(void) +{ + phys_addr_t paddr; + + memblock_reserve(__pa(xen_start_info), PAGE_SIZE); + if (xen_start_info->store_mfn) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } + if (!xen_initial_domain()) { + paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)); + memblock_reserve(paddr, PAGE_SIZE); + } +} + +void __init xen_pt_check_e820(void) +{ + if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { + xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); + BUG(); + } +} + static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 8b7f18e200aa..bfc08b13044b 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -79,10 +79,14 @@ #include <xen/balloon.h> #include <xen/grant_table.h> -#include "p2m.h" #include "multicalls.h" #include "xen-ops.h" +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) + +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) + #define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE) unsigned long *xen_p2m_addr __read_mostly; @@ -199,7 +203,8 @@ void __ref xen_build_mfn_list_list(void) unsigned int level, topidx, mididx; unsigned long *mid_mfn_p; - if (xen_feature(XENFEAT_auto_translated_physmap)) + if (xen_feature(XENFEAT_auto_translated_physmap) || + xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) return; /* Pre-initialize p2m_top_mfn to be completely missing */ @@ -260,9 +265,16 @@ void xen_setup_mfn_list_list(void) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn); + if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS) + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL; + else + HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = + virt_to_mfn(p2m_top_mfn); HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; + 
HYPERVISOR_shared_info->arch.p2m_generation = 0; + HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr; + HYPERVISOR_shared_info->arch.p2m_cr3 = + xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -478,8 +490,12 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) ptechk = lookup_address(vaddr, &level); if (ptechk == pte_pg) { + HYPERVISOR_shared_info->arch.p2m_generation++; + wmb(); /* Tools are synchronizing via p2m_generation. */ set_pmd(pmdp, __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE)); + wmb(); /* Tools are synchronizing via p2m_generation. */ + HYPERVISOR_shared_info->arch.p2m_generation++; pte_newpg[i] = NULL; } @@ -505,7 +521,7 @@ static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg) */ static bool alloc_p2m(unsigned long pfn) { - unsigned topidx, mididx; + unsigned topidx; unsigned long *top_mfn_p, *mid_mfn; pte_t *ptep, *pte_pg; unsigned int level; @@ -513,9 +529,6 @@ static bool alloc_p2m(unsigned long pfn) unsigned long addr = (unsigned long)(xen_p2m_addr + pfn); unsigned long p2m_pfn; - topidx = p2m_top_index(pfn); - mididx = p2m_mid_index(pfn); - ptep = lookup_address(addr, &level); BUG_ON(!ptep || level != PG_LEVEL_4K); pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1)); @@ -527,7 +540,8 @@ static bool alloc_p2m(unsigned long pfn) return false; } - if (p2m_top_mfn) { + if (p2m_top_mfn && pfn < MAX_P2M_PFN) { + topidx = p2m_top_index(pfn); top_mfn_p = &p2m_top_mfn[topidx]; mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); @@ -577,10 +591,14 @@ static bool alloc_p2m(unsigned long pfn) spin_lock_irqsave(&p2m_update_lock, flags); if (pte_pfn(*ptep) == p2m_pfn) { + HYPERVISOR_shared_info->arch.p2m_generation++; + wmb(); /* Tools are synchronizing via p2m_generation. */ set_pte(ptep, pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL)); + wmb(); /* Tools are synchronizing via p2m_generation. */ + HYPERVISOR_shared_info->arch.p2m_generation++; if (mid_mfn) - mid_mfn[mididx] = virt_to_mfn(p2m); + mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m); p2m = NULL; } @@ -630,6 +648,11 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) return true; } + /* + * The interface requires atomic updates on p2m elements. + * xen_safe_write_ulong() is using __put_user which does an atomic + * store via asm(). 
+ */ if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn))) return true; diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h deleted file mode 100644 index ad8aee24ab72..000000000000 --- a/arch/x86/xen/p2m.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _XEN_P2M_H -#define _XEN_P2M_H - -#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) -#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - -#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - -#define MAX_REMAP_RANGES 10 - -extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, - unsigned long pfn_e); - -#endif /* _XEN_P2M_H */ diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index a8261716d58d..9586ff32810c 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -68,7 +68,7 @@ static int check_platform_magic(void) return 0; } -bool xen_has_pv_devices() +bool xen_has_pv_devices(void) { if (!xen_domain()) return false; diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c new file mode 100644 index 000000000000..724a08740a04 --- /dev/null +++ b/arch/x86/xen/pmu.c @@ -0,0 +1,570 @@ +#include <linux/types.h> +#include <linux/interrupt.h> + +#include <asm/xen/hypercall.h> +#include <xen/page.h> +#include <xen/interface/xen.h> +#include <xen/interface/vcpu.h> +#include <xen/interface/xenpmu.h> + +#include "xen-ops.h" +#include "pmu.h" + +/* x86_pmu.handle_irq definition */ +#include "../kernel/cpu/perf_event.h" + +#define XENPMU_IRQ_PROCESSING 1 +struct xenpmu { + /* Shared page between hypervisor and domain */ + struct xen_pmu_data *xenpmu_data; + + uint8_t flags; +}; +static DEFINE_PER_CPU(struct xenpmu, xenpmu_shared); +#define get_xenpmu_data() (this_cpu_ptr(&xenpmu_shared)->xenpmu_data) +#define get_xenpmu_flags() (this_cpu_ptr(&xenpmu_shared)->flags) + +/* Macro for computing address of a PMU MSR bank */ +#define field_offset(ctxt, field) ((void *)((uintptr_t)ctxt + \ + (uintptr_t)ctxt->field)) + +/* AMD PMU */ +#define F15H_NUM_COUNTERS 6 +#define F10H_NUM_COUNTERS 4 + +static __read_mostly uint32_t amd_counters_base; +static __read_mostly uint32_t amd_ctrls_base; +static __read_mostly int amd_msr_step; +static __read_mostly int k7_counters_mirrored; +static __read_mostly int amd_num_counters; + +/* Intel PMU */ +#define MSR_TYPE_COUNTER 0 +#define MSR_TYPE_CTRL 1 +#define MSR_TYPE_GLOBAL 2 +#define MSR_TYPE_ARCH_COUNTER 3 +#define MSR_TYPE_ARCH_CTRL 4 + +/* Number of general pmu registers (CPUID.EAX[0xa].EAX[8..15]) */ +#define PMU_GENERAL_NR_SHIFT 8 +#define PMU_GENERAL_NR_BITS 8 +#define PMU_GENERAL_NR_MASK (((1 << PMU_GENERAL_NR_BITS) - 1) \ + << PMU_GENERAL_NR_SHIFT) + +/* Number of fixed pmu registers (CPUID.EDX[0xa].EDX[0..4]) */ +#define PMU_FIXED_NR_SHIFT 0 +#define PMU_FIXED_NR_BITS 5 +#define PMU_FIXED_NR_MASK (((1 << PMU_FIXED_NR_BITS) - 1) \ + << PMU_FIXED_NR_SHIFT) + +/* Alias registers (0x4c1) for full-width writes to PMCs */ +#define MSR_PMC_ALIAS_MASK (~(MSR_IA32_PERFCTR0 ^ MSR_IA32_PMC0)) + +#define INTEL_PMC_TYPE_SHIFT 30 + +static __read_mostly int intel_num_arch_counters, intel_num_fixed_counters; + + +static void xen_pmu_arch_init(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + + switch (boot_cpu_data.x86) { + case 0x15: + amd_num_counters = F15H_NUM_COUNTERS; + amd_counters_base = MSR_F15H_PERF_CTR; + amd_ctrls_base = MSR_F15H_PERF_CTL; + amd_msr_step = 2; + k7_counters_mirrored = 1; + break; + case 0x10: + 
case 0x12: + case 0x14: + case 0x16: + default: + amd_num_counters = F10H_NUM_COUNTERS; + amd_counters_base = MSR_K7_PERFCTR0; + amd_ctrls_base = MSR_K7_EVNTSEL0; + amd_msr_step = 1; + k7_counters_mirrored = 0; + break; + } + } else { + uint32_t eax, ebx, ecx, edx; + + cpuid(0xa, &eax, &ebx, &ecx, &edx); + + intel_num_arch_counters = (eax & PMU_GENERAL_NR_MASK) >> + PMU_GENERAL_NR_SHIFT; + intel_num_fixed_counters = (edx & PMU_FIXED_NR_MASK) >> + PMU_FIXED_NR_SHIFT; + } +} + +static inline uint32_t get_fam15h_addr(u32 addr) +{ + switch (addr) { + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: + return MSR_F15H_PERF_CTR + (addr - MSR_K7_PERFCTR0); + case MSR_K7_EVNTSEL0: + case MSR_K7_EVNTSEL1: + case MSR_K7_EVNTSEL2: + case MSR_K7_EVNTSEL3: + return MSR_F15H_PERF_CTL + (addr - MSR_K7_EVNTSEL0); + default: + break; + } + + return addr; +} + +static inline bool is_amd_pmu_msr(unsigned int msr) +{ + if ((msr >= MSR_F15H_PERF_CTL && + msr < MSR_F15H_PERF_CTR + (amd_num_counters * 2)) || + (msr >= MSR_K7_EVNTSEL0 && + msr < MSR_K7_PERFCTR0 + amd_num_counters)) + return true; + + return false; +} + +static int is_intel_pmu_msr(u32 msr_index, int *type, int *index) +{ + u32 msr_index_pmc; + + switch (msr_index) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_IA32_DS_AREA: + case MSR_IA32_PEBS_ENABLE: + *type = MSR_TYPE_CTRL; + return true; + + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *type = MSR_TYPE_GLOBAL; + return true; + + default: + + if ((msr_index >= MSR_CORE_PERF_FIXED_CTR0) && + (msr_index < MSR_CORE_PERF_FIXED_CTR0 + + intel_num_fixed_counters)) { + *index = msr_index - MSR_CORE_PERF_FIXED_CTR0; + *type = MSR_TYPE_COUNTER; + return true; + } + + if ((msr_index >= MSR_P6_EVNTSEL0) && + (msr_index < MSR_P6_EVNTSEL0 + intel_num_arch_counters)) { + *index = msr_index - MSR_P6_EVNTSEL0; + *type = MSR_TYPE_ARCH_CTRL; + return true; + } + + msr_index_pmc = msr_index & MSR_PMC_ALIAS_MASK; + if ((msr_index_pmc >= MSR_IA32_PERFCTR0) && + (msr_index_pmc < MSR_IA32_PERFCTR0 + + intel_num_arch_counters)) { + *type = MSR_TYPE_ARCH_COUNTER; + *index = msr_index_pmc - MSR_IA32_PERFCTR0; + return true; + } + return false; + } +} + +static bool xen_intel_pmu_emulate(unsigned int msr, u64 *val, int type, + int index, bool is_read) +{ + uint64_t *reg = NULL; + struct xen_pmu_intel_ctxt *ctxt; + uint64_t *fix_counters; + struct xen_pmu_cntr_pair *arch_cntr_pair; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) + return false; + + ctxt = &xenpmu_data->pmu.c.intel; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + reg = &ctxt->global_ovf_ctrl; + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + reg = &ctxt->global_status; + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + reg = &ctxt->global_ctrl; + break; + case MSR_CORE_PERF_FIXED_CTR_CTRL: + reg = &ctxt->fixed_ctrl; + break; + default: + switch (type) { + case MSR_TYPE_COUNTER: + fix_counters = field_offset(ctxt, fixed_counters); + reg = &fix_counters[index]; + break; + case MSR_TYPE_ARCH_COUNTER: + arch_cntr_pair = field_offset(ctxt, arch_counters); + reg = &arch_cntr_pair[index].counter; + break; + case MSR_TYPE_ARCH_CTRL: + arch_cntr_pair = field_offset(ctxt, arch_counters); + reg = &arch_cntr_pair[index].control; + break; + default: + return false; + } + } + + if (reg) { + if (is_read) + *val = *reg; + else { + *reg = *val; 
+ + if (msr == MSR_CORE_PERF_GLOBAL_OVF_CTRL) + ctxt->global_status &= (~(*val)); + } + return true; + } + + return false; +} + +static bool xen_amd_pmu_emulate(unsigned int msr, u64 *val, bool is_read) +{ + uint64_t *reg = NULL; + int i, off = 0; + struct xen_pmu_amd_ctxt *ctxt; + uint64_t *counter_regs, *ctrl_regs; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) + return false; + + if (k7_counters_mirrored && + ((msr >= MSR_K7_EVNTSEL0) && (msr <= MSR_K7_PERFCTR3))) + msr = get_fam15h_addr(msr); + + ctxt = &xenpmu_data->pmu.c.amd; + for (i = 0; i < amd_num_counters; i++) { + if (msr == amd_ctrls_base + off) { + ctrl_regs = field_offset(ctxt, ctrls); + reg = &ctrl_regs[i]; + break; + } else if (msr == amd_counters_base + off) { + counter_regs = field_offset(ctxt, counters); + reg = &counter_regs[i]; + break; + } + off += amd_msr_step; + } + + if (reg) { + if (is_read) + *val = *reg; + else + *reg = *val; + + return true; + } + return false; +} + +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + if (!xen_amd_pmu_emulate(msr, val, 1)) + *val = native_read_msr_safe(msr, err); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + if (!xen_intel_pmu_emulate(msr, val, type, index, 1)) + *val = native_read_msr_safe(msr, err); + return true; + } + } + + return false; +} + +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err) +{ + uint64_t val = ((uint64_t)high << 32) | low; + + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + if (is_amd_pmu_msr(msr)) { + if (!xen_amd_pmu_emulate(msr, &val, 0)) + *err = native_write_msr_safe(msr, low, high); + return true; + } + } else { + int type, index; + + if (is_intel_pmu_msr(msr, &type, &index)) { + if (!xen_intel_pmu_emulate(msr, &val, type, index, 0)) + *err = native_write_msr_safe(msr, low, high); + return true; + } + } + + return false; +} + +static unsigned long long xen_amd_read_pmc(int counter) +{ + struct xen_pmu_amd_ctxt *ctxt; + uint64_t *counter_regs; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { + uint32_t msr; + int err; + + msr = amd_counters_base + (counter * amd_msr_step); + return native_read_msr_safe(msr, &err); + } + + ctxt = &xenpmu_data->pmu.c.amd; + counter_regs = field_offset(ctxt, counters); + return counter_regs[counter]; +} + +static unsigned long long xen_intel_read_pmc(int counter) +{ + struct xen_pmu_intel_ctxt *ctxt; + uint64_t *fixed_counters; + struct xen_pmu_cntr_pair *arch_cntr_pair; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data || !(xenpmu_flags & XENPMU_IRQ_PROCESSING)) { + uint32_t msr; + int err; + + if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) + msr = MSR_CORE_PERF_FIXED_CTR0 + (counter & 0xffff); + else + msr = MSR_IA32_PERFCTR0 + counter; + + return native_read_msr_safe(msr, &err); + } + + ctxt = &xenpmu_data->pmu.c.intel; + if (counter & (1 << INTEL_PMC_TYPE_SHIFT)) { + fixed_counters = field_offset(ctxt, fixed_counters); + return fixed_counters[counter & 0xffff]; + } + + arch_cntr_pair = field_offset(ctxt, arch_counters); + return arch_cntr_pair[counter].counter; +} + +unsigned long long xen_read_pmc(int counter) +{ + if 
(boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return xen_amd_read_pmc(counter); + else + return xen_intel_read_pmc(counter); +} + +int pmu_apic_update(uint32_t val) +{ + int ret; + struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return -EINVAL; + } + + xenpmu_data->pmu.l.lapic_lvtpc = val; + + if (get_xenpmu_flags() & XENPMU_IRQ_PROCESSING) + return 0; + + ret = HYPERVISOR_xenpmu_op(XENPMU_lvtpc_set, NULL); + + return ret; +} + +/* perf callbacks */ +static int xen_is_in_guest(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (!xen_initial_domain() || (xenpmu_data->domain_id >= DOMID_SELF)) + return 0; + + return 1; +} + +static int xen_is_user_mode(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + if (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_PV) + return (xenpmu_data->pmu.pmu_flags & PMU_SAMPLE_USER); + else + return !!(xenpmu_data->pmu.r.regs.cpl & 3); +} + +static unsigned long xen_get_guest_ip(void) +{ + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return 0; + } + + return xenpmu_data->pmu.r.regs.ip; +} + +static struct perf_guest_info_callbacks xen_guest_cbs = { + .is_in_guest = xen_is_in_guest, + .is_user_mode = xen_is_user_mode, + .get_guest_ip = xen_get_guest_ip, +}; + +/* Convert registers from Xen's format to Linux' */ +static void xen_convert_regs(const struct xen_pmu_regs *xen_regs, + struct pt_regs *regs, uint64_t pmu_flags) +{ + regs->ip = xen_regs->ip; + regs->cs = xen_regs->cs; + regs->sp = xen_regs->sp; + + if (pmu_flags & PMU_SAMPLE_PV) { + if (pmu_flags & PMU_SAMPLE_USER) + regs->cs |= 3; + else + regs->cs &= ~3; + } else { + if (xen_regs->cpl) + regs->cs |= 3; + else + regs->cs &= ~3; + } +} + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id) +{ + int err, ret = IRQ_NONE; + struct pt_regs regs; + const struct xen_pmu_data *xenpmu_data = get_xenpmu_data(); + uint8_t xenpmu_flags = get_xenpmu_flags(); + + if (!xenpmu_data) { + pr_warn_once("%s: pmudata not initialized\n", __func__); + return ret; + } + + this_cpu_ptr(&xenpmu_shared)->flags = + xenpmu_flags | XENPMU_IRQ_PROCESSING; + xen_convert_regs(&xenpmu_data->pmu.r.regs, ®s, + xenpmu_data->pmu.pmu_flags); + if (x86_pmu.handle_irq(®s)) + ret = IRQ_HANDLED; + + /* Write out cached context to HW */ + err = HYPERVISOR_xenpmu_op(XENPMU_flush, NULL); + this_cpu_ptr(&xenpmu_shared)->flags = xenpmu_flags; + if (err) { + pr_warn_once("%s: failed hypercall, err: %d\n", __func__, err); + return IRQ_NONE; + } + + return ret; +} + +bool is_xen_pmu(int cpu) +{ + return (get_xenpmu_data() != NULL); +} + +void xen_pmu_init(int cpu) +{ + int err; + struct xen_pmu_params xp; + unsigned long pfn; + struct xen_pmu_data *xenpmu_data; + + BUILD_BUG_ON(sizeof(struct xen_pmu_data) > PAGE_SIZE); + + if (xen_hvm_domain()) + return; + + xenpmu_data = (struct xen_pmu_data *)get_zeroed_page(GFP_KERNEL); + if (!xenpmu_data) { + pr_err("VPMU init: No memory\n"); + return; + } + pfn = virt_to_pfn(xenpmu_data); + + xp.val = pfn_to_mfn(pfn); + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + err = HYPERVISOR_xenpmu_op(XENPMU_init, &xp); + if (err) + goto fail; + + 
per_cpu(xenpmu_shared, cpu).xenpmu_data = xenpmu_data; + per_cpu(xenpmu_shared, cpu).flags = 0; + + if (cpu == 0) { + perf_register_guest_info_callbacks(&xen_guest_cbs); + xen_pmu_arch_init(); + } + + return; + +fail: + pr_warn_once("Could not initialize VPMU for cpu %d, error %d\n", + cpu, err); + free_pages((unsigned long)xenpmu_data, 0); +} + +void xen_pmu_finish(int cpu) +{ + struct xen_pmu_params xp; + + if (xen_hvm_domain()) + return; + + xp.vcpu = cpu; + xp.version.maj = XENPMU_VER_MAJ; + xp.version.min = XENPMU_VER_MIN; + + (void)HYPERVISOR_xenpmu_op(XENPMU_finish, &xp); + + free_pages((unsigned long)per_cpu(xenpmu_shared, cpu).xenpmu_data, 0); + per_cpu(xenpmu_shared, cpu).xenpmu_data = NULL; +} diff --git a/arch/x86/xen/pmu.h b/arch/x86/xen/pmu.h new file mode 100644 index 000000000000..af5f0ad94078 --- /dev/null +++ b/arch/x86/xen/pmu.h @@ -0,0 +1,15 @@ +#ifndef __XEN_PMU_H +#define __XEN_PMU_H + +#include <xen/interface/xenpmu.h> + +irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); +void xen_pmu_init(int cpu); +void xen_pmu_finish(int cpu); +bool is_xen_pmu(int cpu); +bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); +bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err); +int pmu_apic_update(uint32_t reg); +unsigned long long xen_read_pmc(int counter); + +#endif /* __XEN_PMU_H */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 55f388ef481a..f5ef6746d47a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -27,17 +27,23 @@ #include <xen/interface/memory.h> #include <xen/interface/physdev.h> #include <xen/features.h> +#include <xen/hvc-console.h> #include "xen-ops.h" #include "vdso.h" -#include "p2m.h" #include "mmu.h" +#define GB(x) ((uint64_t)(x) * 1024 * 1024 * 1024) + /* Amount of extra memory space we add to the e820 ranges */ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; /* Number of pages released from the initial allocation. */ unsigned long xen_released_pages; +/* E820 map used during setting up memory. */ +static struct e820entry xen_e820_map[E820MAX] __initdata; +static u32 xen_e820_map_entries __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -64,62 +70,89 @@ static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; */ #define EXTRA_MEM_RATIO (10) -static void __init xen_add_extra_mem(phys_addr_t start, phys_addr_t size) +static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); + +static void __init xen_parse_512gb(void) +{ + bool val = false; + char *arg; + + arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit"); + if (!arg) + return; + + arg = strstr(xen_start_info->cmd_line, "xen_512gb_limit="); + if (!arg) + val = true; + else if (strtobool(arg + strlen("xen_512gb_limit="), &val)) + return; + + xen_512gb_limit = val; +} + +static void __init xen_add_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; + /* + * No need to check for zero size, should happen rarely and will only + * write a new entry regarded to be unused due to zero size. + */ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { /* Add new region. */ - if (xen_extra_mem[i].size == 0) { - xen_extra_mem[i].start = start; - xen_extra_mem[i].size = size; + if (xen_extra_mem[i].n_pfns == 0) { + xen_extra_mem[i].start_pfn = start_pfn; + xen_extra_mem[i].n_pfns = n_pfns; break; } /* Append to existing region. 
*/ - if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) { - xen_extra_mem[i].size += size; + if (xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns == + start_pfn) { + xen_extra_mem[i].n_pfns += n_pfns; break; } } if (i == XEN_EXTRA_MEM_MAX_REGIONS) printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - memblock_reserve(start, size); + memblock_reserve(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } -static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) +static void __init xen_del_extra_mem(unsigned long start_pfn, + unsigned long n_pfns) { int i; - phys_addr_t start_r, size_r; + unsigned long start_r, size_r; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - start_r = xen_extra_mem[i].start; - size_r = xen_extra_mem[i].size; + start_r = xen_extra_mem[i].start_pfn; + size_r = xen_extra_mem[i].n_pfns; /* Start of region. */ - if (start_r == start) { - BUG_ON(size > size_r); - xen_extra_mem[i].start += size; - xen_extra_mem[i].size -= size; + if (start_r == start_pfn) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].start_pfn += n_pfns; + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* End of region. */ - if (start_r + size_r == start + size) { - BUG_ON(size > size_r); - xen_extra_mem[i].size -= size; + if (start_r + size_r == start_pfn + n_pfns) { + BUG_ON(n_pfns > size_r); + xen_extra_mem[i].n_pfns -= n_pfns; break; } /* Mid of region. */ - if (start > start_r && start < start_r + size_r) { - BUG_ON(start + size > start_r + size_r); - xen_extra_mem[i].size = start - start_r; + if (start_pfn > start_r && start_pfn < start_r + size_r) { + BUG_ON(start_pfn + n_pfns > start_r + size_r); + xen_extra_mem[i].n_pfns = start_pfn - start_r; /* Calling memblock_reserve() again is okay. */ - xen_add_extra_mem(start + size, start_r + size_r - - (start + size)); + xen_add_extra_mem(start_pfn + n_pfns, start_r + size_r - + (start_pfn + n_pfns)); break; } } - memblock_free(start, size); + memblock_free(PFN_PHYS(start_pfn), PFN_PHYS(n_pfns)); } /* @@ -130,11 +163,10 @@ static void __init xen_del_extra_mem(phys_addr_t start, phys_addr_t size) unsigned long __ref xen_chk_extra_mem(unsigned long pfn) { int i; - phys_addr_t addr = PFN_PHYS(pfn); for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (addr >= xen_extra_mem[i].start && - addr < xen_extra_mem[i].start + xen_extra_mem[i].size) + if (pfn >= xen_extra_mem[i].start_pfn && + pfn < xen_extra_mem[i].start_pfn + xen_extra_mem[i].n_pfns) return INVALID_P2M_ENTRY; } @@ -150,10 +182,10 @@ void __init xen_inv_extra_mem(void) int i; for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) { - if (!xen_extra_mem[i].size) + if (!xen_extra_mem[i].n_pfns) continue; - pfn_s = PFN_DOWN(xen_extra_mem[i].start); - pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size); + pfn_s = xen_extra_mem[i].start_pfn; + pfn_e = pfn_s + xen_extra_mem[i].n_pfns; for (pfn = pfn_s; pfn < pfn_e; pfn++) set_phys_to_machine(pfn, INVALID_P2M_ENTRY); } @@ -164,15 +196,13 @@ void __init xen_inv_extra_mem(void) * This function updates min_pfn with the pfn found and returns * the size of that range or zero if not found. 
*/ -static unsigned long __init xen_find_pfn_range( - const struct e820entry *list, size_t map_size, - unsigned long *min_pfn) +static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn) { - const struct e820entry *entry; + const struct e820entry *entry = xen_e820_map; unsigned int i; unsigned long done = 0; - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { unsigned long s_pfn; unsigned long e_pfn; @@ -221,7 +251,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long *released) + unsigned long end_pfn, unsigned long nr_pages) { unsigned long pfn, end; int ret; @@ -241,7 +271,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret); if (ret == 1) { - (*released)++; + xen_released_pages++; if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY)) break; } else @@ -356,9 +386,8 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. */ static unsigned long __init xen_set_identity_and_remap_chunk( - const struct e820entry *list, size_t map_size, unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn, - unsigned long *released, unsigned long *remapped) + unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; @@ -379,12 +408,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( if (cur_pfn + size > nr_pages) size = nr_pages - cur_pfn; - remap_range_size = xen_find_pfn_range(list, map_size, - &remap_pfn); + remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warning("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages, released); + cur_pfn + left, nr_pages); break; } /* Adjust size to fit in current e820 RAM region */ @@ -396,7 +424,6 @@ static unsigned long __init xen_set_identity_and_remap_chunk( /* Update variables to reflect new mappings. */ i += size; remap_pfn += size; - *remapped += size; } /* @@ -411,15 +438,11 @@ static unsigned long __init xen_set_identity_and_remap_chunk( return remap_pfn; } -static void __init xen_set_identity_and_remap( - const struct e820entry *list, size_t map_size, unsigned long nr_pages, - unsigned long *released, unsigned long *remapped) +static void __init xen_set_identity_and_remap(unsigned long nr_pages) { phys_addr_t start = 0; unsigned long last_pfn = nr_pages; - const struct e820entry *entry; - unsigned long num_released = 0; - unsigned long num_remapped = 0; + const struct e820entry *entry = xen_e820_map; int i; /* @@ -433,9 +456,9 @@ static void __init xen_set_identity_and_remap( * example) the DMI tables in a reserved region that begins on * a non-page boundary. 
*/ - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { phys_addr_t end = entry->addr + entry->size; - if (entry->type == E820_RAM || i == map_size - 1) { + if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); @@ -444,17 +467,13 @@ static void __init xen_set_identity_and_remap( if (start_pfn < end_pfn) last_pfn = xen_set_identity_and_remap_chunk( - list, map_size, start_pfn, - end_pfn, nr_pages, last_pfn, - &num_released, &num_remapped); + start_pfn, end_pfn, nr_pages, + last_pfn); start = end; } } - *released = num_released; - *remapped = num_remapped; - - pr_info("Released %ld page(s)\n", num_released); + pr_info("Released %ld page(s)\n", xen_released_pages); } /* @@ -494,7 +513,7 @@ void __init xen_remap_memory(void) } else if (pfn_s + len == xen_remap_buf.target_pfn) { len += xen_remap_buf.size; } else { - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); pfn_s = xen_remap_buf.target_pfn; len = xen_remap_buf.size; } @@ -504,19 +523,36 @@ void __init xen_remap_memory(void) } if (pfn_s != ~0UL && len) - xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len)); + xen_del_extra_mem(pfn_s, len); set_pte_mfn(buf, mfn_save, PAGE_KERNEL); pr_info("Remapped %ld page(s)\n", remapped); } +static unsigned long __init xen_get_pages_limit(void) +{ + unsigned long limit; + +#ifdef CONFIG_X86_32 + limit = GB(64) / PAGE_SIZE; +#else + limit = MAXMEM / PAGE_SIZE; + if (!xen_initial_domain() && xen_512gb_limit) + limit = GB(512) / PAGE_SIZE; +#endif + return limit; +} + static unsigned long __init xen_get_max_pages(void) { - unsigned long max_pages = MAX_DOMAIN_PAGES; + unsigned long max_pages, limit; domid_t domid = DOMID_SELF; int ret; + limit = xen_get_pages_limit(); + max_pages = limit; + /* * For the initial domain we use the maximum reservation as * the maximum page. @@ -532,7 +568,7 @@ static unsigned long __init xen_get_max_pages(void) max_pages = ret; } - return min(max_pages, MAX_DOMAIN_PAGES); + return min(max_pages, limit); } static void __init xen_align_and_add_e820_region(phys_addr_t start, @@ -549,39 +585,188 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start, e820_add_region(start, end - start, type); } -static void __init xen_ignore_unusable(struct e820entry *list, size_t map_size) +static void __init xen_ignore_unusable(void) { - struct e820entry *entry; + struct e820entry *entry = xen_e820_map; unsigned int i; - for (i = 0, entry = list; i < map_size; i++, entry++) { + for (i = 0; i < xen_e820_map_entries; i++, entry++) { if (entry->type == E820_UNUSABLE) entry->type = E820_RAM; } } +static unsigned long __init xen_count_remap_pages(unsigned long max_pfn) +{ + unsigned long extra = 0; + unsigned long start_pfn, end_pfn; + const struct e820entry *entry = xen_e820_map; + int i; + + end_pfn = 0; + for (i = 0; i < xen_e820_map_entries; i++, entry++) { + start_pfn = PFN_DOWN(entry->addr); + /* Adjacent regions on non-page boundaries handling! */ + end_pfn = min(end_pfn, start_pfn); + + if (start_pfn >= max_pfn) + return extra + max_pfn - end_pfn; + + /* Add any holes in map to result. 
*/ + extra += start_pfn - end_pfn; + + end_pfn = PFN_UP(entry->addr + entry->size); + end_pfn = min(end_pfn, max_pfn); + + if (entry->type != E820_RAM) + extra += end_pfn - start_pfn; + } + + return extra; +} + +bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +{ + struct e820entry *entry; + unsigned mapcnt; + phys_addr_t end; + + if (!size) + return false; + + end = start + size; + entry = xen_e820_map; + + for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) { + if (entry->type == E820_RAM && entry->addr <= start && + (entry->addr + entry->size) >= end) + return false; + + entry++; + } + + return true; +} + +/* + * Find a free area in physical memory not yet reserved and compliant with + * E820 map. + * Used to relocate pre-allocated areas like initrd or p2m list which are in + * conflict with the to be used E820 map. + * In case no area is found, return 0. Otherwise return the physical address + * of the area which is already reserved for convenience. + */ +phys_addr_t __init xen_find_free_area(phys_addr_t size) +{ + unsigned mapcnt; + phys_addr_t addr, start; + struct e820entry *entry = xen_e820_map; + + for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) { + if (entry->type != E820_RAM || entry->size < size) + continue; + start = entry->addr; + for (addr = start; addr < start + size; addr += PAGE_SIZE) { + if (!memblock_is_reserved(addr)) + continue; + start = addr + PAGE_SIZE; + if (start + size > entry->addr + entry->size) + break; + } + if (addr >= start + size) { + memblock_reserve(start, size); + return start; + } + } + + return 0; +} + +/* + * Like memcpy, but with physical addresses for dest and src. + */ +static void __init xen_phys_memcpy(phys_addr_t dest, phys_addr_t src, + phys_addr_t n) +{ + phys_addr_t dest_off, src_off, dest_len, src_len, len; + void *from, *to; + + while (n) { + dest_off = dest & ~PAGE_MASK; + src_off = src & ~PAGE_MASK; + dest_len = n; + if (dest_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off) + dest_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - dest_off; + src_len = n; + if (src_len > (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off) + src_len = (NR_FIX_BTMAPS << PAGE_SHIFT) - src_off; + len = min(dest_len, src_len); + to = early_memremap(dest - dest_off, dest_len + dest_off); + from = early_memremap(src - src_off, src_len + src_off); + memcpy(to, from, len); + early_memunmap(to, dest_len + dest_off); + early_memunmap(from, src_len + src_off); + n -= len; + dest += len; + src += len; + } +} + +/* + * Reserve Xen mfn_list. + */ +static void __init xen_reserve_xen_mfnlist(void) +{ + phys_addr_t start, size; + + if (xen_start_info->mfn_list >= __START_KERNEL_map) { + start = __pa(xen_start_info->mfn_list); + size = PFN_ALIGN(xen_start_info->nr_pages * + sizeof(unsigned long)); + } else { + start = PFN_PHYS(xen_start_info->first_p2m_pfn); + size = PFN_PHYS(xen_start_info->nr_p2m_frames); + } + + if (!xen_is_e820_reserved(start, size)) { + memblock_reserve(start, size); + return; + } + +#ifdef CONFIG_X86_32 + /* + * Relocating the p2m on 32 bit system to an arbitrary virtual address + * is not supported, so just give up. + */ + xen_raw_console_write("Xen hypervisor allocated p2m list conflicts with E820 map\n"); + BUG(); +#else + xen_relocate_p2m(); +#endif +} + /** * machine_specific_memory_setup - Hook for machine specific memory setup. 
**/ char * __init xen_memory_setup(void) { - static struct e820entry map[E820MAX] __initdata; - - unsigned long max_pfn = xen_start_info->nr_pages; - phys_addr_t mem_end; + unsigned long max_pfn, pfn_s, n_pfns; + phys_addr_t mem_end, addr, size, chunk_size; + u32 type; int rc; struct xen_memory_map memmap; unsigned long max_pages; unsigned long extra_pages = 0; - unsigned long remapped_pages; int i; int op; - max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + xen_parse_512gb(); + max_pfn = xen_get_pages_limit(); + max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, map); + set_xen_guest_handle(memmap.buffer, xen_e820_map); op = xen_initial_domain() ? XENMEM_machine_memory_map : @@ -590,15 +775,16 @@ char * __init xen_memory_setup(void) if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; - map[0].addr = 0ULL; - map[0].size = mem_end; + xen_e820_map[0].addr = 0ULL; + xen_e820_map[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ - map[0].size += 8ULL << 20; - map[0].type = E820_RAM; + xen_e820_map[0].size += 8ULL << 20; + xen_e820_map[0].type = E820_RAM; rc = 0; } BUG_ON(rc); BUG_ON(memmap.nr_entries == 0); + xen_e820_map_entries = memmap.nr_entries; /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE @@ -609,24 +795,19 @@ char * __init xen_memory_setup(void) * a patch in the future. */ if (xen_initial_domain()) - xen_ignore_unusable(map, memmap.nr_entries); + xen_ignore_unusable(); /* Make sure the Xen-supplied memory map is well-ordered. */ - sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries); + sanitize_e820_map(xen_e820_map, xen_e820_map_entries, + &xen_e820_map_entries); max_pages = xen_get_max_pages(); - if (max_pages > max_pfn) - extra_pages += max_pages - max_pfn; - /* - * Set identity map on non-RAM pages and prepare remapping the - * underlying RAM. - */ - xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn, - &xen_released_pages, &remapped_pages); + /* How many extra pages do we need due to remapping? */ + max_pages += xen_count_remap_pages(max_pfn); - extra_pages += xen_released_pages; - extra_pages += remapped_pages; + if (max_pages > max_pfn) + extra_pages += max_pages - max_pfn; /* * Clamp the amount of extra memory to a EXTRA_MEM_RATIO @@ -635,46 +816,54 @@ char * __init xen_memory_setup(void) * is limited to the max size of lowmem, so that it doesn't * get completely filled. * + * Make sure we have no memory above max_pages, as this area + * isn't handled by the p2m management. + * * In principle there could be a problem in lowmem systems if * the initial memory is also very large with respect to * lowmem, but we won't try to deal with that here. 
*/ - extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), - extra_pages); + extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + extra_pages, max_pages - max_pfn); i = 0; - while (i < memmap.nr_entries) { - phys_addr_t addr = map[i].addr; - phys_addr_t size = map[i].size; - u32 type = map[i].type; + addr = xen_e820_map[0].addr; + size = xen_e820_map[0].size; + while (i < xen_e820_map_entries) { + chunk_size = size; + type = xen_e820_map[i].type; if (type == E820_RAM) { if (addr < mem_end) { - size = min(size, mem_end - addr); + chunk_size = min(size, mem_end - addr); } else if (extra_pages) { - size = min(size, PFN_PHYS(extra_pages)); - extra_pages -= PFN_DOWN(size); - xen_add_extra_mem(addr, size); - xen_max_p2m_pfn = PFN_DOWN(addr + size); + chunk_size = min(size, PFN_PHYS(extra_pages)); + pfn_s = PFN_UP(addr); + n_pfns = PFN_DOWN(addr + chunk_size) - pfn_s; + extra_pages -= n_pfns; + xen_add_extra_mem(pfn_s, n_pfns); + xen_max_p2m_pfn = pfn_s + n_pfns; } else type = E820_UNUSABLE; } - xen_align_and_add_e820_region(addr, size, type); + xen_align_and_add_e820_region(addr, chunk_size, type); - map[i].addr += size; - map[i].size -= size; - if (map[i].size == 0) + addr += chunk_size; + size -= chunk_size; + if (size == 0) { i++; + if (i < xen_e820_map_entries) { + addr = xen_e820_map[i].addr; + size = xen_e820_map[i].size; + } + } } /* * Set the rest as identity mapped, in case PCI BARs are * located here. - * - * PFNs above MAX_P2M_PFN are considered identity mapped as - * well. */ - set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul); + set_phys_range_identity(addr / PAGE_SIZE, ~0ul); /* * In domU, the ISA region is normal, usable memory, but we @@ -684,34 +873,53 @@ char * __init xen_memory_setup(void) e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + /* - * Reserve Xen bits: - * - mfn_list - * - xen_start_info - * See comment above "struct start_info" in <xen/interface/xen.h> - * We tried to make the the memblock_reserve more selective so - * that it would be clear what region is reserved. Sadly we ran - * in the problem wherein on a 64-bit hypervisor with a 32-bit - * initial domain, the pt_base has the cr3 value which is not - * neccessarily where the pagetable starts! As Jan put it: " - * Actually, the adjustment turns out to be correct: The page - * tables for a 32-on-64 dom0 get allocated in the order "first L1", - * "first L2", "first L3", so the offset to the page table base is - * indeed 2. When reading xen/include/public/xen.h's comment - * very strictly, this is not a violation (since there nothing is said - * that the first thing in the page table space is pointed to by - * pt_base; I admit that this seems to be implied though, namely - * do I think that it is implied that the page table space is the - * range [pt_base, pt_base + nt_pt_frames), whereas that - * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames), - * which - without a priori knowledge - the kernel would have - * difficulty to figure out)." - so lets just fall back to the - * easy way and reserve the whole region. + * Check whether the kernel itself conflicts with the target E820 map. + * Failing now is better than running into weird problems later due + * to relocating (and even reusing) pages with kernel text or data. 
*/ - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); + if (xen_is_e820_reserved(__pa_symbol(_text), + __pa_symbol(__bss_stop) - __pa_symbol(_text))) { + xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); + BUG(); + } - sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + /* + * Check for a conflict of the hypervisor supplied page tables with + * the target E820 map. + */ + xen_pt_check_e820(); + + xen_reserve_xen_mfnlist(); + + /* Check for a conflict of the initrd with the target E820 map. */ + if (xen_is_e820_reserved(boot_params.hdr.ramdisk_image, + boot_params.hdr.ramdisk_size)) { + phys_addr_t new_area, start, size; + + new_area = xen_find_free_area(boot_params.hdr.ramdisk_size); + if (!new_area) { + xen_raw_console_write("Can't find new memory area for initrd needed due to E820 map conflict\n"); + BUG(); + } + + start = boot_params.hdr.ramdisk_image; + size = boot_params.hdr.ramdisk_size; + xen_phys_memcpy(new_area, start, size); + pr_info("initrd moved from [mem %#010llx-%#010llx] to [mem %#010llx-%#010llx]\n", + start, start + size, new_area, new_area + size); + memblock_free(start, size); + boot_params.hdr.ramdisk_image = new_area; + boot_params.ext_ramdisk_image = new_area >> 32; + } + + /* + * Set identity map on non-RAM pages and prepare remapping the + * underlying RAM. + */ + xen_set_identity_and_remap(max_pfn); return "Xen"; } @@ -721,26 +929,30 @@ char * __init xen_memory_setup(void) */ char * __init xen_auto_xlated_memory_setup(void) { - static struct e820entry map[E820MAX] __initdata; - struct xen_memory_map memmap; int i; int rc; memmap.nr_entries = E820MAX; - set_xen_guest_handle(memmap.buffer, map); + set_xen_guest_handle(memmap.buffer, xen_e820_map); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); if (rc < 0) panic("No memory map (%d)\n", rc); - sanitize_e820_map(map, ARRAY_SIZE(map), &memmap.nr_entries); + xen_e820_map_entries = memmap.nr_entries; + + sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), + &xen_e820_map_entries); - for (i = 0; i < memmap.nr_entries; i++) - e820_add_region(map[i].addr, map[i].size, map[i].type); + for (i = 0; i < xen_e820_map_entries; i++) + e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size, + xen_e820_map[i].type); - memblock_reserve(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list); + /* Remove p2m info, it is not needed. 
*/ + xen_start_info->mfn_list = 0; + xen_start_info->first_p2m_pfn = 0; + xen_start_info->nr_p2m_frames = 0; return "Xen"; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 86484384492e..2a9ff7342791 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -26,6 +26,7 @@ #include <xen/interface/xen.h> #include <xen/interface/vcpu.h> +#include <xen/interface/xenpmu.h> #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> @@ -38,6 +39,7 @@ #include "xen-ops.h" #include "mmu.h" #include "smp.h" +#include "pmu.h" cpumask_var_t xen_cpu_initialized_map; @@ -50,6 +52,7 @@ static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; +static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); @@ -148,11 +151,18 @@ static void xen_smp_intr_free(unsigned int cpu) kfree(per_cpu(xen_irq_work, cpu).name); per_cpu(xen_irq_work, cpu).name = NULL; } + + if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { + unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); + per_cpu(xen_pmu_irq, cpu).irq = -1; + kfree(per_cpu(xen_pmu_irq, cpu).name); + per_cpu(xen_pmu_irq, cpu).name = NULL; + } }; static int xen_smp_intr_init(unsigned int cpu) { int rc; - char *resched_name, *callfunc_name, *debug_name; + char *resched_name, *callfunc_name, *debug_name, *pmu_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -218,6 +228,18 @@ static int xen_smp_intr_init(unsigned int cpu) per_cpu(xen_irq_work, cpu).irq = rc; per_cpu(xen_irq_work, cpu).name = callfunc_name; + if (is_xen_pmu(cpu)) { + pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, + xen_pmu_irq_handler, + IRQF_PERCPU|IRQF_NOBALANCING, + pmu_name, NULL); + if (rc < 0) + goto fail; + per_cpu(xen_pmu_irq, cpu).irq = rc; + per_cpu(xen_pmu_irq, cpu).name = pmu_name; + } + return 0; fail: @@ -335,6 +357,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) } set_cpu_sibling_map(0); + xen_pmu_init(0); + if (xen_smp_intr_init(0)) BUG(); @@ -462,6 +486,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) if (rc) return rc; + xen_pmu_init(cpu); + rc = xen_smp_intr_init(cpu); if (rc) return rc; @@ -503,6 +529,7 @@ static void xen_cpu_die(unsigned int cpu) xen_smp_intr_free(cpu); xen_uninit_lock_cpu(cpu); xen_teardown_timer(cpu); + xen_pmu_finish(cpu); } } diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 53b4c0811f4f..feddabdab448 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -11,6 +11,7 @@ #include "xen-ops.h" #include "mmu.h" +#include "pmu.h" static void xen_pv_pre_suspend(void) { @@ -67,16 +68,26 @@ static void xen_pv_post_suspend(int suspend_cancelled) void xen_arch_pre_suspend(void) { - if (xen_pv_domain()) - xen_pv_pre_suspend(); + int cpu; + + for_each_online_cpu(cpu) + xen_pmu_finish(cpu); + + if (xen_pv_domain()) + xen_pv_pre_suspend(); } void xen_arch_post_suspend(int cancelled) { - if (xen_pv_domain()) - xen_pv_post_suspend(cancelled); - else - xen_hvm_post_suspend(cancelled); + int cpu; + + if (xen_pv_domain()) + xen_pv_post_suspend(cancelled); + else + 
xen_hvm_post_suspend(cancelled); + + for_each_online_cpu(cpu) + xen_pmu_init(cpu); } static void xen_vcpu_notify_restore(void *data) diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 8afdfccf6086..b65f59a358a2 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -104,6 +104,8 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) #else ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) + /* Map the p2m table to a 512GB-aligned user address. */ + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) #endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 2292721b1d10..1399423f3418 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -35,13 +35,20 @@ void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_reserve_top(void); +void __init xen_reserve_special_pages(void); +void __init xen_pt_check_e820(void); void xen_mm_pin_all(void); void xen_mm_unpin_all(void); +#ifdef CONFIG_X86_64 +void __init xen_relocate_p2m(void); +#endif +bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); +phys_addr_t __init xen_find_free_area(phys_addr_t size); char * __init xen_memory_setup(void); char * xen_auto_xlated_memory_setup(void); void __init xen_arch_setup(void); diff --git a/arch/xtensa/include/asm/io.h b/arch/xtensa/include/asm/io.h index c39bb6e61911..867840f5400f 100644 --- a/arch/xtensa/include/asm/io.h +++ b/arch/xtensa/include/asm/io.h @@ -57,6 +57,7 @@ static inline void __iomem *ioremap_cache(unsigned long offset, else BUG(); } +#define ioremap_cache ioremap_cache #define ioremap_wc ioremap_nocache #define ioremap_wt ioremap_nocache
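
The new xen_is_e820_reserved() helper in the setup.c changes above treats a [start, start + size) range as conflicting unless one single E820_RAM entry covers it completely; the kernel image, the hypervisor-built page tables, the mfn list and the initrd are all checked against the target map this way before any relocation is attempted. Below is a minimal, self-contained user-space sketch of that containment test; the struct layout, the type constants and the sample map are assumptions of the sketch, not kernel definitions.

/*
 * Minimal model of the containment test done by xen_is_e820_reserved():
 * a [start, start + size) range is usable only if a single RAM entry of
 * the map covers it completely.  Struct, constants and sample map are
 * illustrative assumptions, not kernel code.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RAM      1
#define RESERVED 2

struct mem_entry {
    uint64_t addr;
    uint64_t size;
    int type;
};

static bool range_conflicts(const struct mem_entry *map, unsigned int entries,
                            uint64_t start, uint64_t size)
{
    uint64_t end = start + size;
    unsigned int i;

    if (!size)
        return false;

    for (i = 0; i < entries; i++) {
        if (map[i].type == RAM && map[i].addr <= start &&
            map[i].addr + map[i].size >= end)
            return false;   /* fully inside one RAM entry */
    }
    return true;            /* straddles a hole or non-RAM area */
}

int main(void)
{
    const struct mem_entry map[] = {
        { 0x00000000, 0x0009fc00, RAM },
        { 0x000f0000, 0x00010000, RESERVED },
        { 0x00100000, 0x3ff00000, RAM },
    };

    printf("%d\n", range_conflicts(map, 3, 0x00200000, 0x00100000));
    printf("%d\n", range_conflicts(map, 3, 0x000f8000, 0x00010000));
    return 0;
}

Compiled with any C compiler, the example prints 0 for the range that sits inside one RAM entry and 1 for the range that straddles the reserved hole.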
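
xen_find_free_area() above walks a RAM region page by page and, whenever memblock reports a page as already reserved, restarts its candidate window right behind that page; it gives up on the region once the window can no longer fit before the region's end. The sketch below models the same scan against a plain bitmap; the bitmap lookup, the single-region interface and the sample sizes are assumptions made for illustration, not the kernel API.

/*
 * Sketch of the scan in xen_find_free_area(): walk a region page by page,
 * restart the candidate window behind any reserved page, and succeed once
 * "size" consecutive free bytes fit.  The bitmap stands in for
 * memblock_is_reserved() and is an assumption of the sketch.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

static bool page_reserved(const bool *reserved, uint64_t addr)
{
    return reserved[addr / PAGE_SIZE];
}

/* Return the start of a free window of "size" bytes inside the region,
 * or 0 if no such window exists. */
static uint64_t find_free_area(const bool *reserved, uint64_t region_start,
                               uint64_t region_size, uint64_t size)
{
    uint64_t start = region_start;
    uint64_t addr;

    for (addr = start; addr < start + size; addr += PAGE_SIZE) {
        if (!page_reserved(reserved, addr))
            continue;
        start = addr + PAGE_SIZE;                 /* restart behind it */
        if (start + size > region_start + region_size)
            return 0;                             /* window no longer fits */
    }
    return addr >= start + size ? start : 0;
}

int main(void)
{
    bool reserved[16] = { false };

    reserved[2] = reserved[3] = true;             /* pages 2 and 3 are taken */
    printf("%#llx\n", (unsigned long long)
           find_free_area(reserved, 0, 16 * PAGE_SIZE, 4 * PAGE_SIZE));
    return 0;
}

With pages 2 and 3 marked reserved, the scan settles on page 4, so the program prints 0x4000.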
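
xen_phys_memcpy() above is what moves a conflicting initrd: it copies between two physical addresses through the small early fixmap window, so every chunk is limited by whichever of the two temporary mappings is exhausted first. The user-space analog below keeps only that chunking logic; "physical memory" is an ordinary array, map_window() stands in for early_memremap(), and the window size and all names are assumptions of the sketch.

/*
 * User-space analog of the chunked copy in xen_phys_memcpy(): source and
 * destination can only be "mapped" through a window of WINDOW bytes that
 * starts at a page boundary, so the copy advances in chunks bounded by
 * whichever window runs out first.  phys_mem, map_window() and the sizes
 * are assumptions of the sketch, not kernel interfaces.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096u
#define PAGE_MASK (~(PAGE_SIZE - 1u))
#define WINDOW    (2u * PAGE_SIZE)       /* stand-in for the fixmap window */

static unsigned char phys_mem[64 * PAGE_SIZE];   /* fake physical memory */

/* Map at most WINDOW bytes starting at the page containing "phys". */
static unsigned char *map_window(uint32_t phys)
{
    return &phys_mem[phys & PAGE_MASK];
}

static void phys_copy(uint32_t dest, uint32_t src, uint32_t n)
{
    while (n) {
        uint32_t dest_off = dest & ~PAGE_MASK;
        uint32_t src_off = src & ~PAGE_MASK;
        uint32_t len = n;

        if (len > WINDOW - dest_off)
            len = WINDOW - dest_off;
        if (len > WINDOW - src_off)
            len = WINDOW - src_off;

        memcpy(map_window(dest) + dest_off, map_window(src) + src_off, len);
        n -= len;
        dest += len;
        src += len;
    }
}

int main(void)
{
    uint32_t i;

    for (i = 0; i < 9000; i++)
        phys_mem[PAGE_SIZE + i] = (unsigned char)i;

    phys_copy(40 * PAGE_SIZE + 100, PAGE_SIZE, 9000);

    for (i = 0; i < 9000; i++)
        assert(phys_mem[40 * PAGE_SIZE + 100 + i] == (unsigned char)i);
    printf("copy verified\n");
    return 0;
}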
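
Finally, the xtensa io.h hunk adds "#define ioremap_cache ioremap_cache" next to the existing inline implementation. Defining a macro with the same name as the function is the usual way to let a generic header notice, via #ifndef, that the architecture already provides the symbol and skip its fallback. A toy version of the idiom, with made-up names:

/*
 * The "#define foo foo" idiom from the io.h hunk, in miniature: an "arch"
 * header that has its own implementation also defines a macro of the same
 * name, so a "generic" header can detect it with #ifndef and only then
 * supply a fallback.  All names here are made up for the sketch.
 */
#include <stdio.h>

/* arch-specific part */
static inline int arch_readl(void)
{
    return 42;
}
#define arch_readl arch_readl

/* generic part */
#ifndef arch_readl
static inline int arch_readl(void)
{
    return 0;                       /* fallback, skipped in this build */
}
#endif

int main(void)
{
    printf("%d\n", arch_readl());   /* prints 42: the arch version won */
    return 0;
}

Because the arch definition is seen first, the #ifndef fallback is never compiled in and the program prints 42.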