diff options
Diffstat (limited to 'arch/x86')
69 files changed, 930 insertions, 538 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d43e7e1c784b..226d5696e1d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,6 +22,7 @@ config X86_64 ### Arch settings config X86 def_bool y + select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS select ARCH_HAS_FAST_MULTIPLIER @@ -178,7 +179,7 @@ config SBUS config NEED_DMA_MAP_STATE def_bool y - depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG + depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB config NEED_SG_DMA_LENGTH def_bool y @@ -1421,6 +1422,16 @@ config ILLEGAL_POINTER_VALUE source "mm/Kconfig" +config X86_PMEM_LEGACY + bool "Support non-standard NVDIMMs and ADR protected memory" + help + Treat memory marked using the non-standard e820 type of 12 as used + by the Intel Sandy Bridge-EP reference BIOS as protected memory. + The kernel will offer these regions to the 'pmem' driver so + they can be used for persistent storage. + + Say Y if unsure. + config HIGHPTE bool "Allocate 3rd-level pagetables from highmem" depends on HIGHMEM diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 20028da8ae18..72484a645f05 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -43,10 +43,6 @@ config EARLY_PRINTK with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. -config EARLY_PRINTK_INTEL_MID - bool "Early printk for Intel MID platform support" - depends on EARLY_PRINTK && X86_INTEL_MID - config EARLY_PRINTK_DBGP bool "Early printk via EHCI debug port" depends on EARLY_PRINTK && PCI diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index ef17683484e9..48304b89b601 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1109,6 +1109,8 @@ struct boot_params *make_boot_params(struct efi_config *c) if (!cmdline_ptr) goto fail; hdr->cmd_line_ptr = (unsigned long)cmdline_ptr; + /* Fill in upper bits of command line address, NOP on 32 bit */ + boot_params->ext_cmd_line_ptr = (u64)(unsigned long)cmdline_ptr >> 32; hdr->ramdisk_image = 0; hdr->ramdisk_size = 0; diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 89dd0d78013a..805d25ca5f1d 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -2,15 +2,14 @@ #define BOOT_COMPRESSED_MISC_H /* - * we have to be careful, because no indirections are allowed here, and - * paravirt_ops is a kind of one. As it will only run in baremetal anyway, - * we just keep it from happening + * Special hack: we have to be careful, because no indirections are allowed here, + * and paravirt_ops is a kind of one. As it will only run in baremetal anyway, + * we just keep it from happening. (This list needs to be extended when new + * paravirt and debugging variants are added.) */ #undef CONFIG_PARAVIRT +#undef CONFIG_PARAVIRT_SPINLOCKS #undef CONFIG_KASAN -#ifdef CONFIG_X86_32 -#define _ASM_X86_DESC_H 1 -#endif #include <linux/linkage.h> #include <linux/screen_info.h> diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S index a4771dcd1fcf..1f20b35d8573 100644 --- a/arch/x86/crypto/sha512-avx2-asm.S +++ b/arch/x86/crypto/sha512-avx2-asm.S @@ -79,7 +79,7 @@ NUM_BLKS = %rdx c = %rcx d = %r8 e = %rdx -y3 = %rdi +y3 = %rsi TBL = %rbp diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a821b1cd4fa7..72bf2680f819 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -427,6 +427,13 @@ sysretl_from_sys_call: * cs and ss are loaded from MSRs. * (Note: 32bit->32bit SYSRET is different: since r11 * does not exist, it merely sets eflags.IF=1). + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we must + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. */ USERGS_SYSRET32 diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 7ee9b94d9921..3d6606fb97d0 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -265,6 +265,7 @@ #define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ +#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index e42f758a0fbd..055ea9941dd5 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -50,7 +50,7 @@ extern const struct hypervisor_x86 *x86_hyper; /* Recognized hypervisors */ extern const struct hypervisor_x86 x86_hyper_vmware; extern const struct hypervisor_x86 x86_hyper_ms_hyperv; -extern const struct hypervisor_x86 x86_hyper_xen_hvm; +extern const struct hypervisor_x86 x86_hyper_xen; extern const struct hypervisor_x86 x86_hyper_kvm; extern void init_hypervisor(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 705d35708a50..7c5af123bdbd 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -136,9 +136,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options; #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 -extern struct console early_hsu_console; -extern void hsu_early_console_init(const char *); - extern void intel_scu_devices_create(void); extern void intel_scu_devices_destroy(void); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dea2e7e962e3..f4a555beef19 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -207,6 +207,7 @@ union kvm_mmu_page_role { unsigned nxe:1; unsigned cr0_wp:1; unsigned smep_andnot_wp:1; + unsigned smap_andnot_wp:1; }; }; @@ -400,6 +401,7 @@ struct kvm_vcpu_arch { struct kvm_mmu_memory_cache mmu_page_header_cache; struct fpu guest_fpu; + bool eager_fpu; u64 xcr0; u64 guest_supported_xcr0; u32 guest_xstate_size; @@ -743,6 +745,7 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + void (*fpu_activate)(struct kvm_vcpu *vcpu); void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index e2d4a4afa8c3..3bbc07a57a31 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -20,13 +20,10 @@ extern unsigned long switcher_addr; /* Found in switcher.S */ extern unsigned long default_idt_entries[]; -/* Declarations for definitions in lguest_guest.S */ -extern char lguest_noirq_start[], lguest_noirq_end[]; +/* Declarations for definitions in arch/x86/lguest/head_32.S */ +extern char lguest_noirq_iret[]; extern const char lgstart_cli[], lgend_cli[]; -extern const char lgstart_sti[], lgend_sti[]; -extern const char lgstart_popf[], lgend_popf[]; extern const char lgstart_pushf[], lgend_pushf[]; -extern const char lgstart_iret[], lgend_iret[]; extern void lguest_iret(void); extern void lguest_init(void); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 19507ffa5d28..5fabf1362942 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -107,7 +107,7 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) static inline int user_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 - return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL; + return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= USER_RPL; #else return !!(regs->cs & 3); #endif diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 25b1cc07d496..d6b078e9fa28 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; - u32 migrate_count; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5a9856eb12ba..7d5a1929d76b 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -231,11 +231,21 @@ #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) #ifdef __KERNEL__ + +/* + * early_idt_handler_array is an array of entry points referenced in the + * early IDT. For simplicity, it's a real array with one entry point + * every nine bytes. That leaves room for an optional 'push $0' if the + * vector has no error code (two bytes), a 'push $vector_number' (two + * bytes), and a jump to the common entry code (up to five bytes). + */ +#define EARLY_IDT_HANDLER_SIZE 9 + #ifndef __ASSEMBLY__ -extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; +extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; #ifdef CONFIG_TRACING -# define trace_early_idt_handlers early_idt_handlers +# define trace_early_idt_handler_array early_idt_handler_array #endif /* diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h index 460b84f64556..8378b8c9109c 100644 --- a/arch/x86/include/asm/serial.h +++ b/arch/x86/include/asm/serial.h @@ -12,11 +12,11 @@ /* Standard COM flags (except for COM4, because of the 8514 problem) */ #ifdef CONFIG_SERIAL_DETECT_IRQ -# define STD_COMX_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ) -# define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | 0 | ASYNC_AUTO_IRQ) +# define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | UPF_AUTO_IRQ) +# define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | UPF_AUTO_IRQ) #else -# define STD_COMX_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | 0 ) -# define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | 0 | 0 ) +# define STD_COMX_FLAGS (UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | 0 ) +# define STD_COM4_FLAGS (UPF_BOOT_AUTOCONF | 0 | 0 ) #endif #define SERIAL_PORT_DFNS \ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index cf87de3fc390..64b611782ef0 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -169,7 +169,7 @@ static inline int arch_spin_is_contended(arch_spinlock_t *lock) struct __raw_tickets tmp = READ_ONCE(lock->tickets); tmp.head &= ~TICKET_SLOWPATH_FLAG; - return (tmp.tail - tmp.head) > TICKET_LOCK_INC; + return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; } #define arch_spin_is_contended arch_spin_is_contended diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 358dcd338915..c44a5d53e464 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -269,4 +269,9 @@ static inline bool xen_arch_need_swiotlb(struct device *dev, return false; } +static inline unsigned long xen_get_swiotlb_free_pages(unsigned int order) +{ + return __get_free_pages(__GFP_NOWARN, order); +} + #endif /* _ASM_X86_XEN_PAGE_H */ diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h index d993e33f5236..960a8a9dc4ab 100644 --- a/arch/x86/include/uapi/asm/e820.h +++ b/arch/x86/include/uapi/asm/e820.h @@ -33,6 +33,16 @@ #define E820_NVS 4 #define E820_UNUSABLE 5 +/* + * This is a non-standardized way to represent ADR or NVDIMM regions that + * persist over a reboot. The kernel will ignore their special capabilities + * unless the CONFIG_X86_PMEM_LEGACY=y option is set. + * + * ( Note that older platforms also used 6 for the same type of memory, + * but newer versions switched to 12 as 6 was assigned differently. Some + * time they will learn... ) + */ +#define E820_PRAM 12 /* * reserved RAM used by kernel itself diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 90c458e66e13..ce6068dbcfbc 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -225,6 +225,8 @@ #define HV_STATUS_INVALID_HYPERCALL_CODE 2 #define HV_STATUS_INVALID_HYPERCALL_INPUT 3 #define HV_STATUS_INVALID_ALIGNMENT 4 +#define HV_STATUS_INSUFFICIENT_MEMORY 11 +#define HV_STATUS_INVALID_CONNECTION_ID 18 #define HV_STATUS_INSUFFICIENT_BUFFERS 19 typedef struct _HV_REFERENCE_TSC_PAGE { diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 1a4eae695ca8..3c6bb342a48f 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -61,6 +61,9 @@ #define MSR_OFFCORE_RSP_1 0x000001a7 #define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad #define MSR_IVT_TURBO_RATIO_LIMIT 0x000001ae +#define MSR_TURBO_RATIO_LIMIT 0x000001ad +#define MSR_TURBO_RATIO_LIMIT1 0x000001ae +#define MSR_TURBO_RATIO_LIMIT2 0x000001af #define MSR_LBR_SELECT 0x000001c8 #define MSR_LBR_TOS 0x000001c9 @@ -137,6 +140,7 @@ #define MSR_CORE_C3_RESIDENCY 0x000003fc #define MSR_CORE_C6_RESIDENCY 0x000003fd #define MSR_CORE_C7_RESIDENCY 0x000003fe +#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff #define MSR_PKG_C2_RESIDENCY 0x0000060d #define MSR_PKG_C8_RESIDENCY 0x00000630 #define MSR_PKG_C9_RESIDENCY 0x00000631 @@ -165,6 +169,11 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 +#define MSR_PKG_WEIGHTED_CORE_C0_RES 0x00000658 +#define MSR_PKG_ANY_CORE_C0_RES 0x00000659 +#define MSR_PKG_ANY_GFXE_C0_RES 0x0000065A +#define MSR_PKG_BOTH_CORE_GFXE_C0_RES 0x0000065B + #define MSR_CORE_C1_RES 0x00000660 #define MSR_CC6_DEMOTION_POLICY_CONFIG 0x00000668 diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index c887cd944f0c..9bcd0b56ca17 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -95,6 +95,7 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o +obj-$(CONFIG_X86_PMEM_LEGACY) += pmem.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 803b684676ff..dbe76a14c3c9 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -757,7 +757,7 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu) } /* wrapper to silence section mismatch warning */ -int __ref acpi_map_cpu(acpi_handle handle, int physid, int *pcpu) +int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) { return _acpi_map_lsapic(handle, physid, pcpu); } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index d9d0bd2faaf4..ab3219b3fbda 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -171,8 +171,8 @@ update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) for_each_online_cpu(cpu) { if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) continue; - __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu)); - __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu)); + cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); + cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); } free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); free_cpumask_var(per_cpu(ipi_mask, this_cpu)); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index fd470ebf924e..e4cf63301ff4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -720,6 +720,9 @@ static void init_amd(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); + + /* AMD CPUs don't reset SS attributes on SYSRET */ + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 36ce402a3fa5..d820d8eae96b 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -27,8 +27,8 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = { -#ifdef CONFIG_XEN_PVHVM - &x86_hyper_xen_hvm, +#ifdef CONFIG_XEN + &x86_hyper_xen, #endif &x86_hyper_vmware, &x86_hyper_ms_hyperv, diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index e535533d5ab8..20190bdac9d5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -708,6 +708,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, struct pt_regs *regs) { int i, ret = 0; + char *tmp; for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); @@ -716,9 +717,11 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, if (quirk_no_way_out) quirk_no_way_out(i, m, regs); } - if (mce_severity(m, mca_cfg.tolerant, msg, true) >= - MCE_PANIC_SEVERITY) + + if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + *msg = tmp; ret = 1; + } } return ret; } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 87848ebe2bb7..4f7001f28936 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -190,6 +190,7 @@ static bool check_hw_exists(void) u64 val, val_fail, val_new= ~0; int i, reg, reg_fail, ret = 0; int bios_fail = 0; + int reg_safe = -1; /* * Check to see if the BIOS enabled any of the counters, if so @@ -204,6 +205,8 @@ static bool check_hw_exists(void) bios_fail = 1; val_fail = val; reg_fail = reg; + } else { + reg_safe = i; } } @@ -222,11 +225,22 @@ static bool check_hw_exists(void) } /* + * If all the counters are enabled, the below test will always + * fail. The tools will also become useless in this scenario. + * Just fail and disable the hardware counters. + */ + + if (reg_safe == -1) { + reg = reg_safe; + goto msr_fail; + } + + /* * Read the current value, change it and read it back to see if it * matches, this is needed to detect certain hardware emulators * (qemu/kvm) that don't trap on the MSR access and always return 0s. */ - reg = x86_pmu_event_addr(0); + reg = x86_pmu_event_addr(reg_safe); if (rdmsrl_safe(reg, &val)) goto msr_fail; val ^= 0xffffUL; @@ -611,6 +625,7 @@ struct sched_state { int event; /* event index */ int counter; /* counter index */ int unassigned; /* number of events to be assigned left */ + int nr_gp; /* number of GP counters used */ unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; @@ -620,27 +635,29 @@ struct sched_state { struct perf_sched { int max_weight; int max_events; - struct perf_event **events; - struct sched_state state; + int max_gp; int saved_states; + struct event_constraint **constraints; + struct sched_state state; struct sched_state saved[SCHED_STATES_MAX]; }; /* * Initialize interator that runs through all events and counters. */ -static void perf_sched_init(struct perf_sched *sched, struct perf_event **events, - int num, int wmin, int wmax) +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, + int num, int wmin, int wmax, int gpmax) { int idx; memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; - sched->events = events; + sched->max_gp = gpmax; + sched->constraints = constraints; for (idx = 0; idx < num; idx++) { - if (events[idx]->hw.constraint->weight == wmin) + if (constraints[idx]->weight == wmin) break; } @@ -687,7 +704,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) if (sched->state.event >= sched->max_events) return false; - c = sched->events[sched->state.event]->hw.constraint; + c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { idx = INTEL_PMC_IDX_FIXED; @@ -696,11 +713,16 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) goto done; } } + /* Grab the first unused counter starting with idx */ idx = sched->state.counter; for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { - if (!__test_and_set_bit(idx, sched->state.used)) + if (!__test_and_set_bit(idx, sched->state.used)) { + if (sched->state.nr_gp++ >= sched->max_gp) + return false; + goto done; + } } return false; @@ -745,7 +767,7 @@ static bool perf_sched_next_event(struct perf_sched *sched) if (sched->state.weight > sched->max_weight) return false; } - c = sched->events[sched->state.event]->hw.constraint; + c = sched->constraints[sched->state.event]; } while (c->weight != sched->state.weight); sched->state.counter = 0; /* start with first counter */ @@ -756,12 +778,12 @@ static bool perf_sched_next_event(struct perf_sched *sched) /* * Assign a counter for each event. */ -int perf_assign_events(struct perf_event **events, int n, - int wmin, int wmax, int *assign) +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int gpmax, int *assign) { struct perf_sched sched; - perf_sched_init(&sched, events, n, wmin, wmax); + perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); do { if (!perf_sched_find_counter(&sched)) @@ -788,9 +810,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) x86_pmu.start_scheduling(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { - hwc = &cpuc->event_list[i]->hw; + cpuc->event_constraint[i] = NULL; c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); - hwc->constraint = c; + cpuc->event_constraint[i] = c; wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); @@ -801,7 +823,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) */ for (i = 0; i < n; i++) { hwc = &cpuc->event_list[i]->hw; - c = hwc->constraint; + c = cpuc->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) @@ -821,9 +843,26 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) } /* slow path */ - if (i != n) - unsched = perf_assign_events(cpuc->event_list, n, wmin, - wmax, assign); + if (i != n) { + int gpmax = x86_pmu.num_counters; + + /* + * Do not allow scheduling of more than half the available + * generic counters. + * + * This helps avoid counter starvation of sibling thread by + * ensuring at most half the counters cannot be in exclusive + * mode. There is no designated counters for the limits. Any + * N/2 counters can be used. This helps with events with + * specific counter constraints. + */ + if (is_ht_workaround_enabled() && !cpuc->is_fake && + READ_ONCE(cpuc->excl_cntrs->exclusive_present)) + gpmax /= 2; + + unsched = perf_assign_events(cpuc->event_constraint, n, wmin, + wmax, gpmax, assign); + } /* * In case of success (unsched = 0), mark events as committed, @@ -840,7 +879,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) e = cpuc->event_list[i]; e->hw.flags |= PERF_X86_EVENT_COMMITTED; if (x86_pmu.commit_scheduling) - x86_pmu.commit_scheduling(cpuc, e, assign[i]); + x86_pmu.commit_scheduling(cpuc, i, assign[i]); } } @@ -1292,8 +1331,10 @@ static void x86_pmu_del(struct perf_event *event, int flags) x86_pmu.put_event_constraints(cpuc, event); /* Delete the array entry. */ - while (++i < cpuc->n_events) + while (++i < cpuc->n_events) { cpuc->event_list[i-1] = cpuc->event_list[i]; + cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; + } --cpuc->n_events; perf_event_update_userpage(event); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 329f0356ad4a..ef78516850fb 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -65,15 +65,16 @@ struct event_constraint { /* * struct hw_perf_event.flags flags */ -#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */ -#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */ -#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style datala, store */ -#define PERF_X86_EVENT_COMMITTED 0x8 /* event passed commit_txn */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x10 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x20 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x40 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x80 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x40 /* grant rdpmc permission */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ +#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */ +#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ +#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ struct amd_nb { @@ -134,8 +135,6 @@ enum intel_excl_state_type { struct intel_excl_states { enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; enum intel_excl_state_type state[X86_PMC_IDX_MAX]; - int num_alloc_cntrs;/* #counters allocated */ - int max_alloc_cntrs;/* max #counters allowed */ bool sched_started; /* true if scheduling has started */ }; @@ -144,6 +143,11 @@ struct intel_excl_cntrs { struct intel_excl_states states[2]; + union { + u16 has_exclusive[2]; + u32 exclusive_present; + }; + int refcnt; /* per-core: #HT threads */ unsigned core_id; /* per-core: core id */ }; @@ -172,7 +176,11 @@ struct cpu_hw_events { added in the current transaction */ int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; + + int n_excl; /* the number of exclusive events */ unsigned int group_flag; int is_fake; @@ -519,9 +527,7 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); - void (*commit_scheduling)(struct cpu_hw_events *cpuc, - struct perf_event *event, - int cntr); + void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); void (*start_scheduling)(struct cpu_hw_events *cpuc); @@ -717,8 +723,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, void x86_pmu_enable_all(int added); -int perf_assign_events(struct perf_event **events, int n, - int wmin, int wmax, int *assign); +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int gpmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); void x86_pmu_stop(struct perf_event *event, int flags); @@ -929,4 +935,8 @@ static inline struct intel_shared_regs *allocate_shared_regs(int cpu) return NULL; } +static inline int is_ht_workaround_enabled(void) +{ + return 0; +} #endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9da2400c2ec3..a1e35c9f06b9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1134,7 +1134,7 @@ static __initconst const u64 slm_hw_cache_extra_regs [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, - [ C(RESULT_MISS) ] = SLM_DMND_READ|SLM_LLC_MISS, + [ C(RESULT_MISS) ] = 0, }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, @@ -1184,8 +1184,7 @@ static __initconst const u64 slm_hw_cache_event_ids [ C(OP_READ) ] = { /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ [ C(RESULT_ACCESS) ] = 0x01b7, - /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ - [ C(RESULT_MISS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0, }, [ C(OP_WRITE) ] = { /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ @@ -1217,7 +1216,7 @@ static __initconst const u64 slm_hw_cache_event_ids [ C(ITLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + [ C(RESULT_MISS) ] = 0x40205, /* PAGE_WALKS.I_SIDE_WALKS */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = -1, @@ -1924,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) xl = &excl_cntrs->states[tid]; xl->sched_started = true; - xl->num_alloc_cntrs = 0; /* * lock shared state until we are done scheduling * in stop_event_scheduling() @@ -2001,6 +1999,11 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * across HT threads */ is_excl = c->flags & PERF_X86_EVENT_EXCL; + if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { + event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; + if (!cpuc->n_excl++) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); + } /* * xl = state of current HT @@ -2009,18 +2012,6 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, xl = &excl_cntrs->states[tid]; xlo = &excl_cntrs->states[o_tid]; - /* - * do not allow scheduling of more than max_alloc_cntrs - * which is set to half the available generic counters. - * this helps avoid counter starvation of sibling thread - * by ensuring at most half the counters cannot be in - * exclusive mode. There is not designated counters for the - * limits. Any N/2 counters can be used. This helps with - * events with specifix counter constraints - */ - if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs) - return &emptyconstraint; - cx = c; /* @@ -2107,7 +2098,7 @@ static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { - struct event_constraint *c1 = event->hw.constraint; + struct event_constraint *c1 = cpuc->event_constraint[idx]; struct event_constraint *c2; /* @@ -2151,6 +2142,11 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, xl = &excl_cntrs->states[tid]; xlo = &excl_cntrs->states[o_tid]; + if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { + hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; + if (!--cpuc->n_excl) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0); + } /* * put_constraint may be called from x86_schedule_events() @@ -2189,8 +2185,6 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, static void intel_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - struct event_constraint *c = event->hw.constraint; - intel_put_shared_regs_event_constraints(cpuc, event); /* @@ -2198,19 +2192,14 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, * all events are subject to and must call the * put_excl_constraints() routine */ - if (c && cpuc->excl_cntrs) + if (cpuc->excl_cntrs) intel_put_excl_constraints(cpuc, event); - - /* cleanup dynamic constraint */ - if (c && (c->flags & PERF_X86_EVENT_DYNAMIC)) - event->hw.constraint = NULL; } -static void intel_commit_scheduling(struct cpu_hw_events *cpuc, - struct perf_event *event, int cntr) +static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) { struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct event_constraint *c = event->hw.constraint; + struct event_constraint *c = cpuc->event_constraint[idx]; struct intel_excl_states *xlo, *xl; int tid = cpuc->excl_thread_id; int o_tid = 1 - tid; @@ -2533,34 +2522,6 @@ ssize_t intel_event_sysfs_show(char *page, u64 config) return x86_event_sysfs_show(page, config, event); } -static __initconst const struct x86_pmu core_pmu = { - .name = "core", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = core_pmu_enable_all, - .enable = core_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = x86_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .get_event_constraints = intel_get_event_constraints, - .put_event_constraints = intel_put_event_constraints, - .event_constraints = intel_core_event_constraints, - .guest_get_msrs = core_guest_get_msrs, - .format_attrs = intel_arch_formats_attr, - .events_sysfs_show = intel_event_sysfs_show, -}; - struct intel_shared_regs *allocate_shared_regs(int cpu) { struct intel_shared_regs *regs; @@ -2668,8 +2629,6 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { - int h = x86_pmu.num_counters >> 1; - for_each_cpu(i, topology_thread_cpumask(cpu)) { struct intel_excl_cntrs *c; @@ -2683,11 +2642,6 @@ static void intel_pmu_cpu_starting(int cpu) } cpuc->excl_cntrs->core_id = core_id; cpuc->excl_cntrs->refcnt++; - /* - * set hard limit to half the number of generic counters - */ - cpuc->excl_cntrs->states[0].max_alloc_cntrs = h; - cpuc->excl_cntrs->states[1].max_alloc_cntrs = h; } } @@ -2743,6 +2697,44 @@ static struct attribute *intel_arch3_formats_attr[] = { NULL, }; +static __initconst const struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = core_pmu_enable_all, + .enable = core_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32-bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL<<31) - 1, + .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .event_constraints = intel_core_event_constraints, + .guest_get_msrs = core_guest_get_msrs, + .format_attrs = intel_arch_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + + /* + * Virtual (or funny metal) CPU can define x86_pmu.extra_regs + * together with PMU version 1 and thus be using core_pmu with + * shared_regs. We need following callbacks here to allocate + * it properly. + */ + .cpu_prepare = intel_pmu_cpu_prepare, + .cpu_starting = intel_pmu_cpu_starting, + .cpu_dying = intel_pmu_cpu_dying, +}; + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -3275,7 +3267,7 @@ __init int intel_pmu_init(void) hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE| BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM; - intel_pmu_lbr_init_snb(); + intel_pmu_lbr_init_hsw(); x86_pmu.event_constraints = intel_bdw_event_constraints; x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index ca69ea56c712..7f73b3553e2e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -558,6 +558,8 @@ struct event_constraint intel_core2_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), EVENT_CONSTRAINT_END }; @@ -565,6 +567,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01), EVENT_CONSTRAINT_END }; @@ -588,6 +592,8 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), EVENT_CONSTRAINT_END }; @@ -603,6 +609,8 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = { INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + /* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */ + INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f), EVENT_CONSTRAINT_END }; @@ -698,9 +706,9 @@ void intel_pmu_pebs_disable(struct perf_event *event) cpuc->pebs_enabled &= ~(1ULL << hwc->idx); - if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_LDLAT) + if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); - else if (event->hw.constraint->flags & PERF_X86_EVENT_PEBS_ST) + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); if (cpuc->enabled) diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index f2770641c0fd..123ff1bb2f60 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -151,7 +151,7 @@ static int __init pt_pmu_hw_init(void) de_attr->attr.attr.name = pt_caps[i].name; - sysfs_attr_init(&de_attrs->attr.attr); + sysfs_attr_init(&de_attr->attr.attr); de_attr->attr.attr.mode = S_IRUGO; de_attr->attr.show = pt_cap_show; @@ -615,7 +615,8 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, struct perf_output_handle *handle) { - unsigned long idx, npages, end; + unsigned long head = local64_read(&buf->head); + unsigned long idx, npages, wakeup; if (buf->snapshot) return 0; @@ -634,17 +635,26 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, buf->topa_index[buf->stop_pos]->stop = 0; buf->topa_index[buf->intr_pos]->intr = 0; - if (pt_cap_get(PT_CAP_topa_multiple_entries)) { - npages = (handle->size + 1) >> PAGE_SHIFT; - end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages; - /*if (end > handle->wakeup >> PAGE_SHIFT) - end = handle->wakeup >> PAGE_SHIFT;*/ - idx = end & (buf->nr_pages - 1); - buf->stop_pos = idx; - idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1; - idx &= buf->nr_pages - 1; - buf->intr_pos = idx; - } + /* how many pages till the STOP marker */ + npages = handle->size >> PAGE_SHIFT; + + /* if it's on a page boundary, fill up one more page */ + if (!offset_in_page(head + handle->size + 1)) + npages++; + + idx = (head >> PAGE_SHIFT) + npages; + idx &= buf->nr_pages - 1; + buf->stop_pos = idx; + + wakeup = handle->wakeup >> PAGE_SHIFT; + + /* in the worst case, wake up the consumer one page before hard stop */ + idx = (head >> PAGE_SHIFT) + npages - 1; + if (idx > wakeup) + idx = wakeup; + + idx &= buf->nr_pages - 1; + buf->intr_pos = idx; buf->topa_index[buf->stop_pos]->stop = 1; buf->topa_index[buf->intr_pos]->intr = 1; @@ -988,39 +998,36 @@ static int pt_event_add(struct perf_event *event, int mode) int ret = -EBUSY; if (pt->handle.event) - goto out; + goto fail; buf = perf_aux_output_begin(&pt->handle, event); - if (!buf) { - ret = -EINVAL; - goto out; - } + ret = -EINVAL; + if (!buf) + goto fail_stop; pt_buffer_reset_offsets(buf, pt->handle.head); if (!buf->snapshot) { ret = pt_buffer_reset_markers(buf, &pt->handle); - if (ret) { - perf_aux_output_end(&pt->handle, 0, true); - goto out; - } + if (ret) + goto fail_end_stop; } if (mode & PERF_EF_START) { pt_event_start(event, 0); - if (hwc->state == PERF_HES_STOPPED) { - pt_event_del(event, 0); - ret = -EBUSY; - } + ret = -EBUSY; + if (hwc->state == PERF_HES_STOPPED) + goto fail_end_stop; } else { hwc->state = PERF_HES_STOPPED; } - ret = 0; -out: - - if (ret) - hwc->state = PERF_HES_STOPPED; + return 0; +fail_end_stop: + perf_aux_output_end(&pt->handle, 0, true); +fail_stop: + hwc->state = PERF_HES_STOPPED; +fail: return ret; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index c4bb8b8e5017..358c54ad20d4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -62,6 +62,14 @@ #define RAPL_IDX_PP1_NRG_STAT 3 /* gpu */ #define INTEL_RAPL_PP1 0x4 /* pseudo-encoding */ +#define NR_RAPL_DOMAINS 0x4 +static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { + "pp0-core", + "package", + "dram", + "pp1-gpu", +}; + /* Clients have PP0, PKG */ #define RAPL_IDX_CLN (1<<RAPL_IDX_PP0_NRG_STAT|\ 1<<RAPL_IDX_PKG_NRG_STAT|\ @@ -112,7 +120,6 @@ static struct perf_pmu_events_attr event_attr_##v = { \ struct rapl_pmu { spinlock_t lock; - int hw_unit; /* 1/2^hw_unit Joule */ int n_active; /* number of active events */ struct list_head active_list; struct pmu *pmu; /* pointer to rapl_pmu_class */ @@ -120,6 +127,7 @@ struct rapl_pmu { struct hrtimer hrtimer; }; +static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; /* 1/2^hw_unit Joule */ static struct pmu rapl_pmu_class; static cpumask_t rapl_cpu_mask; static int rapl_cntr_mask; @@ -127,6 +135,7 @@ static int rapl_cntr_mask; static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu); static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free); +static struct x86_pmu_quirk *rapl_quirks; static inline u64 rapl_read_counter(struct perf_event *event) { u64 raw; @@ -134,15 +143,28 @@ static inline u64 rapl_read_counter(struct perf_event *event) return raw; } -static inline u64 rapl_scale(u64 v) +#define rapl_add_quirk(func_) \ +do { \ + static struct x86_pmu_quirk __quirk __initdata = { \ + .func = func_, \ + }; \ + __quirk.next = rapl_quirks; \ + rapl_quirks = &__quirk; \ +} while (0) + +static inline u64 rapl_scale(u64 v, int cfg) { + if (cfg > NR_RAPL_DOMAINS) { + pr_warn("invalid domain %d, failed to scale data\n", cfg); + return v; + } /* * scale delta to smallest unit (1/2^32) * users must then scale back: count * 1/(1e9*2^32) to get Joules * or use ldexp(count, -32). * Watts = Joules/Time delta */ - return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit); + return v << (32 - rapl_hw_unit[cfg - 1]); } static u64 rapl_event_update(struct perf_event *event) @@ -173,7 +195,7 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - sdelta = rapl_scale(delta); + sdelta = rapl_scale(delta, event->hw.config); local64_add(sdelta, &event->count); @@ -546,12 +568,22 @@ static void rapl_cpu_init(int cpu) cpumask_set_cpu(cpu, &rapl_cpu_mask); } +static __init void rapl_hsw_server_quirk(void) +{ + /* + * DRAM domain on HSW server has fixed energy unit which can be + * different than the unit from power unit MSR. + * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2 + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16; +} + static int rapl_cpu_prepare(int cpu) { struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu); int phys_id = topology_physical_package_id(cpu); u64 ms; - u64 msr_rapl_power_unit_bits; if (pmu) return 0; @@ -559,24 +591,13 @@ static int rapl_cpu_prepare(int cpu) if (phys_id < 0) return -1; - /* protect rdmsrl() to handle virtualization */ - if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) - return -1; - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); if (!pmu) return -1; - spin_lock_init(&pmu->lock); INIT_LIST_HEAD(&pmu->active_list); - /* - * grab power unit as: 1/2^unit Joules - * - * we cache in local PMU instance - */ - pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; pmu->pmu = &rapl_pmu_class; /* @@ -586,8 +607,8 @@ static int rapl_cpu_prepare(int cpu) * divide interval by 2 to avoid lockstep (2 * 100) * if hw unit is 32, then we use 2 ms 1/200/2 */ - if (pmu->hw_unit < 32) - ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1)); + if (rapl_hw_unit[0] < 32) + ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1)); else ms = 2; @@ -655,6 +676,20 @@ static int rapl_cpu_notifier(struct notifier_block *self, return NOTIFY_OK; } +static int rapl_check_hw_unit(void) +{ + u64 msr_rapl_power_unit_bits; + int i; + + /* protect rdmsrl() to handle virtualization */ + if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits)) + return -1; + for (i = 0; i < NR_RAPL_DOMAINS; i++) + rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + + return 0; +} + static const struct x86_cpu_id rapl_cpu_match[] = { [0] = { .vendor = X86_VENDOR_INTEL, .family = 6 }, [1] = {}, @@ -664,6 +699,8 @@ static int __init rapl_pmu_init(void) { struct rapl_pmu *pmu; int cpu, ret; + struct x86_pmu_quirk *quirk; + int i; /* * check for Intel processor family 6 @@ -678,8 +715,14 @@ static int __init rapl_pmu_init(void) rapl_cntr_mask = RAPL_IDX_CLN; rapl_pmu_events_group.attrs = rapl_events_cln_attr; break; + case 63: /* Haswell-Server */ + rapl_add_quirk(rapl_hsw_server_quirk); + rapl_cntr_mask = RAPL_IDX_SRV; + rapl_pmu_events_group.attrs = rapl_events_srv_attr; + break; case 60: /* Haswell */ case 69: /* Haswell-Celeron */ + case 61: /* Broadwell */ rapl_cntr_mask = RAPL_IDX_HSW; rapl_pmu_events_group.attrs = rapl_events_hsw_attr; break; @@ -693,7 +736,13 @@ static int __init rapl_pmu_init(void) /* unsupported */ return 0; } + ret = rapl_check_hw_unit(); + if (ret) + return ret; + /* run cpu model quirks */ + for (quirk = rapl_quirks; quirk; quirk = quirk->next) + quirk->func(); cpu_notifier_register_begin(); for_each_online_cpu(cpu) { @@ -714,14 +763,18 @@ static int __init rapl_pmu_init(void) pmu = __this_cpu_read(rapl_pmu); - pr_info("RAPL PMU detected, hw unit 2^-%d Joules," + pr_info("RAPL PMU detected," " API unit is 2^-32 Joules," " %d fixed counters" " %llu ms ovfl timer\n", - pmu->hw_unit, hweight32(rapl_cntr_mask), ktime_to_ms(pmu->timer_interval)); - + for (i = 0; i < NR_RAPL_DOMAINS; i++) { + if (rapl_cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", + rapl_domain_names[i], rapl_hw_unit[i]); + } + } out: cpu_notifier_register_done(); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index c635b8b49e93..dd319e59246b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -365,9 +365,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { - hwc = &box->event_list[i]->hw; c = uncore_get_event_constraint(box, box->event_list[i]); - hwc->constraint = c; + box->event_constraint[i] = c; wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); } @@ -375,7 +374,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int /* fastpath, try to reuse previous register */ for (i = 0; i < n; i++) { hwc = &box->event_list[i]->hw; - c = hwc->constraint; + c = box->event_constraint[i]; /* never assigned */ if (hwc->idx == -1) @@ -395,8 +394,8 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int } /* slow path */ if (i != n) - ret = perf_assign_events(box->event_list, n, - wmin, wmax, assign); + ret = perf_assign_events(box->event_constraint, n, + wmin, wmax, n, assign); if (!assign || ret) { for (i = 0; i < n; i++) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 6c8c1e7e69d8..f789ec9a0133 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -97,6 +97,7 @@ struct intel_uncore_box { atomic_t refcnt; struct perf_event *events[UNCORE_PMC_IDX_MAX]; struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + struct event_constraint *event_constraint[UNCORE_PMC_IDX_MAX]; unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; u64 tags[UNCORE_PMC_IDX_MAX]; struct pci_dev *pci_dev; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index 3001015b755c..4562e9e22c60 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -1,6 +1,13 @@ /* Nehalem/SandBridge/Haswell uncore support */ #include "perf_event_intel_uncore.h" +/* Uncore IMC PCI IDs */ +#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100 +#define PCI_DEVICE_ID_INTEL_IVB_IMC 0x0154 +#define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 +#define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 +#define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 + /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff #define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 @@ -472,6 +479,10 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_IMC), .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_HSW_U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, { /* end: all zeroes */ }, }; @@ -502,6 +513,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(IVB_IMC, &ivb_uncore_pci_driver), /* 3rd Gen Core processor */ IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ + IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ { /* end marker */ } }; diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7d46bb260334..e2ce85db2283 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type) case E820_UNUSABLE: printk(KERN_CONT "unusable"); break; + case E820_PRAM: + printk(KERN_CONT "persistent (type %u)", type); + break; default: printk(KERN_CONT "type %u", type); break; @@ -343,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, * continue building up new bios map based on this * information */ - if (current_type != last_type) { + if (current_type != last_type || current_type == E820_PRAM) { if (last_type != 0) { new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; @@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) register_nosave_region(pfn, PFN_UP(ei->addr)); pfn = PFN_DOWN(ei->addr + ei->size); + if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN) register_nosave_region(PFN_UP(ei->addr), pfn); @@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn) { int i; unsigned long last_pfn = 0; @@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) unsigned long start_pfn; unsigned long end_pfn; - if (ei->type != type) + /* + * Persistent memory is accounted as ram for purposes of + * establishing max_pfn and mem_map. + */ + if (ei->type != E820_RAM && ei->type != E820_PRAM) continue; start_pfn = ei->addr >> PAGE_SHIFT; @@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) } unsigned long __init e820_end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); + return e820_end_pfn(MAX_ARCH_PFN); } unsigned long __init e820_end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); + return e820_end_pfn(1UL << (32-PAGE_SHIFT)); } static void early_panic(char *msg) @@ -866,6 +874,9 @@ static int __init parse_memmap_one(char *p) } else if (*p == '$') { start_at = memparse(p+1, &p); e820_add_region(start_at, mem_size, E820_RESERVED); + } else if (*p == '!') { + start_at = memparse(p+1, &p); + e820_add_region(start_at, mem_size, E820_PRAM); } else e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); @@ -907,6 +918,7 @@ static inline const char *e820_type_to_string(int e820_type) case E820_ACPI: return "ACPI Tables"; case E820_NVS: return "ACPI Non-volatile Storage"; case E820_UNUSABLE: return "Unusable memory"; + case E820_PRAM: return "Persistent RAM"; default: return "reserved"; } } @@ -940,7 +952,9 @@ void __init e820_reserve_resources(void) * pci device BAR resource and insert them later in * pcibios_resource_survey() */ - if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) { + if (((e820.map[i].type != E820_RESERVED) && + (e820.map[i].type != E820_PRAM)) || + res->start < (1ULL<<20)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 49ff55ef9b26..89427d8d4fc5 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -375,12 +375,6 @@ static int __init setup_early_printk(char *buf) if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif -#ifdef CONFIG_EARLY_PRINTK_INTEL_MID - if (!strncmp(buf, "hsu", 3)) { - hsu_early_console_init(buf + 3); - early_console_register(&early_hsu_console, keep); - } -#endif #ifdef CONFIG_EARLY_PRINTK_EFI if (!strncmp(buf, "efi", 3)) early_console_register(&early_efi_console, keep); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c7b238494b31..02c2eff7478d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -295,6 +295,15 @@ system_call_fastpath: * rflags from r11 (but RF and VM bits are forced to 0), * cs and ss are loaded from MSRs. * Restoration of rflags re-enables interrupts. + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we should + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. (Actually + * detecting the failure in 64-bit userspace is tricky but can be + * done.) */ USERGS_SYSRET64 diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2b55ee6db053..5a4668136e98 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -167,7 +167,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handlers[i]); + set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d031bad9e07e..53eeb226657c 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -478,21 +478,22 @@ is486: __INIT setup_once: /* - * Set up a idt with 256 entries pointing to ignore_int, - * interrupt gates. It doesn't actually load idt - that needs - * to be done on each CPU. Interrupts are enabled elsewhere, - * when we can be relatively sure everything is ok. + * Set up a idt with 256 interrupt gates that push zero if there + * is no error code and then jump to early_idt_handler_common. + * It doesn't actually load the idt - that needs to be done on + * each CPU. Interrupts are enabled elsewhere, when we can be + * relatively sure everything is ok. */ movl $idt_table,%edi - movl $early_idt_handlers,%eax + movl $early_idt_handler_array,%eax movl $NUM_EXCEPTION_VECTORS,%ecx 1: movl %eax,(%edi) movl %eax,4(%edi) /* interrupt gate, dpl=0, present */ movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $9,%eax + addl $EARLY_IDT_HANDLER_SIZE,%eax addl $8,%edi loop 1b @@ -524,26 +525,28 @@ setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ ret -ENTRY(early_idt_handlers) +ENTRY(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs # 28(%esp) %eip # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handlers) +ENDPROC(early_idt_handler_array) - /* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%esp) # X86_TRAP_NMI @@ -603,7 +606,7 @@ ex_entry: is_nmi: addl $8,%esp /* drop vector number and error code */ iret -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ ALIGN diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ae6588b301c2..df7e78057ae0 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -321,26 +321,28 @@ bad_address: jmp bad_address __INIT - .globl early_idt_handlers -early_idt_handlers: +ENTRY(early_idt_handler_array) # 104(%rsp) %rflags # 96(%rsp) %cs # 88(%rsp) %rip # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushq $0 # Dummy error code, to make stack frame uniform .endif pushq $i # 72(%rsp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr +ENDPROC(early_idt_handler_array) -/* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%rsp) # X86_TRAP_NMI @@ -412,7 +414,7 @@ ENTRY(early_idt_handler) is_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) __INITDATA diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 367f39d35e9c..6185d3141219 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -173,6 +173,21 @@ static void init_thread_xstate(void) xstate_size = sizeof(struct i387_fxsave_struct); else xstate_size = sizeof(struct i387_fsave_struct); + + /* + * Quirk: we don't yet handle the XSAVES* instructions + * correctly, as we don't correctly convert between + * standard and compacted format when interfacing + * with user-space - so disable it for now. + * + * The difference is small: with recent CPUs the + * compacted format is only marginally smaller than + * the standard FPU state format. + * + * ( This is easy to backport while we are fixing + * XSAVES* support. ) + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVES); } /* @@ -341,7 +356,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - struct xsave_struct *xsave = &target->thread.fpu.state->xsave; + struct xsave_struct *xsave; int ret; if (!cpu_has_xsave) @@ -351,6 +366,8 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + xsave = &target->thread.fpu.state->xsave; + /* * Copy the 48bytes defined by the software first into the xstate * memory layout in the thread struct, so that we can copy the entire @@ -369,7 +386,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct xsave_struct *xsave = &target->thread.fpu.state->xsave; + struct xsave_struct *xsave; int ret; if (!cpu_has_xsave) @@ -379,6 +396,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + xsave = &target->thread.fpu.state->xsave; + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); /* * mxcsr reserved bits must be masked to zero for security reasons. diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c new file mode 100644 index 000000000000..3420c874ddc5 --- /dev/null +++ b/arch/x86/kernel/pmem.c @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2015, Christoph Hellwig. + */ +#include <linux/memblock.h> +#include <linux/platform_device.h> +#include <linux/slab.h> +#include <asm/e820.h> +#include <asm/page_types.h> +#include <asm/setup.h> + +static __init void register_pmem_device(struct resource *res) +{ + struct platform_device *pdev; + int error; + + pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO); + if (!pdev) + return; + + error = platform_device_add_resources(pdev, res, 1); + if (error) + goto out_put_pdev; + + error = platform_device_add(pdev); + if (error) + goto out_put_pdev; + return; + +out_put_pdev: + dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n"); + platform_device_put(pdev); +} + +static __init int register_pmem_devices(void) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->type == E820_PRAM) { + struct resource res = { + .flags = IORESOURCE_MEM, + .start = ei->addr, + .end = ei->addr + ei->size - 1, + }; + register_pmem_device(&res); + } + } + + return 0; +} +device_initcall(register_pmem_devices); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 8213da62b1b7..6e338e3b1dc0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -57,7 +57,7 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif }; -EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); +EXPORT_PER_CPU_SYMBOL(cpu_tss); #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); @@ -156,11 +156,13 @@ void flush_thread(void) /* FPU state will be reallocated lazily at the first use. */ drop_fpu(tsk); free_thread_xstate(tsk); - } else if (!used_math()) { - /* kthread execs. TODO: cleanup this horror. */ - if (WARN_ON(init_fpu(tsk))) - force_sig(SIGKILL, tsk); - user_fpu_begin(); + } else { + if (!tsk_used_math(tsk)) { + /* kthread execs. TODO: cleanup this horror. */ + if (WARN_ON(init_fpu(tsk))) + force_sig(SIGKILL, tsk); + user_fpu_begin(); + } restore_init_xstate(); } } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4baaa972f52a..ddfdbf74f174 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -419,6 +419,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) __switch_to_xtra(prev_p, next_p, tss); + if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) { + /* + * AMD CPUs have a misfeature: SYSRET sets the SS selector but + * does not update the cached descriptor. As a result, if we + * do SYSRET while SS is NULL, we'll end up in user mode with + * SS apparently equal to __USER_DS but actually unusable. + * + * The straightforward workaround would be to fix it up just + * before SYSRET, but that would slow down the system call + * fast paths. Instead, we ensure that SS is never NULL in + * system call context. We do this by replacing NULL SS + * selectors at every context switch. SYSCALL sets up a valid + * SS, so the only way to get NULL is to re-enter the kernel + * from CPL 3 through an interrupt. Since that can't happen + * in the same task as a running syscall, we are guaranteed to + * context switch between every interrupt vector entry and a + * subsequent SYSRET. + * + * We read SS first because SS reads are much faster than + * writes. Out of caution, we force SS to __KERNEL_DS even if + * it previously had a different non-NULL value. + */ + unsigned short ss_sel; + savesegment(ss, ss_sel); + if (ss_sel != __KERNEL_DS) + loadsegment(ss, __KERNEL_DS); + } + return prev_p; } diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index e5ecd20e72dd..2f355d229a58 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } -static struct pvclock_vsyscall_time_info *pvclock_vdso_info; - -static struct pvclock_vsyscall_time_info * -pvclock_get_vsyscall_user_time_info(int cpu) -{ - if (!pvclock_vdso_info) { - BUG(); - return NULL; - } - - return &pvclock_vdso_info[cpu]; -} - -struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) -{ - return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; -} - #ifdef CONFIG_X86_64 -static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, - void *v) -{ - struct task_migration_notifier *mn = v; - struct pvclock_vsyscall_time_info *pvti; - - pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); - - /* this is NULL when pvclock vsyscall is not initialized */ - if (unlikely(pvti == NULL)) - return NOTIFY_DONE; - - pvti->migrate_count++; - - return NOTIFY_DONE; -} - -static struct notifier_block pvclock_migrate = { - .notifier_call = pvclock_task_migrate, -}; - /* * Initialize the generic pvclock vsyscall state. This will allocate * a/some page(s) for the per-vcpu pvclock information, set up a @@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); - pvclock_vdso_info = i; - for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, __pa(i) + (idx*PAGE_SIZE), PAGE_KERNEL_VVAR); } - - register_task_migration_notifier(&pvclock_migrate); - return 0; } #endif diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index f9804080ccb3..1ea14fd53933 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -616,7 +616,8 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { - bool failed; + bool stepping, failed; + /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ @@ -640,12 +641,13 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) } /* - * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF - * flag so that register information in the sigcontext is correct. + * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now + * so that register information in the sigcontext is correct and + * then notify the tracer before entering the signal handler. */ - if (unlikely(regs->flags & X86_EFLAGS_TF) && - likely(test_and_clear_thread_flag(TIF_FORCED_TF))) - regs->flags &= ~X86_EFLAGS_TF; + stepping = test_thread_flag(TIF_SINGLESTEP); + if (stepping) + user_disable_single_step(current); failed = (setup_rt_frame(ksig, regs) < 0); if (!failed) { @@ -656,10 +658,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * it might disable possible debug exception from the * signal handler. * - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. + * Clear TF for the case when it wasn't set by debugger to + * avoid the recursive send_sigtrap() in SIGTRAP handler. */ regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF); /* @@ -668,7 +668,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) if (used_math()) fpu_reset_state(current); } - signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); + signal_setup_done(failed, ksig, stepping); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c index 6eb5c20ee373..d090ecf08809 100644 --- a/arch/x86/kvm/assigned-dev.c +++ b/arch/x86/kvm/assigned-dev.c @@ -666,7 +666,7 @@ static int probe_sysfs_permissions(struct pci_dev *dev) if (r) return r; - inode = path.dentry->d_inode; + inode = d_backing_inode(path.dentry); r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); path_put(&path); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 59b69f6a2844..1d08ad3582d0 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,6 +16,8 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> +#include <asm/i387.h> /* For use_eager_fpu. Ugh! */ +#include <asm/fpu-internal.h> /* For use_eager_fpu. Ugh! */ #include <asm/user.h> #include <asm/xsave.h> #include "cpuid.h" @@ -95,6 +97,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + vcpu->arch.eager_fpu = guest_cpuid_has_mpx(vcpu); + /* * The existing code assumes virtual address is 48-bit in the canonical * address checks; exit if it is ever changed. diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index c3b1ad9fca81..496b3695d3d3 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -117,4 +117,12 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->ebx & bit(X86_FEATURE_RTM)); } + +static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_MPX)); +} #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d67206a7b99a..629af0f1c5c4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -683,8 +683,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, unsigned long bitmap = 1; struct kvm_lapic **dst; int i; - bool ret = false; - bool x2apic_ipi = src && apic_x2apic_mode(src); + bool ret, x2apic_ipi; *r = -1; @@ -696,16 +695,18 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, if (irq->shorthand) return false; + x2apic_ipi = src && apic_x2apic_mode(src); if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST)) return false; + ret = true; rcu_read_lock(); map = rcu_dereference(kvm->arch.apic_map); - if (!map) + if (!map) { + ret = false; goto out; - - ret = true; + } if (irq->dest_mode == APIC_DEST_PHYSICAL) { if (irq->dest_id >= ARRAY_SIZE(map->phys_map)) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 146f295ee322..44a7d2515497 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3736,8 +3736,8 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, } } -void update_permission_bitmask(struct kvm_vcpu *vcpu, - struct kvm_mmu *mmu, bool ept) +static void update_permission_bitmask(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, bool ept) { unsigned bit, byte, pfec; u8 map; @@ -3918,6 +3918,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) { bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); struct kvm_mmu *context = &vcpu->arch.mmu; MMU_WARN_ON(VALID_PAGE(context->root_hpa)); @@ -3936,6 +3937,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) context->base_role.cr0_wp = is_write_protection(vcpu); context->base_role.smep_andnot_wp = smep && !is_write_protection(vcpu); + context->base_role.smap_andnot_wp + = smap && !is_write_protection(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); @@ -4207,12 +4210,18 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; - union kvm_mmu_page_role mask = { .word = 0 }; struct kvm_mmu_page *sp; LIST_HEAD(invalid_list); u64 entry, gentry, *spte; int npte; bool remote_flush, local_flush, zap_page; + union kvm_mmu_page_role mask = (union kvm_mmu_page_role) { + .cr0_wp = 1, + .cr4_pae = 1, + .nxe = 1, + .smep_andnot_wp = 1, + .smap_andnot_wp = 1, + }; /* * If we don't have indirect shadow pages, it means no page is @@ -4238,7 +4247,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); - mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) { if (detect_write_misaligned(sp, gpa, bytes) || detect_write_flooding(sp)) { @@ -4481,9 +4489,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, pfn = spte_to_pfn(*sptep); /* - * Only EPT supported for now; otherwise, one would need to - * find out efficiently whether the guest page tables are - * also using huge pages. + * We cannot do huge page mapping for indirect shadow pages, + * which are found on the last rmap (level = 1) when not using + * tdp; such shadow pages are synced with the page table in + * the guest, and the guest page table is using 4K page size + * mapping if the indirect sp has level = 1. */ if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && @@ -4504,19 +4514,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, bool flush = false; unsigned long *rmapp; unsigned long last_index, index; - gfn_t gfn_start, gfn_end; spin_lock(&kvm->mmu_lock); - gfn_start = memslot->base_gfn; - gfn_end = memslot->base_gfn + memslot->npages - 1; - - if (gfn_start >= gfn_end) - goto out; - rmapp = memslot->arch.rmap[0]; - last_index = gfn_to_index(gfn_end, memslot->base_gfn, - PT_PAGE_TABLE_LEVEL); + last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1, + memslot->base_gfn, PT_PAGE_TABLE_LEVEL); for (index = 0; index <= last_index; ++index, ++rmapp) { if (*rmapp) @@ -4534,7 +4537,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, if (flush) kvm_flush_remote_tlbs(kvm); -out: spin_unlock(&kvm->mmu_lock); } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index c7d65637c851..0ada65ecddcf 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -71,8 +71,6 @@ enum { int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); -void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - bool ept); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { @@ -166,6 +164,8 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index = (pfec >> 1) + (smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1)); + WARN_ON(pfec & PFERR_RSVD_MASK); + return (mmu->permissions[index] >> pte_access) & 1; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index fd49c867b25a..6e6d115fe9b5 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -718,6 +718,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, mmu_is_nested(vcpu)); if (likely(r != RET_MMIO_PF_INVALID)) return r; + + /* + * page fault with PFEC.RSVD = 1 is caused by shadow + * page fault, should not be used to walk guest page + * table. + */ + error_code &= ~PFERR_RSVD_MASK; }; r = mmu_topup_memory_caches(vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ce741b8650f6..9afa233b5482 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4381,6 +4381,7 @@ static struct kvm_x86_ops svm_x86_ops = { .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_activate = svm_fpu_activate, .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f5e8dce8046c..2d73807f0d31 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3622,8 +3622,16 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ? - KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + /* + * Pass through host's Machine Check Enable value to hw_cr4, which + * is in force while we are in guest mode. Do not let guests control + * this bit, even if host CR4.MCE == 0. + */ + unsigned long hw_cr4 = + (cr4_read_shadow() & X86_CR4_MCE) | + (cr4 & ~X86_CR4_MCE) | + (to_vmx(vcpu)->rmode.vm86_active ? + KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); if (cr4 & X86_CR4_VMXE) { /* @@ -10177,6 +10185,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_activate = vmx_fpu_activate, .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e1a81267f3f6..ea306adbbc13 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -702,8 +702,9 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | - X86_CR4_PAE | X86_CR4_SMEP; + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE | + X86_CR4_SMEP | X86_CR4_SMAP; + if (cr4 & CR4_RESERVED_BITS) return 1; @@ -744,9 +745,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); - if ((cr4 ^ old_cr4) & X86_CR4_SMAP) - update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false); - if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) kvm_update_cpuid(vcpu); @@ -1669,12 +1667,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) &guest_hv_clock, sizeof(guest_hv_clock)))) return 0; - /* - * The interface expects us to write an even number signaling that the - * update is finished. Since the guest won't see the intermediate - * state, we just increase by 2 at the end. + /* This VCPU is paused, but it's legal for a guest to read another + * VCPU's kvmclock, so we really have to follow the specification where + * it says that version is odd if data is being modified, and even after + * it is consistent. + * + * Version field updates must be kept separate. This is because + * kvm_write_guest_cached might use a "rep movs" instruction, and + * writes within a string instruction are weakly ordered. So there + * are three writes overall. + * + * As a small optimization, only write the version field in the first + * and third write. The vcpu->pv_time cache is still valid, because the + * version field is the first in the struct. */ - vcpu->hv_clock.version = guest_hv_clock.version + 2; + BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); + + vcpu->hv_clock.version = guest_hv_clock.version + 1; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); + + smp_wmb(); /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); @@ -1695,6 +1709,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kvm_write_guest_cached(v->kvm, &vcpu->pv_time, &vcpu->hv_clock, sizeof(vcpu->hv_clock)); + + smp_wmb(); + + vcpu->hv_clock.version++; + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); return 0; } @@ -5799,7 +5820,6 @@ int kvm_arch_init(void *opaque) kvm_set_mmio_spte_mask(); kvm_x86_ops = ops; - kvm_init_msr_list(); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); @@ -6175,6 +6195,8 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) return; page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) + return; kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page)); /* @@ -7038,7 +7060,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) fpu_save_init(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + if (!vcpu->arch.eager_fpu) + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + trace_kvm_fpu(0); } @@ -7054,11 +7078,21 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { + struct kvm_vcpu *vcpu; + if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) printk_once(KERN_WARNING "kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); - return kvm_x86_ops->vcpu_create(kvm, id); + + vcpu = kvm_x86_ops->vcpu_create(kvm, id); + + /* + * Activate fpu unconditionally in case the guest needs eager FPU. It will be + * deactivated soon if it doesn't. + */ + kvm_x86_ops->fpu_activate(vcpu); + return vcpu; } int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) @@ -7253,7 +7287,14 @@ void kvm_arch_hardware_disable(void) int kvm_arch_hardware_setup(void) { - return kvm_x86_ops->hardware_setup(); + int r; + + r = kvm_x86_ops->hardware_setup(); + if (r != 0) + return r; + + kvm_init_msr_list(); + return 0; } void kvm_arch_hardware_unsetup(void) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 717908b16037..8f9a133cc099 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -87,8 +87,7 @@ struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, - .noirq_start = (u32)lguest_noirq_start, - .noirq_end = (u32)lguest_noirq_end, + .noirq_iret = (u32)lguest_noirq_iret, .kernel_address = PAGE_OFFSET, .blocked_interrupts = { 1 }, /* Block timer interrupts */ .syscall_vec = SYSCALL_VECTOR, @@ -262,7 +261,7 @@ PV_CALLEE_SAVE_REGS_THUNK(lguest_save_fl); PV_CALLEE_SAVE_REGS_THUNK(lguest_irq_disable); /*:*/ -/* These are in i386_head.S */ +/* These are in head_32.S */ extern void lg_irq_enable(void); extern void lg_restore_fl(unsigned long flags); @@ -1368,7 +1367,7 @@ static void lguest_restart(char *reason) * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, - * and these are in i386_head.S. + * and these are in head_32.S. */ /*G:060 We construct a table from the assembler templates: */ diff --git a/arch/x86/lguest/head_32.S b/arch/x86/lguest/head_32.S index 6ddfe4fc23c3..d5ae63f5ec5d 100644 --- a/arch/x86/lguest/head_32.S +++ b/arch/x86/lguest/head_32.S @@ -84,7 +84,7 @@ ENTRY(lg_irq_enable) * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we * jump to send_interrupts, otherwise we're done. */ - testl $0, lguest_data+LGUEST_DATA_irq_pending + cmpl $0, lguest_data+LGUEST_DATA_irq_pending jnz send_interrupts /* * One cool thing about x86 is that you can do many things without using @@ -133,9 +133,8 @@ ENTRY(lg_restore_fl) ret /*:*/ -/* These demark the EIP range where host should never deliver interrupts. */ -.global lguest_noirq_start -.global lguest_noirq_end +/* These demark the EIP where host should never deliver interrupts. */ +.global lguest_noirq_iret /*M:004 * When the Host reflects a trap or injects an interrupt into the Guest, it @@ -168,29 +167,26 @@ ENTRY(lg_restore_fl) * So we have to copy eflags from the stack to lguest_data.irq_enabled before * we do the "iret". * - * There are two problems with this: firstly, we need to use a register to do - * the copy and secondly, the whole thing needs to be atomic. The first - * problem is easy to solve: push %eax on the stack so we can use it, and then - * restore it at the end just before the real "iret". + * There are two problems with this: firstly, we can't clobber any registers + * and secondly, the whole thing needs to be atomic. The first problem + * is solved by using "push memory"/"pop memory" instruction pair for copying. * * The second is harder: copying eflags to lguest_data.irq_enabled will turn * interrupts on before we're finished, so we could be interrupted before we - * return to userspace or wherever. Our solution to this is to surround the - * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the + * return to userspace or wherever. Our solution to this is to tell the * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. + * enabled. (It's not necessary to protect pop instruction, since + * data gets updated only after it completes, so we only need to protect + * one instruction, iret). */ ENTRY(lguest_iret) - pushl %eax - movl 12(%esp), %eax -lguest_noirq_start: + pushl 2*4(%esp) /* * Note the %ss: segment prefix here. Normal data accesses use the * "ds" segment, but that will have already been restored for whatever * we're returning to (such as userspace): we can't trust it. The %ss: * prefix makes sure we use the stack segment, which is still valid. */ - movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled - popl %eax + popl %ss:lguest_data+LGUEST_DATA_irq_enabled +lguest_noirq_iret: iret -lguest_noirq_end: diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 1f33b3d1fd68..0a42327a59d7 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -82,7 +82,7 @@ copy_user_handle_tail(char *to, char *from, unsigned len) clac(); /* If the destination is a kernel buffer, we always clear the end */ - if ((unsigned long)to >= TASK_SIZE_MAX) + if (!__addr_ok(to)) memset(to, 0, len); return len; } diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 5ead4d6cf3a7..70e7444c6835 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -351,18 +351,20 @@ int arch_ioremap_pmd_supported(void) */ void *xlate_dev_mem_ptr(phys_addr_t phys) { - void *addr; - unsigned long start = phys & PAGE_MASK; + unsigned long start = phys & PAGE_MASK; + unsigned long offset = phys & ~PAGE_MASK; + unsigned long vaddr; /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ if (page_is_ram(start >> PAGE_SHIFT)) return __va(phys); - addr = (void __force *)ioremap_cache(start, PAGE_SIZE); - if (addr) - addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); + vaddr = (unsigned long)ioremap_cache(start, PAGE_SIZE); + /* Only add the offset on success and return NULL if the ioremap() failed: */ + if (vaddr) + vaddr += offset; - return addr; + return (void *)vaddr; } void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 987514396c1e..579a8fd74be0 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ #include <linux/filter.h> #include <linux/if_vlan.h> #include <asm/cacheflush.h> +#include <linux/bpf.h> int bpf_jit_enable __read_mostly; @@ -37,7 +38,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) return ptr + len; } -#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0) +#define EMIT(bytes, len) \ + do { prog = emit_code(prog, bytes, len); cnt += len; } while (0) #define EMIT1(b1) EMIT(b1, 1) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) @@ -186,31 +188,31 @@ struct jit_context { #define BPF_MAX_INSN_SIZE 128 #define BPF_INSN_SAFETY 64 -static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, - int oldproglen, struct jit_context *ctx) +#define STACKSIZE \ + (MAX_BPF_STACK + \ + 32 /* space for rbx, r13, r14, r15 */ + \ + 8 /* space for skb_copy_bits() buffer */) + +#define PROLOGUE_SIZE 51 + +/* emit x64 prologue code for BPF program and check it's size. + * bpf_tail_call helper will skip it while jumping into another program + */ +static void emit_prologue(u8 **pprog) { - struct bpf_insn *insn = bpf_prog->insnsi; - int insn_cnt = bpf_prog->len; - bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); - bool seen_exit = false; - u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; - int i; - int proglen = 0; - u8 *prog = temp; - int stacksize = MAX_BPF_STACK + - 32 /* space for rbx, r13, r14, r15 */ + - 8 /* space for skb_copy_bits() buffer */; + u8 *prog = *pprog; + int cnt = 0; EMIT1(0x55); /* push rbp */ EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */ - /* sub rsp, stacksize */ - EMIT3_off32(0x48, 0x81, 0xEC, stacksize); + /* sub rsp, STACKSIZE */ + EMIT3_off32(0x48, 0x81, 0xEC, STACKSIZE); /* all classic BPF filters use R6(rbx) save it */ /* mov qword ptr [rbp-X],rbx */ - EMIT3_off32(0x48, 0x89, 0x9D, -stacksize); + EMIT3_off32(0x48, 0x89, 0x9D, -STACKSIZE); /* bpf_convert_filter() maps classic BPF register X to R7 and uses R8 * as temporary, so all tcpdump filters need to spill/fill R7(r13) and @@ -221,16 +223,112 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, */ /* mov qword ptr [rbp-X],r13 */ - EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8); + EMIT3_off32(0x4C, 0x89, 0xAD, -STACKSIZE + 8); /* mov qword ptr [rbp-X],r14 */ - EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16); + EMIT3_off32(0x4C, 0x89, 0xB5, -STACKSIZE + 16); /* mov qword ptr [rbp-X],r15 */ - EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24); + EMIT3_off32(0x4C, 0x89, 0xBD, -STACKSIZE + 24); /* clear A and X registers */ EMIT2(0x31, 0xc0); /* xor eax, eax */ EMIT3(0x4D, 0x31, 0xED); /* xor r13, r13 */ + /* clear tail_cnt: mov qword ptr [rbp-X], rax */ + EMIT3_off32(0x48, 0x89, 0x85, -STACKSIZE + 32); + + BUILD_BUG_ON(cnt != PROLOGUE_SIZE); + *pprog = prog; +} + +/* generate the following code: + * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ... + * if (index >= array->map.max_entries) + * goto out; + * if (++tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + * prog = array->prog[index]; + * if (prog == NULL) + * goto out; + * goto *(prog->bpf_func + prologue_size); + * out: + */ +static void emit_bpf_tail_call(u8 **pprog) +{ + u8 *prog = *pprog; + int label1, label2, label3; + int cnt = 0; + + /* rdi - pointer to ctx + * rsi - pointer to bpf_array + * rdx - index in bpf_array + */ + + /* if (index >= array->map.max_entries) + * goto out; + */ + EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + offsetof(struct bpf_array, map.max_entries)); + EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ +#define OFFSET1 44 /* number of bytes to jump */ + EMIT2(X86_JBE, OFFSET1); /* jbe out */ + label1 = cnt; + + /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) + * goto out; + */ + EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ + EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ +#define OFFSET2 33 + EMIT2(X86_JA, OFFSET2); /* ja out */ + label2 = cnt; + EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ + EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ + + /* prog = array->prog[index]; */ + EMIT4(0x48, 0x8D, 0x44, 0xD6); /* lea rax, [rsi + rdx * 8 + 0x50] */ + EMIT1(offsetof(struct bpf_array, prog)); + EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ + + /* if (prog == NULL) + * goto out; + */ + EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */ +#define OFFSET3 10 + EMIT2(X86_JE, OFFSET3); /* je out */ + label3 = cnt; + + /* goto *(prog->bpf_func + prologue_size); */ + EMIT4(0x48, 0x8B, 0x40, /* mov rax, qword ptr [rax + 32] */ + offsetof(struct bpf_prog, bpf_func)); + EMIT4(0x48, 0x83, 0xC0, PROLOGUE_SIZE); /* add rax, prologue_size */ + + /* now we're ready to jump into next BPF program + * rdi == ctx (1st arg) + * rax == prog->bpf_func + prologue_size + */ + EMIT2(0xFF, 0xE0); /* jmp rax */ + + /* out: */ + BUILD_BUG_ON(cnt - label1 != OFFSET1); + BUILD_BUG_ON(cnt - label2 != OFFSET2); + BUILD_BUG_ON(cnt - label3 != OFFSET3); + *pprog = prog; +} + +static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, + int oldproglen, struct jit_context *ctx) +{ + struct bpf_insn *insn = bpf_prog->insnsi; + int insn_cnt = bpf_prog->len; + bool seen_ld_abs = ctx->seen_ld_abs | (oldproglen == 0); + bool seen_exit = false; + u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY]; + int i, cnt = 0; + int proglen = 0; + u8 *prog = temp; + + emit_prologue(&prog); + if (seen_ld_abs) { /* r9d : skb->len - skb->data_len (headlen) * r10 : skb->data @@ -559,6 +657,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, if (is_ereg(dst_reg)) EMIT1(0x41); EMIT3(0xC1, add_1reg(0xC8, dst_reg), 8); + + /* emit 'movzwl eax, ax' */ + if (is_ereg(dst_reg)) + EMIT3(0x45, 0x0F, 0xB7); + else + EMIT2(0x0F, 0xB7); + EMIT1(add_2reg(0xC0, dst_reg, dst_reg)); break; case 32: /* emit 'bswap eax' to swap lower 4 bytes */ @@ -577,6 +682,27 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, break; case BPF_ALU | BPF_END | BPF_FROM_LE: + switch (imm32) { + case 16: + /* emit 'movzwl eax, ax' to zero extend 16-bit + * into 64 bit + */ + if (is_ereg(dst_reg)) + EMIT3(0x45, 0x0F, 0xB7); + else + EMIT2(0x0F, 0xB7); + EMIT1(add_2reg(0xC0, dst_reg, dst_reg)); + break; + case 32: + /* emit 'mov eax, eax' to clear upper 32-bits */ + if (is_ereg(dst_reg)) + EMIT1(0x45); + EMIT2(0x89, add_2reg(0xC0, dst_reg, dst_reg)); + break; + case 64: + /* nop */ + break; + } break; /* ST: *(u8*)(dst_reg + off) = imm */ @@ -711,6 +837,10 @@ xadd: if (is_imm8(insn->off)) } break; + case BPF_JMP | BPF_CALL | BPF_X: + emit_bpf_tail_call(&prog); + break; + /* cond jump */ case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JNE | BPF_X: @@ -863,13 +993,13 @@ common_load: /* update cleanup_addr */ ctx->cleanup_addr = proglen; /* mov rbx, qword ptr [rbp-X] */ - EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize); + EMIT3_off32(0x48, 0x8B, 0x9D, -STACKSIZE); /* mov r13, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8); + EMIT3_off32(0x4C, 0x8B, 0xAD, -STACKSIZE + 8); /* mov r14, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16); + EMIT3_off32(0x4C, 0x8B, 0xB5, -STACKSIZE + 16); /* mov r15, qword ptr [rbp-X] */ - EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24); + EMIT3_off32(0x4C, 0x8B, 0xBD, -STACKSIZE + 24); EMIT1(0xC9); /* leave */ EMIT1(0xC3); /* ret */ @@ -938,7 +1068,12 @@ void bpf_int_jit_compile(struct bpf_prog *prog) } ctx.cleanup_addr = proglen; - for (pass = 0; pass < 10; pass++) { + /* JITed image shrinks with every pass and the loop iterates + * until the image stops shrinking. Very large bpf programs + * may converge on the last pass. In such case do one more + * pass to emit the final image + */ + for (pass = 0; pass < 10 || image; pass++) { proglen = do_jit(prog, addrs, image, oldproglen, &ctx); if (proglen <= 0) { image = NULL; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index e4695985f9de..14a63ed6fe09 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -325,6 +325,26 @@ static void release_pci_root_info(struct pci_host_bridge *bridge) kfree(info); } +/* + * An IO port or MMIO resource assigned to a PCI host bridge may be + * consumed by the host bridge itself or available to its child + * bus/devices. The ACPI specification defines a bit (Producer/Consumer) + * to tell whether the resource is consumed by the host bridge itself, + * but firmware hasn't used that bit consistently, so we can't rely on it. + * + * On x86 and IA64 platforms, all IO port and MMIO resources are assumed + * to be available to child bus/devices except one special case: + * IO port [0xCF8-0xCFF] is consumed by the host bridge itself + * to access PCI configuration space. + * + * So explicitly filter out PCI CFG IO ports[0xCF8-0xCFF]. + */ +static bool resource_is_pcicfg_ioport(struct resource *res) +{ + return (res->flags & IORESOURCE_IO) && + res->start == 0xCF8 && res->end == 0xCFF; +} + static void probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, int busnum, int domain, @@ -346,8 +366,8 @@ static void probe_pci_root_info(struct pci_root_info *info, "no IO and memory resources present in _CRS\n"); else resource_list_for_each_entry_safe(entry, tmp, list) { - if ((entry->res->flags & IORESOURCE_WINDOW) == 0 || - (entry->res->flags & IORESOURCE_DISABLED)) + if ((entry->res->flags & IORESOURCE_DISABLED) || + resource_is_pcicfg_ioport(entry->res)) resource_list_destroy_entry(entry); else entry->res->name = info->name; @@ -462,9 +482,16 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root) int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge) { - struct pci_sysdata *sd = bridge->bus->sysdata; - - ACPI_COMPANION_SET(&bridge->dev, sd->companion); + /* + * We pass NULL as parent to pci_create_root_bus(), so if it is not NULL + * here, pci_create_root_bus() has been called by someone else and + * sysdata is likely to be different from what we expect. Let it go in + * that case. + */ + if (!bridge->dev.parent) { + struct pci_sysdata *sd = bridge->bus->sysdata; + ACPI_COMPANION_SET(&bridge->dev, sd->companion); + } return 0; } diff --git a/arch/x86/platform/intel-mid/Makefile b/arch/x86/platform/intel-mid/Makefile index 0a8ee703b9fa..0ce1b1913673 100644 --- a/arch/x86/platform/intel-mid/Makefile +++ b/arch/x86/platform/intel-mid/Makefile @@ -1,5 +1,4 @@ obj-$(CONFIG_X86_INTEL_MID) += intel-mid.o intel_mid_vrtc.o mfld.o mrfl.o -obj-$(CONFIG_EARLY_PRINTK_INTEL_MID) += early_printk_intel_mid.o # SFI specific code ifdef CONFIG_X86_INTEL_MID diff --git a/arch/x86/platform/intel-mid/early_printk_intel_mid.c b/arch/x86/platform/intel-mid/early_printk_intel_mid.c deleted file mode 100644 index 4e720829ab90..000000000000 --- a/arch/x86/platform/intel-mid/early_printk_intel_mid.c +++ /dev/null @@ -1,112 +0,0 @@ -/* - * early_printk_intel_mid.c - early consoles for Intel MID platforms - * - * Copyright (c) 2008-2010, Intel Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; version 2 - * of the License. - */ - -/* - * This file implements early console named hsu. - * hsu is based on a High Speed UART device which only exists in the Medfield - * platform - */ - -#include <linux/serial_reg.h> -#include <linux/serial_mfd.h> -#include <linux/console.h> -#include <linux/kernel.h> -#include <linux/delay.h> -#include <linux/io.h> - -#include <asm/fixmap.h> -#include <asm/pgtable.h> -#include <asm/intel-mid.h> - -/* - * Following is the early console based on Medfield HSU (High - * Speed UART) device. - */ -#define HSU_PORT_BASE 0xffa28080 - -static void __iomem *phsu; - -void hsu_early_console_init(const char *s) -{ - unsigned long paddr, port = 0; - u8 lcr; - - /* - * Select the early HSU console port if specified by user in the - * kernel command line. - */ - if (*s && !kstrtoul(s, 10, &port)) - port = clamp_val(port, 0, 2); - - paddr = HSU_PORT_BASE + port * 0x80; - phsu = (void __iomem *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE, paddr); - - /* Disable FIFO */ - writeb(0x0, phsu + UART_FCR); - - /* Set to default 115200 bps, 8n1 */ - lcr = readb(phsu + UART_LCR); - writeb((0x80 | lcr), phsu + UART_LCR); - writeb(0x18, phsu + UART_DLL); - writeb(lcr, phsu + UART_LCR); - writel(0x3600, phsu + UART_MUL*4); - - writeb(0x8, phsu + UART_MCR); - writeb(0x7, phsu + UART_FCR); - writeb(0x3, phsu + UART_LCR); - - /* Clear IRQ status */ - readb(phsu + UART_LSR); - readb(phsu + UART_RX); - readb(phsu + UART_IIR); - readb(phsu + UART_MSR); - - /* Enable FIFO */ - writeb(0x7, phsu + UART_FCR); -} - -#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE) - -static void early_hsu_putc(char ch) -{ - unsigned int timeout = 10000; /* 10ms */ - u8 status; - - while (--timeout) { - status = readb(phsu + UART_LSR); - if (status & BOTH_EMPTY) - break; - udelay(1); - } - - /* Only write the char when there was no timeout */ - if (timeout) - writeb(ch, phsu + UART_TX); -} - -static void early_hsu_write(struct console *con, const char *str, unsigned n) -{ - int i; - - for (i = 0; i < n && *str; i++) { - if (*str == '\n') - early_hsu_putc('\r'); - early_hsu_putc(*str); - str++; - } -} - -struct console early_hsu_console = { - .name = "earlyhsu", - .write = early_hsu_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 275a3a8b78af..e97032069f88 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -51,7 +51,7 @@ VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ $(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) -HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi +HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/x86/include/uapi hostprogs-y += vdso2c quiet_cmd_vdso2c = VDSO2C $@ diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 40d2473836c9..9793322751e0 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode) cycle_t ret; u64 last; u32 version; - u32 migrate_count; u8 flags; unsigned cpu, cpu1; /* - * When looping to get a consistent (time-info, tsc) pair, we - * also need to deal with the possibility we can switch vcpus, - * so make sure we always re-fetch time-info for the current vcpu. + * Note: hypervisor must guarantee that: + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. + * 2. that per-CPU pvclock time info is updated if the + * underlying CPU changes. + * 3. that version is increased whenever underlying CPU + * changes. + * */ do { cpu = __getcpu() & VGETCPU_CPU_MASK; @@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode) * __getcpu() calls (Gleb). */ - /* Make sure migrate_count will change if we leave the VCPU. */ - do { - pvti = get_pvti(cpu); - migrate_count = pvti->migrate_count; - - cpu1 = cpu; - cpu = __getcpu() & VGETCPU_CPU_MASK; - } while (unlikely(cpu != cpu1)); + pvti = get_pvti(cpu); version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); /* * Test we're still on the cpu as well as the version. - * - We must read TSC of pvti's VCPU. - * - KVM doesn't follow the versioning protocol, so data could - * change before version if we left the VCPU. + * We could have been migrated just after the first + * vgetcpu but before fetching the version, so we + * wouldn't notice a version change. */ - smp_rmb(); - } while (unlikely((pvti->pvti.version & 1) || - pvti->pvti.version != version || - pvti->migrate_count != migrate_count)); + cpu1 = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1 || + (pvti->pvti.version & 1) || + pvti->pvti.version != version)); if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) *mode = VCLOCK_NONE; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 94578efd3067..46957ead3060 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1760,6 +1760,9 @@ static struct notifier_block xen_hvm_cpu_notifier = { static void __init xen_hvm_guest_init(void) { + if (xen_pv_domain()) + return; + init_hvm_pv_info(); xen_hvm_init_shared_info(); @@ -1775,6 +1778,7 @@ static void __init xen_hvm_guest_init(void) xen_hvm_init_time_ops(); xen_hvm_init_mmu_ops(); } +#endif static bool xen_nopv = false; static __init int xen_parse_nopv(char *arg) @@ -1784,14 +1788,11 @@ static __init int xen_parse_nopv(char *arg) } early_param("xen_nopv", xen_parse_nopv); -static uint32_t __init xen_hvm_platform(void) +static uint32_t __init xen_platform(void) { if (xen_nopv) return 0; - if (xen_pv_domain()) - return 0; - return xen_cpuid_base(); } @@ -1809,11 +1810,19 @@ bool xen_hvm_need_lapic(void) } EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); -const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = { - .name = "Xen HVM", - .detect = xen_hvm_platform, +static void xen_set_cpu_features(struct cpuinfo_x86 *c) +{ + if (xen_pv_domain()) + clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); +} + +const struct hypervisor_x86 x86_hyper_xen = { + .name = "Xen", + .detect = xen_platform, +#ifdef CONFIG_XEN_PVHVM .init_platform = xen_hvm_guest_init, +#endif .x2apic_available = xen_x2apic_para_available, + .set_cpu_features = xen_set_cpu_features, }; -EXPORT_SYMBOL(x86_hyper_xen_hvm); -#endif +EXPORT_SYMBOL(x86_hyper_xen); diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index d9497698645a..53b4c0811f4f 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -88,7 +88,17 @@ static void xen_vcpu_notify_restore(void *data) tick_resume_local(); } +static void xen_vcpu_notify_suspend(void *data) +{ + tick_suspend_local(); +} + void xen_arch_resume(void) { on_each_cpu(xen_vcpu_notify_restore, NULL, 1); } + +void xen_arch_suspend(void) +{ + on_each_cpu(xen_vcpu_notify_suspend, NULL, 1); +} |