Diffstat (limited to 'arch/x86')
127 files changed, 1213 insertions, 1030 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f8958b01b975..140e254fe546 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -123,7 +123,7 @@ config NEED_SG_DMA_LENGTH def_bool y config GENERIC_ISA_DMA - def_bool y + def_bool ISA_DMA_API config GENERIC_IOMAP def_bool y @@ -143,7 +143,7 @@ config GENERIC_GPIO bool config ARCH_MAY_HAVE_PC_FDC - def_bool y + def_bool ISA_DMA_API config RWSEM_GENERIC_SPINLOCK def_bool !X86_XADD @@ -221,10 +221,6 @@ config X86_HT def_bool y depends on SMP -config X86_TRAMPOLINE - def_bool y - depends on SMP || (64BIT && ACPI_SLEEP) - config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR @@ -2006,9 +2002,13 @@ source "drivers/pci/pcie/Kconfig" source "drivers/pci/Kconfig" -# x86_64 have no ISA slots, but do have ISA-style DMA. +# x86_64 have no ISA slots, but can have ISA-style DMA. config ISA_DMA_API - def_bool y + bool "ISA-style DMA support" if (X86_64 && EXPERT) + default y + help + Enables ISA-style DMA support for devices requiring such controllers. + If unsure, say Y. if X86_32 @@ -2096,6 +2096,16 @@ source "drivers/pcmcia/Kconfig" source "drivers/pci/hotplug/Kconfig" +config RAPIDIO + bool "RapidIO support" + depends on PCI + default n + help + If you say Y here, the kernel will include drivers and + infrastructure code to support RapidIO interconnect devices. + +source "drivers/rapidio/Kconfig" + endmenu @@ -2130,6 +2140,11 @@ config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC +config KEYS_COMPAT + bool + depends on COMPAT && KEYS + default y + endmenu diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index ed47e6e1747f..d161e939df62 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -326,7 +326,7 @@ config X86_PPRO_FENCE Old PentiumPro multiprocessor systems had errata that could cause memory operations to violate the x86 ordering standard in rare cases. Enabling this option will attempt to work around some (but not all) - occurances of this problem, at the cost of much heavier spinlock and + occurrences of this problem, at the cost of much heavier spinlock and memory barrier operations. If unsure, say n here. 
Even distro kernels should think twice before @@ -366,7 +366,7 @@ config X86_INTEL_USERCOPY config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM config X86_USE_3DNOW def_bool y diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 8fe2a4966b7a..adcf794b22e2 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -1346,7 +1346,7 @@ _zero_cipher_left_decrypt: and $15, %r13 # %r13 = arg4 (mod 16) je _multiple_of_16_bytes_decrypt - # Handle the last <16 byte block seperately + # Handle the last <16 byte block separately paddd ONE(%rip), %xmm0 # increment CNT to get Yn movdqa SHUF_MASK(%rip), %xmm10 @@ -1355,7 +1355,7 @@ _zero_cipher_left_decrypt: ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) sub $16, %r11 add %r13, %r11 - movdqu (%arg3,%r11,1), %xmm1 # recieve the last <16 byte block + movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block lea SHIFT_MASK+16(%rip), %r12 sub %r13, %r12 # adjust the shuffle mask pointer to be able to shift 16-%r13 bytes @@ -1607,7 +1607,7 @@ _zero_cipher_left_encrypt: and $15, %r13 # %r13 = arg4 (mod 16) je _multiple_of_16_bytes_encrypt - # Handle the last <16 Byte block seperately + # Handle the last <16 Byte block separately paddd ONE(%rip), %xmm0 # INCR CNT to get Yn movdqa SHUF_MASK(%rip), %xmm10 PSHUFB_XMM %xmm10, %xmm0 diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index e1e60c7d5813..e0e6340c8dad 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -873,22 +873,18 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) crypto_ablkcipher_clear_flags(ctr_tfm, ~0); ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); - if (ret) { - crypto_free_ablkcipher(ctr_tfm); - return ret; - } + if (ret) + goto out_free_ablkcipher; + ret = -ENOMEM; req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL); - if (!req) { - crypto_free_ablkcipher(ctr_tfm); - return -EINVAL; - } + if (!req) + goto out_free_ablkcipher; req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); - if (!req_data) { - crypto_free_ablkcipher(ctr_tfm); - return -ENOMEM; - } + if (!req_data) + goto out_free_request; + memset(req_data->iv, 0, sizeof(req_data->iv)); /* Clear the data in the hash sub key container to zero.*/ @@ -913,8 +909,10 @@ rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) if (!ret) ret = req_data->result.err; } - ablkcipher_request_free(req); kfree(req_data); +out_free_request: + ablkcipher_request_free(req); +out_free_ablkcipher: crypto_free_ablkcipher(ctr_tfm); return ret; } diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 2d93bdbc9ac0..fd843877e841 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -298,6 +298,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* OK, This is the point of no return */ set_personality(PER_LINUX); set_thread_flag(TIF_IA32); + current->mm->context.ia32_compat = 1; setup_new_exec(bprm); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 430312ba6e3f..849a9d23c71d 100644 --- 
a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -847,4 +847,5 @@ ia32_sys_call_table: .quad sys_name_to_handle_at .quad compat_sys_open_by_handle_at .quad compat_sys_clock_adjtime + .quad sys_syncfs ia32_syscall_end: diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index b964ec457546..12e0e7dd869c 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -29,6 +29,7 @@ #include <asm/processor.h> #include <asm/mmu.h> #include <asm/mpspec.h> +#include <asm/trampoline.h> #define COMPILER_DEPENDENT_INT64 long long #define COMPILER_DEPENDENT_UINT64 unsigned long long @@ -113,11 +114,11 @@ static inline void acpi_disable_pci(void) acpi_noirq_set(); } -/* routines for saving/restoring kernel state */ -extern int acpi_save_state_mem(void); -extern void acpi_restore_state_mem(void); +/* Low-level suspend routine. */ +extern int acpi_suspend_lowlevel(void); -extern unsigned long acpi_wakeup_address; +extern const unsigned char acpi_wakeup_code[]; +#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code))) /* early initialization routine */ extern void acpi_reserve_wakeup_memory(void); diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index e264ae5a1443..331682231bb4 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -13,7 +13,7 @@ extern const struct pci_device_id amd_nb_misc_ids[]; extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; struct bootnode; -extern int early_is_amd_nb(u32 value); +extern bool early_is_amd_nb(u32 value); extern int amd_cache_northbridges(void); extern void amd_flush_garts(void); extern int amd_numa_init(void); @@ -32,18 +32,18 @@ struct amd_northbridge_info { }; extern struct amd_northbridge_info amd_northbridges; -#define AMD_NB_GART 0x1 -#define AMD_NB_L3_INDEX_DISABLE 0x2 -#define AMD_NB_L3_PARTITIONING 0x4 +#define AMD_NB_GART BIT(0) +#define AMD_NB_L3_INDEX_DISABLE BIT(1) +#define AMD_NB_L3_PARTITIONING BIT(2) #ifdef CONFIG_AMD_NB -static inline int amd_nb_num(void) +static inline u16 amd_nb_num(void) { return amd_northbridges.num; } -static inline int amd_nb_has_feature(int feature) +static inline bool amd_nb_has_feature(unsigned feature) { return ((amd_northbridges.flags & feature) == feature); } diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 903683b07e42..69d58131bc8e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -456,14 +456,12 @@ static inline int fls(int x) #ifdef __KERNEL__ -#include <asm-generic/bitops/ext2-non-atomic.h> +#include <asm-generic/bitops/le.h> #define ext2_set_bit_atomic(lock, nr, addr) \ test_and_set_bit((nr), (unsigned long *)(addr)) #define ext2_clear_bit_atomic(lock, nr, addr) \ test_and_clear_bit((nr), (unsigned long *)(addr)) -#include <asm-generic/bitops/minix.h> - #endif /* __KERNEL__ */ #endif /* _ASM_X86_BITOPS_H */ diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 62f084478f7e..4e12668711e5 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -71,7 +71,7 @@ static inline void set_page_memtype(struct page *pg, unsigned long memtype) { } * Read/Write : ReadOnly, ReadWrite * Presence : NotPresent * - * Within a catagory, the attributes are mutually exclusive. + * Within a category, the attributes are mutually exclusive. 
* * The implementation of this API will take care of various aspects that * are associated with changing such attributes, such as: diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index ca1098a7e580..97b6d8114a43 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -151,6 +151,7 @@ #define DMA_AUTOINIT 0x10 +#ifdef CONFIG_ISA_DMA_API extern spinlock_t dma_spin_lock; static inline unsigned long claim_dma_lock(void) @@ -164,6 +165,7 @@ static inline void release_dma_lock(unsigned long flags) { spin_unlock_irqrestore(&dma_spin_lock, flags); } +#endif /* CONFIG_ISA_DMA_API */ /* enable/disable a specific DMA channel */ static inline void enable_dma(unsigned int dmanr) @@ -303,9 +305,11 @@ static inline int get_dma_residue(unsigned int dmanr) } -/* These are in kernel/dma.c: */ +/* These are in kernel/dma.c because x86 uses CONFIG_GENERIC_ISA_DMA */ +#ifdef CONFIG_ISA_DMA_API extern int request_dma(unsigned int dmanr, const char *device_id); extern void free_dma(unsigned int dmanr); +#endif /* From PCI */ diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 518bbbb9ee59..fe2cc6e105fa 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -26,7 +26,7 @@ extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_registers(struct pt_regs *regs); extern void show_trace(struct task_struct *t, struct pt_regs *regs, - unsigned long *sp); + unsigned long *sp, unsigned long bp); extern void __show_regs(struct pt_regs *regs, int all); extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 8e37deb1eb38..0f5213564326 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -142,9 +142,9 @@ struct x86_emulate_ops { int (*pio_out_emulated)(int size, unsigned short port, const void *val, unsigned int count, struct kvm_vcpu *vcpu); - bool (*get_cached_descriptor)(struct desc_struct *desc, + bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3, int seg, struct kvm_vcpu *vcpu); - void (*set_cached_descriptor)(struct desc_struct *desc, + void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3, int seg, struct kvm_vcpu *vcpu); u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); @@ -239,6 +239,7 @@ struct x86_emulate_ctxt { int interruptibility; bool perm_ok; /* do not check permissions if true */ + bool only_vendor_specific_insn; bool have_exception; struct x86_exception exception; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ffd7f8d29187..c8af0991fdf0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -85,7 +85,7 @@ #define ASYNC_PF_PER_VCPU 64 -extern spinlock_t kvm_lock; +extern raw_spinlock_t kvm_lock; extern struct list_head vm_list; struct kvm_vcpu; @@ -255,6 +255,8 @@ struct kvm_mmu { int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); + void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, const void *pte, unsigned long mmu_seq); hpa_t root_hpa; int root_level; int shadow_root_level; @@ -335,12 +337,6 @@ struct kvm_vcpu_arch { u64 *last_pte_updated; gfn_t last_pte_gfn; - struct { - gfn_t gfn; /* presumed gfn during 
guest pte update */ - pfn_t pfn; /* pfn corresponding to that gfn */ - unsigned long mmu_seq; - } update_pte; - struct fpu guest_fpu; u64 xcr0; @@ -448,7 +444,7 @@ struct kvm_arch { unsigned long irq_sources_bitmap; s64 kvmclock_offset; - spinlock_t tsc_write_lock; + raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 80a1dee5bea5..aeff3e89b222 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -13,6 +13,12 @@ typedef struct { int size; struct mutex lock; void *vdso; + +#ifdef CONFIG_X86_64 + /* True if mm supports a task running in 32 bit compatibility mode. */ + unsigned short ia32_compat; +#endif + } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 823d48223400..fd5a1f365c95 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -43,6 +43,7 @@ #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 +#define MSR_IA32_BBL_CR_CTL3 0x0000011e #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 07f46016d3ff..4886a68f267e 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -29,8 +29,8 @@ void arch_trigger_all_cpu_backtrace(void); * external nmis, because the local ones are more frequent. * * Also setup some default high/normal/low settings for - * subsystems to registers with. Using 4 bits to seperate - * the priorities. This can go alot higher if needed be. + * subsystems to registers with. Using 4 bits to separate + * the priorities. This can go a lot higher if needed be. */ #define NMI_LOCAL_SHIFT 16 /* randomly picked */ diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h index 6d8723a766cc..af788496020b 100644 --- a/arch/x86/include/asm/nops.h +++ b/arch/x86/include/asm/nops.h @@ -38,7 +38,7 @@ #define K8_NOP8 K8_NOP4 K8_NOP4 /* K7 nops - uses eax dependencies (arbitary choice) + uses eax dependencies (arbitrary choice) 1: nop 2: movl %eax,%eax 3: leal (,%eax,1),%eax diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index f482010350fb..5ca6801b75f3 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -20,7 +20,7 @@ extern struct olpc_platform_t olpc_platform_info; /* * OLPC board IDs contain the major build number within the mask 0x0ff0, - * and the minor build number withing 0x000f. Pre-builds have a minor + * and the minor build number within 0x000f. Pre-builds have a minor * number less than 8, and normal builds start at 8. For example, 0x0B10 * is a PreB1, and 0x0C18 is a C1. 
*/ diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 7e172955ee57..a09e1f052d84 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -451,6 +451,26 @@ do { \ #define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) #endif /* !CONFIG_M386 */ +#ifdef CONFIG_X86_CMPXCHG64 +#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \ +({ \ + char __ret; \ + typeof(o1) __o1 = o1; \ + typeof(o1) __n1 = n1; \ + typeof(o2) __o2 = o2; \ + typeof(o2) __n2 = n2; \ + typeof(o2) __dummy = n2; \ + asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \ + : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \ + : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \ + __ret; \ +}) + +#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#endif /* CONFIG_X86_CMPXCHG64 */ + /* * Per cpu atomic 64 bit operations are only available under 64 bit. * 32 bit must fall back to generic operations. @@ -480,6 +500,34 @@ do { \ #define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) #define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval) #define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval) + +/* + * Pretty complex macro to generate cmpxchg16 instruction. The instruction + * is not supported on early AMD64 processors so we must be able to emulate + * it in software. The address used in the cmpxchg16 instruction must be + * aligned to a 16 byte boundary. + */ +#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \ +({ \ + char __ret; \ + typeof(o1) __o1 = o1; \ + typeof(o1) __n1 = n1; \ + typeof(o2) __o2 = o2; \ + typeof(o2) __n2 = n2; \ + typeof(o2) __dummy; \ + alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \ + "cmpxchg16b %%gs:(%%rsi)\n\tsetz %0\n\t", \ + X86_FEATURE_CX16, \ + ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ + "S" (&pcp1), "b"(__n1), "c"(__n2), \ + "a"(__o1), "d"(__o2)); \ + __ret; \ +}) + +#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) +#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) +#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) + #endif /* This is not atomic against other CPUs -- CPU preemption needs to be off */ diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h index cc29086e30cd..56fd9e3abbda 100644 --- a/arch/x86/include/asm/perf_event_p4.h +++ b/arch/x86/include/asm/perf_event_p4.h @@ -1,5 +1,5 @@ /* - * Netburst Perfomance Events (P4, old Xeon) + * Netburst Performance Events (P4, old Xeon) */ #ifndef PERF_EVENT_P4_H @@ -9,7 +9,7 @@ #include <linux/bitops.h> /* - * NetBurst has perfomance MSRs shared between + * NetBurst has performance MSRs shared between * threads if HT is turned on, ie for both logical * processors (mem: in turn in Atom with HT support * perf-MSRs are not shared and every thread has its diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 94b979d1b58d..effff47a3c82 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -69,8 +69,6 @@ static inline void 
native_pmd_clear(pmd_t *pmd) static inline void pud_clear(pud_t *pudp) { - unsigned long pgd; - set_pud(pudp, __pud(0)); /* @@ -79,13 +77,10 @@ static inline void pud_clear(pud_t *pudp) * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... * - * Make sure the pud entry we're updating is within the - * current pgd to avoid unnecessary TLB flushes. + * Currently all places where pud_clear() is called either have + * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or + * pud_clear_bad()), so we don't need TLB flush here. */ - pgd = read_cr3(); - if (__pa(pudp) >= pgd && __pa(pudp) < - (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) - write_cr3(pgd); } #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 7a3e836eb2a9..a898a2b6e10c 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -7,7 +7,7 @@ */ #define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ #define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ -#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ +#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ #define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ #define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ #define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h index 52b098a6eebb..7b0a55a88851 100644 --- a/arch/x86/include/asm/ptrace-abi.h +++ b/arch/x86/include/asm/ptrace-abi.h @@ -31,7 +31,7 @@ #define R12 24 #define RBP 32 #define RBX 40 -/* arguments: interrupts/non tracing syscalls only save upto here*/ +/* arguments: interrupts/non tracing syscalls only save up to here*/ #define R11 48 #define R10 56 #define R9 64 diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 78cd1ea94500..1babf8adecdf 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -73,7 +73,7 @@ struct pt_regs { unsigned long r12; unsigned long rbp; unsigned long rbx; -/* arguments: non interrupts/non tracing syscalls only save upto here*/ +/* arguments: non interrupts/non tracing syscalls only save up to here*/ unsigned long r11; unsigned long r10; unsigned long r9; @@ -103,7 +103,7 @@ struct pt_regs { unsigned long r12; unsigned long bp; unsigned long bx; -/* arguments: non interrupts/non tracing syscalls only save upto here*/ +/* arguments: non interrupts/non tracing syscalls only save up to here*/ unsigned long r11; unsigned long r10; unsigned long r9; diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 562d4fd31ba8..3250e3d605d9 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -18,7 +18,10 @@ extern struct machine_ops machine_ops; void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); -void machine_real_restart(const unsigned char *code, int length); +void machine_real_restart(unsigned int type); +/* These must match dispatch_table in reboot_32.S */ +#define MRR_BIOS 0 +#define MRR_APM 1 typedef void (*nmi_shootdown_cb)(int, struct die_args*); void nmi_shootdown_cpus(nmi_shootdown_cb callback); diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 231f1c1d6607..cd84f7208f76 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -1,14 +1,16 @@ #ifndef _ASM_X86_SEGMENT_H #define _ASM_X86_SEGMENT_H +#include <linux/const.h> + /* Constructor for a conventional 
segment GDT (or LDT) entry */ /* This is a macro so it can be used in initializers */ #define GDT_ENTRY(flags, base, limit) \ - ((((base) & 0xff000000ULL) << (56-24)) | \ - (((flags) & 0x0000f0ffULL) << 40) | \ - (((limit) & 0x000f0000ULL) << (48-16)) | \ - (((base) & 0x00ffffffULL) << 16) | \ - (((limit) & 0x0000ffffULL))) + ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ + (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ + (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \ + (((base) & _AC(0x00ffffff,ULL)) << 16) | \ + (((limit) & _AC(0x0000ffff,ULL)))) /* Simple and small GDT entries for booting only */ diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 52b5c7ed3608..d7e89c83645d 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -47,7 +47,7 @@ struct stacktrace_ops { }; void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data); #ifdef CONFIG_X86_32 @@ -86,11 +86,11 @@ stack_frame(struct task_struct *task, struct pt_regs *regs) extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl); + unsigned long *stack, unsigned long bp, char *log_lvl); extern void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl); + unsigned long *sp, unsigned long bp, char *log_lvl); extern unsigned int code_bytes; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index f0b6e5dbc5a0..1f2e61e28981 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -161,8 +161,14 @@ struct thread_info { #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR -#define alloc_thread_info(tsk) \ - ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) +#define alloc_thread_info_node(tsk, node) \ +({ \ + struct page *page = alloc_pages_node(node, THREAD_FLAGS, \ + THREAD_ORDER); \ + struct thread_info *ret = page ? page_address(page) : NULL; \ + \ + ret; \ +}) #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index f4500fb3b485..feca3118a73b 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -3,25 +3,36 @@ #ifndef __ASSEMBLY__ -#ifdef CONFIG_X86_TRAMPOLINE +#include <linux/types.h> +#include <asm/io.h> + /* - * Trampoline 80x86 program as an array. + * Trampoline 80x86 program as an array. These are in the init rodata + * segment, but that's okay, because we only care about the relative + * addresses of the symbols. 
*/ -extern const unsigned char trampoline_data []; -extern const unsigned char trampoline_end []; -extern unsigned char *trampoline_base; +extern const unsigned char x86_trampoline_start []; +extern const unsigned char x86_trampoline_end []; +extern unsigned char *x86_trampoline_base; extern unsigned long init_rsp; extern unsigned long initial_code; extern unsigned long initial_gs; -#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) +extern void __init setup_trampolines(void); + +extern const unsigned char trampoline_data[]; +extern const unsigned char trampoline_status[]; + +#define TRAMPOLINE_SYM(x) \ + ((void *)(x86_trampoline_base + \ + ((const unsigned char *)(x) - x86_trampoline_start))) -extern unsigned long setup_trampoline(void); -extern void __init reserve_trampoline_memory(void); -#else -static inline void reserve_trampoline_memory(void) {} -#endif /* CONFIG_X86_TRAMPOLINE */ +/* Address of the SMP trampoline */ +static inline unsigned long trampoline_address(void) +{ + return virt_to_phys(TRAMPOLINE_SYM(trampoline_data)); +} #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 1ca132fc0d03..83e2efd181e2 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -35,7 +35,7 @@ static inline cycles_t get_cycles(void) static __always_inline cycles_t vget_cycles(void) { /* - * We only do VDSOs on TSC capable CPUs, so this shouldnt + * We only do VDSOs on TSC capable CPUs, so this shouldn't * access boot_cpu_data (which is not VDSO-safe): */ #ifndef CONFIG_X86_TSC diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h index df1da20f4534..8e8c23fef08c 100644 --- a/arch/x86/include/asm/types.h +++ b/arch/x86/include/asm/types.h @@ -1,22 +1,6 @@ #ifndef _ASM_X86_TYPES_H #define _ASM_X86_TYPES_H -#define dma_addr_t dma_addr_t - #include <asm-generic/types.h> -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ - -typedef u64 dma64_addr_t; -#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G) -/* DMA addresses come in 32-bit and 64-bit flavours. 
*/ -typedef u64 dma_addr_t; -#else -typedef u32 dma_addr_t; -#endif - -#endif /* __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_TYPES_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index ffaf183c619a..a755ef5e5977 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -349,10 +349,11 @@ #define __NR_name_to_handle_at 341 #define __NR_open_by_handle_at 342 #define __NR_clock_adjtime 343 +#define __NR_syncfs 344 #ifdef __KERNEL__ -#define NR_syscalls 344 +#define NR_syscalls 345 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 5466bea670e7..160fa76bd578 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -675,6 +675,8 @@ __SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) #define __NR_clock_adjtime 305 __SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) +#define __NR_syncfs 306 +__SYSCALL(__NR_syncfs, sys_syncfs) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 1c10c88ee4e1..5d4922ad4b9b 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -86,7 +86,7 @@ DEFINE_GUEST_HANDLE(void); * The privilege level specifies which modes may enter a trap via a software * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate * privilege levels as follows: - * Level == 0: Noone may enter + * Level == 0: No one may enter * Level == 1: Kernel may enter * Level == 2: Kernel may enter * Level == 3: Everyone may enter diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 62445ba2f8a8..7338ef2218bc 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -41,13 +41,13 @@ obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o -obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o +obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o obj-y += pci-iommu_table.o obj-y += resource.o -obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-y += trampoline.o trampoline_$(BITS).o obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o @@ -55,10 +55,12 @@ obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o obj-$(CONFIG_INTEL_TXT) += tboot.o +obj-$(CONFIG_ISA_DMA_API) += i8237.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ obj-y += reboot.o +obj-$(CONFIG_X86_32) += reboot_32.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o @@ -69,7 +71,6 @@ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += smpboot.o obj-$(CONFIG_SMP) += tsc_sync.o obj-$(CONFIG_SMP) += setup_percpu.o -obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 28595d6df47c..ead21b663117 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -6,11 +6,17 @@ #include <asm/page_types.h> #include <asm/pgtable_types.h> #include 
<asm/processor-flags.h> +#include "wakeup.h" .code16 - .section ".header", "a" + .section ".jump", "ax" + .globl _start +_start: + cli + jmp wakeup_code /* This should match the structure in wakeup.h */ + .section ".header", "a" .globl wakeup_header wakeup_header: video_mode: .short 0 /* Video mode number */ @@ -30,14 +36,11 @@ wakeup_jmp: .byte 0xea /* ljmpw */ wakeup_jmp_off: .word 3f wakeup_jmp_seg: .word 0 wakeup_gdt: .quad 0, 0, 0 -signature: .long 0x51ee1111 +signature: .long WAKEUP_HEADER_SIGNATURE .text - .globl _start .code16 wakeup_code: -_start: - cli cld /* Apparently some dimwit BIOS programmers don't know how to @@ -77,12 +80,12 @@ _start: /* Check header signature... */ movl signature, %eax - cmpl $0x51ee1111, %eax + cmpl $WAKEUP_HEADER_SIGNATURE, %eax jne bogus_real_magic /* Check we really have everything... */ movl end_signature, %eax - cmpl $0x65a22c82, %eax + cmpl $WAKEUP_END_SIGNATURE, %eax jne bogus_real_magic /* Call the C code */ @@ -147,3 +150,7 @@ wakeup_heap: wakeup_stack: .space 2048 wakeup_stack_end: + + .section ".signature","a" +end_signature: + .long WAKEUP_END_SIGNATURE diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h index 69d38d0b2b64..e1828c07e79c 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.h +++ b/arch/x86/kernel/acpi/realmode/wakeup.h @@ -35,7 +35,8 @@ struct wakeup_header { extern struct wakeup_header wakeup_header; #endif -#define HEADER_OFFSET 0x3f00 -#define WAKEUP_SIZE 0x4000 +#define WAKEUP_HEADER_OFFSET 8 +#define WAKEUP_HEADER_SIGNATURE 0x51ee1111 +#define WAKEUP_END_SIGNATURE 0x65a22c82 #endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 060fff8f5c5b..d4f8010a5b1b 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -13,9 +13,19 @@ ENTRY(_start) SECTIONS { . = 0; + .jump : { + *(.jump) + } = 0x90909090 + + . = WAKEUP_HEADER_OFFSET; + .header : { + *(.header) + } + + . = ALIGN(16); .text : { *(.text*) - } + } = 0x90909090 . = ALIGN(16); .rodata : { @@ -33,11 +43,6 @@ SECTIONS *(.data*) } - .signature : { - end_signature = .; - LONG(0x65a22c82) - } - . = ALIGN(16); .bss : { __bss_start = .; @@ -45,20 +50,13 @@ SECTIONS __bss_end = .; } - . = HEADER_OFFSET; - .header : { - *(.header) + .signature : { + *(.signature) } - . = ALIGN(16); _end = .; /DISCARD/ : { *(.note*) } - - /* - * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: - */ - . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 68d1537b8c81..ff93bc1b09c3 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -18,37 +18,28 @@ #include "realmode/wakeup.h" #include "sleep.h" -unsigned long acpi_wakeup_address; unsigned long acpi_realmode_flags; -/* address in low memory of the wakeup routine. */ -static unsigned long acpi_realmode; - #if defined(CONFIG_SMP) && defined(CONFIG_64BIT) static char temp_stack[4096]; #endif /** - * acpi_save_state_mem - save kernel state + * acpi_suspend_lowlevel - save kernel state * * Create an identity mapped page table and copy the wakeup routine to * low memory. - * - * Note that this is too late to change acpi_wakeup_address. */ -int acpi_save_state_mem(void) +int acpi_suspend_lowlevel(void) { struct wakeup_header *header; + /* address in low memory of the wakeup routine. 
*/ + char *acpi_realmode; - if (!acpi_realmode) { - printk(KERN_ERR "Could not allocate memory during boot, " - "S3 disabled\n"); - return -ENOMEM; - } - memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE); + acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code); - header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET); - if (header->signature != 0x51ee1111) { + header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET); + if (header->signature != WAKEUP_HEADER_SIGNATURE) { printk(KERN_ERR "wakeup header does not match\n"); return -EINVAL; } @@ -68,9 +59,7 @@ int acpi_save_state_mem(void) /* GDT[0]: GDT self-pointer */ header->wakeup_gdt[0] = (u64)(sizeof(header->wakeup_gdt) - 1) + - ((u64)(acpi_wakeup_address + - ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) - << 16); + ((u64)__pa(&header->wakeup_gdt) << 16); /* GDT[1]: big real mode-like code segment */ header->wakeup_gdt[1] = GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); @@ -96,7 +85,7 @@ int acpi_save_state_mem(void) header->pmode_cr3 = (u32)__pa(&initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ - header->trampoline_segment = setup_trampoline() >> 4; + header->trampoline_segment = trampoline_address() >> 4; #ifdef CONFIG_SMP stack_start = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = @@ -107,56 +96,10 @@ int acpi_save_state_mem(void) saved_magic = 0x123456789abcdef0L; #endif /* CONFIG_64BIT */ + do_suspend_lowlevel(); return 0; } -/* - * acpi_restore_state - undo effects of acpi_save_state_mem - */ -void acpi_restore_state_mem(void) -{ -} - - -/** - * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation - * - * We allocate a page from the first 1MB of memory for the wakeup - * routine for when we come back from a sleep state. The - * runtime allocator allows specification of <16MB pages, but not - * <1MB pages. 
- */ -void __init acpi_reserve_wakeup_memory(void) -{ - phys_addr_t mem; - - if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { - printk(KERN_ERR - "ACPI: Wakeup code way too big, S3 disabled.\n"); - return; - } - - mem = memblock_find_in_range(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE); - - if (mem == MEMBLOCK_ERROR) { - printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); - return; - } - acpi_realmode = (unsigned long) phys_to_virt(mem); - acpi_wakeup_address = mem; - memblock_x86_reserve_range(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP"); -} - -int __init acpi_configure_wakeup_memory(void) -{ - if (acpi_realmode) - set_memory_x(acpi_realmode, WAKEUP_SIZE >> PAGE_SHIFT); - - return 0; -} -arch_initcall(acpi_configure_wakeup_memory); - - static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h index adbcbaa6f1df..416d4be13fef 100644 --- a/arch/x86/kernel/acpi/sleep.h +++ b/arch/x86/kernel/acpi/sleep.h @@ -4,13 +4,12 @@ #include <asm/trampoline.h> -extern char wakeup_code_start, wakeup_code_end; - extern unsigned long saved_video_mode; extern long saved_magic; extern int wakeup_pmode_return; -extern char swsusp_pg_dir[PAGE_SIZE]; extern unsigned long acpi_copy_wakeup_routine(unsigned long); extern void wakeup_long64(void); + +extern void do_suspend_lowlevel(void); diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S index 6ff3b5730575..63b8ab524f2c 100644 --- a/arch/x86/kernel/acpi/wakeup_rm.S +++ b/arch/x86/kernel/acpi/wakeup_rm.S @@ -2,9 +2,11 @@ * Wrapper script for the realmode binary as a transport object * before copying to low memory. */ - .section ".rodata","a" - .globl wakeup_code_start, wakeup_code_end -wakeup_code_start: +#include <asm/page_types.h> + + .section ".x86_trampoline","a" + .balign PAGE_SIZE + .globl acpi_wakeup_code +acpi_wakeup_code: .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin" -wakeup_code_end: - .size wakeup_code_start, .-wakeup_code_start + .size acpi_wakeup_code, .-acpi_wakeup_code diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7038b95d363f..4a234677e213 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -199,7 +199,7 @@ void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that assymetric systems where + self modifying code. This implies that asymmetric systems where APs have less capabilities than the boot processor are not handled. Tough. Make sure you disable such features by hand. */ @@ -620,7 +620,12 @@ static int __kprobes stop_machine_text_poke(void *data) flush_icache_range((unsigned long)p->addr, (unsigned long)p->addr + p->len); } - + /* + * Intel Archiecture Software Developer's Manual section 7.1.3 specifies + * that a core serializing instruction such as "cpuid" should be + * executed on _each_ core before the new instruction is made visible. 
+ */ + sync_core(); return 0; } diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index ed3c2e5b714a..6801959a8b2a 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -15,7 +15,7 @@ static u32 *flush_words; const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, {} }; EXPORT_SYMBOL(amd_nb_misc_ids); @@ -48,7 +48,7 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev, int amd_cache_northbridges(void) { - int i = 0; + u16 i = 0; struct amd_northbridge *nb; struct pci_dev *misc, *link; @@ -103,9 +103,11 @@ int amd_cache_northbridges(void) } EXPORT_SYMBOL_GPL(amd_cache_northbridges); -/* Ignores subdevice/subvendor but as far as I can figure out - they're useless anyways */ -int __init early_is_amd_nb(u32 device) +/* + * Ignores subdevice/subvendor but as far as I can figure out + * they're useless anyways + */ +bool __init early_is_amd_nb(u32 device) { const struct pci_device_id *id; u32 vendor = device & 0xffff; @@ -113,8 +115,8 @@ int __init early_is_amd_nb(u32 device) device >>= 16; for (id = amd_nb_misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) - return 1; - return 0; + return true; + return false; } int amd_get_subcaches(int cpu) @@ -176,9 +178,9 @@ int amd_set_subcaches(int cpu, int mask) return 0; } -int amd_cache_gart(void) +static int amd_cache_gart(void) { - int i; + u16 i; if (!amd_nb_has_feature(AMD_NB_GART)) return 0; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 7b1e8e10b89c..86d1ad4962a7 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -73,7 +73,7 @@ static u32 __init allocate_aperture(void) /* * using 512M as goal, in case kexec will load kernel_big * that will do the on position decompress, and could overlap with - * that positon with gart that is used. + * that position with gart that is used. * sequende: * kernel_small * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 4b5ebd26f565..180ca240e03c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1886,7 +1886,7 @@ void disable_IO_APIC(void) * * With interrupt-remapping, for now we will use virtual wire A mode, * as virtual wire B is little complex (need to configure both - * IOAPIC RTE aswell as interrupt-remapping table entry). + * IOAPIC RTE as well as interrupt-remapping table entry). * As this gets called during crash dump, keep this simple for now. */ if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { @@ -2905,7 +2905,7 @@ void __init setup_IO_APIC(void) } /* - * Called after all the initialization is done. If we didnt find any + * Called after all the initialization is done. 
If we didn't find any * APIC bugs then we can allow the modify fast path */ @@ -3983,7 +3983,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi) static __init int bad_ioapic(unsigned long address) { if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " + printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded " "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); return 1; } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 0e4f24c2a746..0b4be431c620 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -66,7 +66,7 @@ * 1.5: Fix segment register reloading (in case of bad segments saved * across BIOS call). * Stephen Rothwell - * 1.6: Cope with complier/assembler differences. + * 1.6: Cope with compiler/assembler differences. * Only try to turn off the first display device. * Fix OOPS at power off with no APM BIOS by Jan Echternach * <echter@informatik.uni-rostock.de> @@ -227,6 +227,7 @@ #include <linux/suspend.h> #include <linux/kthread.h> #include <linux/jiffies.h> +#include <linux/acpi.h> #include <asm/system.h> #include <asm/uaccess.h> @@ -975,20 +976,10 @@ recalc: static void apm_power_off(void) { - unsigned char po_bios_call[] = { - 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ - 0x8e, 0xd0, /* movw ax,ss */ - 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ - 0xb8, 0x07, 0x53, /* movw $0x5307,ax */ - 0xbb, 0x01, 0x00, /* movw $0x0001,bx */ - 0xb9, 0x03, 0x00, /* movw $0x0003,cx */ - 0xcd, 0x15 /* int $0x15 */ - }; - /* Some bioses don't like being called from CPU != 0 */ if (apm_info.realmode_power_off) { set_cpus_allowed_ptr(current, cpumask_of(0)); - machine_real_restart(po_bios_call, sizeof(po_bios_call)); + machine_real_restart(MRR_APM); } else { (void)set_system_power_state(APM_STATE_OFF); } @@ -2331,12 +2322,11 @@ static int __init apm_init(void) apm_info.disabled = 1; return -ENODEV; } - if (pm_flags & PM_ACPI) { + if (!acpi_disabled) { printk(KERN_NOTICE "apm: overridden by ACPI.\n"); apm_info.disabled = 1; return -ENODEV; } - pm_flags |= PM_APM; /* * Set up the long jump entry point to the APM BIOS, which is called @@ -2428,7 +2418,6 @@ static void __exit apm_exit(void) kthread_stop(kapmd_task); kapmd_task = NULL; } - pm_flags &= ~PM_APM; } module_init(apm_init); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f771ab6b49e9..3ecece0217ef 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -611,6 +611,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } #endif + + /* As a rule processors have APIC timer running in deep C states */ + if (c->x86 >= 0xf && !cpu_has_amd_erratum(amd_erratum_400)) + set_cpu_cap(c, X86_FEATURE_ARAT); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 03162dac6271..cf48cdd6907d 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -444,7 +444,7 @@ static int __cpuinit longhaul_get_ranges(void) return -EINVAL; } /* Get max multiplier - as we always did. - * Longhaul MSR is usefull only when voltage scaling is enabled. + * Longhaul MSR is useful only when voltage scaling is enabled. * C3 is booting at max anyway. */ maxmult = mult; /* Get min multiplier */ @@ -1011,7 +1011,7 @@ static void __exit longhaul_exit(void) * trigger frequency transition in some cases. */ module_param(disable_acpi_c3, int, 0644); MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support"); -/* Change CPU voltage with frequency. 
Very usefull to save +/* Change CPU voltage with frequency. Very useful to save * power, but most VIA C3 processors aren't supporting it. */ module_param(scale_voltage, int, 0644); MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 4a5a42b842ad..755a31e0f5b0 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -315,8 +315,6 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle) input.count = 4; input.pointer = in_params; - input.count = 4; - input.pointer = in_params; in_params[0].type = ACPI_TYPE_BUFFER; in_params[0].buffer.length = 16; in_params[0].buffer.pointer = OSC_UUID; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index c567dec854f6..2368e38327b3 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -630,8 +630,7 @@ static void print_basics(struct powernow_k8_data *data) data->powernow_table[j].frequency/1000); } else { printk(KERN_INFO PFX - " %d : fid 0x%x (%d MHz), vid 0x%x\n", - j, + "fid 0x%x (%d MHz), vid 0x%x\n", data->powernow_table[j].index & 0xff, data->powernow_table[j].frequency/1000, data->powernow_table[j].index >> 8); @@ -1276,7 +1275,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) if (powernow_k8_cpu_init_acpi(data)) { /* - * Use the PSB BIOS structure. This is only availabe on + * Use the PSB BIOS structure. This is only available on * an UP version, and is deprecated by AMD. */ if (num_online_cpus() != 1) { diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c index 8abd869baabf..91bc25b67bc1 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c @@ -292,7 +292,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy) result = speedstep_smi_ownership(); if (result) { - dprintk("fails in aquiring ownership of a SMI interface.\n"); + dprintk("fails in acquiring ownership of a SMI interface.\n"); return -EINVAL; } @@ -360,7 +360,7 @@ static int speedstep_resume(struct cpufreq_policy *policy) int result = speedstep_smi_ownership(); if (result) - dprintk("fails in re-aquiring ownership of a SMI interface.\n"); + dprintk("fails in re-acquiring ownership of a SMI interface.\n"); return result; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 8209472b27a5..83930deec3c6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m) ssize_t apei_read_mce(struct mce *m, u64 *record_id) { struct cper_mce_record rcd; - ssize_t len; - - len = erst_read_next(&rcd.hdr, sizeof(rcd)); - if (len <= 0) - return len; - /* Can not skip other records in storage via ERST unless clear them */ - else if (len != sizeof(rcd) || - uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { - if (printk_ratelimit()) - pr_warning( - "MCE-APEI: Can not skip the unknown record in ERST"); - return -EIO; - } - + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if 
(rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + goto retry; memcpy(m, &rcd.mce, sizeof(*m)); - *record_id = rcd.hdr.record_id; + rc = sizeof(*m); +out: + erst_get_record_id_end(); - return sizeof(*m); + return rc; } /* Check whether there is record in ERST */ diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index a77971979564..0ed633c5048b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -32,7 +32,7 @@ static void inject_mce(struct mce *m) { struct mce *i = &per_cpu(injectm, m->extcpu); - /* Make sure noone reads partially written injectm */ + /* Make sure no one reads partially written injectm */ i->finished = 0; mb(); m->finished = 0; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index d916183b7f9c..ab1122998dba 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -881,7 +881,7 @@ reset: * Check if the address reported by the CPU is in a format we can parse. * It would be possible to add code for most other cases, but all would * be somewhat complicated (e.g. segment offset would require an instruction - * parser). So only support physical addresses upto page granuality for now. + * parser). So only support physical addresses up to page granuality for now. */ static int mce_usable_address(struct mce *m) { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 9f27228ceffd..a71efcdbb092 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -1,6 +1,6 @@ /* * This only handles 32bit MTRR on 32bit hosts. 
This is strictly wrong - * because MTRRs can span upto 40 bits (36bits on most modern x86) + * because MTRRs can span up to 40 bits (36bits on most modern x86) */ #define DEBUG diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index eb00677ee2ae..eed3673a8656 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1114,7 +1114,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) /* * If group events scheduling transaction was started, - * skip the schedulability test here, it will be peformed + * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole */ if (cpuc->group_flag & PERF_EVENT_TXN) @@ -1795,7 +1795,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) perf_callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index d3d7b59841e5..c2520e178d32 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1,5 +1,5 @@ /* - * Netburst Perfomance Events (P4, old Xeon) + * Netburst Performance Events (P4, old Xeon) * * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> @@ -679,7 +679,7 @@ static int p4_validate_raw_event(struct perf_event *event) */ /* - * if an event is shared accross the logical threads + * if an event is shared across the logical threads * the user needs special permissions to be able to use it */ if (p4_ht_active() && p4_event_bind_map[v].shared) { @@ -791,13 +791,13 @@ static void p4_pmu_disable_pebs(void) * * It's still allowed that two threads setup same cache * events so we can't simply clear metrics until we knew - * noone is depending on us, so we need kind of counter + * no one is depending on us, so we need kind of counter * for "ReplayEvent" users. * * What is more complex -- RAW events, if user (for some * reason) will pass some cache event metric with improper * event opcode -- it's fine from hardware point of view - * but completely nonsence from "meaning" of such action. + * but completely nonsense from "meaning" of such action. * * So at moment let leave metrics turned on forever -- it's * ok for now but need to be revisited! diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 227b0448960d..d22d0c4edcfd 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -86,7 +86,7 @@ static void __init vmware_platform_setup(void) } /* - * While checking the dmi string infomation, just checking the product + * While checking the dmi string information, just checking the product * serial key should be enough, as this will always have a VMware * specific string when running under VMware hypervisor. */ diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index d5cd13945d5a..642f75a68cd5 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -14,9 +14,6 @@ static void *kdump_buf_page; -/* Stores the physical address of elf header of crash image. 
*/ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - static inline bool is_crashed_pfn_valid(unsigned long pfn) { #ifndef CONFIG_X86_PAE diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 994828899e09..afa64adb75ee 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,9 +10,6 @@ #include <linux/uaccess.h> #include <linux/io.h> -/* Stores the physical address of elf header of crash image. */ -unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; - /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 7a8cebc9ff29..706a9fb46a58 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -65,12 +65,10 @@ unsigned int irq_create_of_mapping(struct device_node *controller, return 0; ret = ih->xlate(ih, intspec, intsize, &virq, &type); if (ret) - return ret; + return 0; if (type == IRQ_TYPE_NONE) return virq; - /* set the mask if it is different from current */ - if (type == (irq_to_desc(virq)->status & IRQF_TRIGGER_MASK)) - set_irq_type(virq, type); + irq_set_irq_type(virq, type); return virq; } EXPORT_SYMBOL_GPL(irq_create_of_mapping); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 220a1c11cfde..e2a3f0606da4 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -27,7 +27,7 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { - printk(" [<%p>] %s%pS\n", (void *) address, + printk(" [<%p>] %s%pB\n", (void *) address, reliable ? "" : "? ", (void *) address); } @@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = { void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl) + unsigned long *stack, unsigned long bp, char *log_lvl) { printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, &print_trace_ops, log_lvl); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack) + unsigned long *stack, unsigned long bp) { - show_trace_log_lvl(task, regs, stack, ""); + show_trace_log_lvl(task, regs, stack, bp, ""); } void show_stack(struct task_struct *task, unsigned long *sp) { - show_stack_log_lvl(task, NULL, sp, ""); + show_stack_log_lvl(task, NULL, sp, 0, ""); } /* @@ -197,14 +197,16 @@ void show_stack(struct task_struct *task, unsigned long *sp) */ void dump_stack(void) { + unsigned long bp; unsigned long stack; + bp = stack_frame(current, NULL); printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(NULL, NULL, &stack); + show_trace(NULL, NULL, &stack, bp); } EXPORT_SYMBOL(dump_stack); @@ -320,16 +322,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - static int __init kstack_setup(char *s) { if (!s) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 74cc1eda384b..3b97a80ce329 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -17,12 +17,11 @@ #include <asm/stacktrace.h> -void dump_trace(struct task_struct *task, - 
struct pt_regs *regs, unsigned long *stack, +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { int graph = 0; - unsigned long bp; if (!task) task = current; @@ -35,7 +34,9 @@ void dump_trace(struct task_struct *task, stack = (unsigned long *)task->thread.sp; } - bp = stack_frame(task, regs); + if (!bp) + bp = stack_frame(task, regs); + for (;;) { struct thread_info *context; @@ -55,7 +56,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; @@ -77,7 +78,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, touch_nmi_watchdog(); } printk(KERN_CONT "\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -102,7 +103,7 @@ void show_registers(struct pt_regs *regs) u8 *ip; printk(KERN_EMERG "Stack:\n"); - show_stack_log_lvl(NULL, regs, &regs->sp, KERN_EMERG); + show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG); printk(KERN_EMERG "Code: "); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index a6b6fcf7f0ae..e71c98d3c0d2 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void dump_trace(struct task_struct *task, - struct pt_regs *regs, unsigned long *stack, +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); @@ -150,7 +150,6 @@ void dump_trace(struct task_struct *task, struct thread_info *tinfo; int graph = 0; unsigned long dummy; - unsigned long bp; if (!task) task = current; @@ -161,7 +160,8 @@ void dump_trace(struct task_struct *task, stack = (unsigned long *)task->thread.sp; } - bp = stack_frame(task, regs); + if (!bp) + bp = stack_frame(task, regs); /* * Print function call entries in all stacks, starting at the * current stack address. 
If the stacks consist of nested @@ -225,7 +225,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *irq_stack_end; unsigned long *irq_stack; @@ -269,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, preempt_enable(); printk(KERN_CONT "\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); } void show_registers(struct pt_regs *regs) @@ -298,7 +298,7 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - KERN_EMERG); + 0, KERN_EMERG); printk(KERN_EMERG "Code: "); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index cdf5bfd9d4d5..3e2ef8425316 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/types.h> #include <linux/init.h> +#include <linux/crash_dump.h> #include <linux/bootmem.h> #include <linux/pfn.h> #include <linux/suspend.h> diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 9efbdcc56425..3755ef494390 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -159,7 +159,12 @@ static void __init ati_bugs_contd(int num, int slot, int func) if (rev >= 0x40) acpi_fix_pin2_polarity = 1; - if (rev > 0x13) + /* + * SB600: revisions 0x11, 0x12, 0x13, 0x14, ... + * SB700: revisions 0x39, 0x3a, ... + * SB800: revisions 0x40, 0x41, ... + */ + if (rev >= 0x39) return; if (acpi_use_timer_override) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index fa41f7298c84..5c1a91974918 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1414,7 +1414,7 @@ ENTRY(async_page_fault) pushl_cfi $do_async_page_fault jmp error_code CFI_ENDPROC -END(apf_page_fault) +END(async_page_fault) #endif /* diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b72b4a6466a9..8a445a0c989e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -18,7 +18,7 @@ * A note on terminology: * - top of stack: Architecture defined interrupt frame from SS to RIP * at the top of the kernel process stack. - * - partial stack frame: partially saved registers upto R11. + * - partial stack frame: partially saved registers up to R11. * - full stack frame: Like partial stack frame, but all register saved. * * Some macro usage: @@ -422,7 +422,7 @@ ENTRY(ret_from_fork) END(ret_from_fork) /* - * System call entry. Upto 6 arguments in registers are supported. + * System call entry. Up to 6 arguments in registers are supported. * * SYSCALL does not save anything on the stack and does not change the * stack pointer. diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 7f138b3c3c52..d6d6bb361931 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -34,15 +34,6 @@ void __init i386_start_kernel(void) { memblock_init(); -#ifdef CONFIG_X86_TRAMPOLINE - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. 
(see the GDT stuff) - */ - memblock_x86_reserve_range(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE"); -#endif - memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2d2673c28aff..5655c2272adb 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -77,9 +77,6 @@ void __init x86_64_start_kernel(char * real_mode_data) /* Make NULL pointers segfault */ zap_identity_mappings(); - /* Cleanup the over mapped high alias */ - cleanup_highmap(); - max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 239046bd447f..e11e39478a49 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -136,10 +136,9 @@ ident_complete: /* Fixup phys_base */ addq %rbp, phys_base(%rip) -#ifdef CONFIG_X86_TRAMPOLINE + /* Fixup trampoline */ addq %rbp, trampoline_level4_pgt + 0(%rip) addq %rbp, trampoline_level4_pgt + (511*8)(%rip) -#endif /* Due to ENTRY(), sometimes the empty space gets filled with * zeros. Better take a jmp than relying on empty space being diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index e60c38cc0eed..12aff2537682 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -145,7 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_finit); * The _current_ task is using the FPU for the first time * so initialize it and set the mxcsr to its default * value at reset if we support XMM instructions and then - * remeber the current task has used the FPU. + * remember the current task has used the FPU. */ int init_fpu(struct task_struct *tsk) { diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 9974d21048fd..72090705a656 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -172,7 +172,7 @@ asmlinkage void do_softirq(void) call_on_stack(__do_softirq, isp); /* - * Shouldnt happen, we returned above if in_interrupt(): + * Shouldn't happen, we returned above if in_interrupt(): */ WARN_ON_ONCE(softirq_count()); } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 7c64c420a9f6..dba0b36941a5 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -278,7 +278,7 @@ static int hw_break_release_slot(int breakno) pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); if (dbg_release_bp_slot(*pevent)) /* - * The debugger is responisble for handing the retry on + * The debugger is responsible for handing the retry on * remove failure. */ return -1; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8dc44662394b..33c07b0b122e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -493,7 +493,7 @@ static void __init kvm_smp_prepare_boot_cpu(void) native_smp_prepare_boot_cpu(); } -static void kvm_guest_cpu_online(void *dummy) +static void __cpuinit kvm_guest_cpu_online(void *dummy) { kvm_guest_cpu_init(); } diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c index 63eaf6596233..177183cbb6ae 100644 --- a/arch/x86/kernel/mca_32.c +++ b/arch/x86/kernel/mca_32.c @@ -259,7 +259,7 @@ static int __init mca_init(void) /* * WARNING: Be careful when making changes here. Putting an adapter * and the motherboard simultaneously into setup mode may result in - * damage to chips (according to The Indispensible PC Hardware Book + * damage to chips (according to The Indispensable PC Hardware Book * by Hans-Peter Messmer). 
Also, we disable system interrupts (so * that we are not disturbed in the middle of this). */ diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 01b0f6d06451..5a532ce646bf 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -714,10 +714,6 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) *nr_m_spare += 1; } } -#else /* CONFIG_X86_IO_APIC */ -static -inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} -#endif /* CONFIG_X86_IO_APIC */ static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) @@ -731,6 +727,10 @@ check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) return ret; } +#else /* CONFIG_X86_IO_APIC */ +static +inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {} +#endif /* CONFIG_X86_IO_APIC */ static int __init replace_intsrc_all(struct mpc_table *mpc, unsigned long mpc_new_phys, @@ -883,7 +883,7 @@ static int __init update_mp_table(void) if (!mpc_new_phys) { unsigned char old, new; - /* check if we can change the postion */ + /* check if we can change the position */ mpc->checksum = 0; old = mpf_checksum((unsigned char *)mpc, mpc->length); mpc->checksum = 0xff; @@ -892,7 +892,7 @@ static int __init update_mp_table(void) printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); return 0; } - printk(KERN_INFO "use in-positon replacing\n"); + printk(KERN_INFO "use in-position replacing\n"); } else { mpf->physptr = mpc_new_phys; mpc_new = phys_to_virt(mpc_new_phys); diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index f56a117cef68..e8c33a302006 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1279,7 +1279,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { /* - * FIXME: properly scan for devices accross the + * FIXME: properly scan for devices across the * PCI-to-PCI bridge on every CalIOC2 port. */ return 1; @@ -1295,7 +1295,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) /* * calgary_init_bitmap_from_tce_table(): - * Funtion for kdump case. In the second/kdump kernel initialize + * Function for kdump case. In the second/kdump kernel initialize * the bitmap based on the tce table entries obtained from first kernel */ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 99fa3adf0141..d46cbe46b7ab 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -87,7 +87,7 @@ void exit_thread(void) void show_regs(struct pt_regs *regs) { show_registers(regs); - show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs)); + show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0); } void show_regs_common(void) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index bd387e8f73b4..6c9dd922ac0d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -501,6 +501,10 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + /* Ensure the corresponding mm is not marked. */ + if (current->mm) + current->mm->context.ia32_compat = 0; + /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. 
The main problem is just that @@ -516,6 +520,10 @@ void set_personality_ia32(void) set_thread_flag(TIF_IA32); current->personality |= force_personality32; + /* Mark the associated mm as containing 32-bit tasks. */ + if (current->mm) + current->mm->context.ia32_compat = 1; + /* Prepare the first "return" to user space */ current_thread_info()->status |= TS_COMPAT; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 715037caeb43..d3ce37edb54d 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -303,68 +303,16 @@ static int __init reboot_init(void) } core_initcall(reboot_init); -/* The following code and data reboots the machine by switching to real - mode and jumping to the BIOS reset entry point, as if the CPU has - really been reset. The previous version asked the keyboard - controller to pulse the CPU reset line, which is more thorough, but - doesn't work with at least one type of 486 motherboard. It is easy - to stop this code working; hence the copious comments. */ -static const unsigned long long -real_mode_gdt_entries [3] = -{ - 0x0000000000000000ULL, /* Null descriptor */ - 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ - 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ -}; +extern const unsigned char machine_real_restart_asm[]; +extern const u64 machine_real_restart_gdt[3]; -static const struct desc_ptr -real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, -real_mode_idt = { 0x3ff, 0 }; - -/* This is 16-bit protected mode code to disable paging and the cache, - switch to real mode and jump to the BIOS reset code. - - The instruction that switches to real mode by writing to CR0 must be - followed immediately by a far jump instruction, which set CS to a - valid value for real mode, and flushes the prefetch queue to avoid - running instructions that have already been decoded in protected - mode. - - Clears all the flags except ET, especially PG (paging), PE - (protected-mode enable) and TS (task switch for coprocessor state - save). Flushes the TLB after paging has been disabled. Sets CD and - NW, to disable the cache on a 486, and invalidates the cache. This - is more like the state of a 486 after reset. I don't know if - something else should be done for other chips. - - More could be done here to set up the registers as if a CPU reset had - occurred; hopefully real BIOSs don't assume much. */ -static const unsigned char real_mode_switch [] = -{ - 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ - 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ - 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */ - 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */ - 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */ - 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */ - 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */ - 0x74, 0x02, /* jz f */ - 0x0f, 0x09, /* wbinvd */ - 0x24, 0x10, /* f: andb $0x10,al */ - 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */ -}; -static const unsigned char jump_to_bios [] = +void machine_real_restart(unsigned int type) { - 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ -}; + void *restart_va; + unsigned long restart_pa; + void (*restart_lowmem)(unsigned int); + u64 *lowmem_gdt; -/* - * Switch to real mode and then execute the code - * specified by the code and length parameters. - * We assume that length will aways be less that 100! 
- */ -void machine_real_restart(const unsigned char *code, int length) -{ local_irq_disable(); /* Write zero to CMOS register number 0x0f, which the BIOS POST @@ -392,41 +340,23 @@ void machine_real_restart(const unsigned char *code, int length) too. */ *((unsigned short *)0x472) = reboot_mode; - /* For the switch to real mode, copy some code to low memory. It has - to be in the first 64k because it is running in 16-bit mode, and it - has to have the same physical and virtual address, because it turns - off paging. Copy it near the end of the first page, out of the way - of BIOS variables. */ - memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), - real_mode_switch, sizeof (real_mode_switch)); - memcpy((void *)(0x1000 - 100), code, length); - - /* Set up the IDT for real mode. */ - load_idt(&real_mode_idt); - - /* Set up a GDT from which we can load segment descriptors for real - mode. The GDT is not used in real mode; it is just needed here to - prepare the descriptors. */ - load_gdt(&real_mode_gdt); - - /* Load the data segment registers, and thus the descriptors ready for - real mode. The base address of each segment is 0x100, 16 times the - selector value being loaded here. This is so that the segment - registers don't have to be reloaded after switching to real mode: - the values are consistent for real mode operation already. */ - __asm__ __volatile__ ("movl $0x0010,%%eax\n" - "\tmovl %%eax,%%ds\n" - "\tmovl %%eax,%%es\n" - "\tmovl %%eax,%%fs\n" - "\tmovl %%eax,%%gs\n" - "\tmovl %%eax,%%ss" : : : "eax"); - - /* Jump to the 16-bit code that we copied earlier. It disables paging - and the cache, switches to real mode, and jumps to the BIOS reset - entry point. */ - __asm__ __volatile__ ("ljmp $0x0008,%0" - : - : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100))); + /* Patch the GDT in the low memory trampoline */ + lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt); + + restart_va = TRAMPOLINE_SYM(machine_real_restart_asm); + restart_pa = virt_to_phys(restart_va); + restart_lowmem = (void (*)(unsigned int))restart_pa; + + /* GDT[0]: GDT self-pointer */ + lowmem_gdt[0] = + (u64)(sizeof(machine_real_restart_gdt) - 1) + + ((u64)virt_to_phys(lowmem_gdt) << 16); + /* GDT[1]: 64K real mode code segment */ + lowmem_gdt[1] = + GDT_ENTRY(0x009b, restart_pa, 0xffff); + + /* Jump to the identity-mapped low memory code */ + restart_lowmem(type); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); @@ -581,7 +511,7 @@ static void native_machine_emergency_restart(void) #ifdef CONFIG_X86_32 case BOOT_BIOS: - machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); + machine_real_restart(MRR_BIOS); reboot_type = BOOT_KBD; break; diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S new file mode 100644 index 000000000000..29092b38d816 --- /dev/null +++ b/arch/x86/kernel/reboot_32.S @@ -0,0 +1,135 @@ +#include <linux/linkage.h> +#include <linux/init.h> +#include <asm/segment.h> +#include <asm/page_types.h> + +/* + * The following code and data reboots the machine by switching to real + * mode and jumping to the BIOS reset entry point, as if the CPU has + * really been reset. The previous version asked the keyboard + * controller to pulse the CPU reset line, which is more thorough, but + * doesn't work with at least one type of 486 motherboard. It is easy + * to stop this code working; hence the copious comments. + * + * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. 
+ */ + .section ".x86_trampoline","a" + .balign 16 + .code32 +ENTRY(machine_real_restart_asm) +r_base = . + /* Get our own relocated address */ + call 1f +1: popl %ebx + subl $1b, %ebx + + /* Compute the equivalent real-mode segment */ + movl %ebx, %ecx + shrl $4, %ecx + + /* Patch post-real-mode segment jump */ + movw dispatch_table(%ebx,%eax,2),%ax + movw %ax, 101f(%ebx) + movw %cx, 102f(%ebx) + + /* Set up the IDT for real mode. */ + lidtl machine_real_restart_idt(%ebx) + + /* + * Set up a GDT from which we can load segment descriptors for real + * mode. The GDT is not used in real mode; it is just needed here to + * prepare the descriptors. + */ + lgdtl machine_real_restart_gdt(%ebx) + + /* + * Load the data segment registers with 16-bit compatible values + */ + movl $16, %ecx + movl %ecx, %ds + movl %ecx, %es + movl %ecx, %fs + movl %ecx, %gs + movl %ecx, %ss + ljmpl $8, $1f - r_base + +/* + * This is 16-bit protected mode code to disable paging and the cache, + * switch to real mode and jump to the BIOS reset code. + * + * The instruction that switches to real mode by writing to CR0 must be + * followed immediately by a far jump instruction, which set CS to a + * valid value for real mode, and flushes the prefetch queue to avoid + * running instructions that have already been decoded in protected + * mode. + * + * Clears all the flags except ET, especially PG (paging), PE + * (protected-mode enable) and TS (task switch for coprocessor state + * save). Flushes the TLB after paging has been disabled. Sets CD and + * NW, to disable the cache on a 486, and invalidates the cache. This + * is more like the state of a 486 after reset. I don't know if + * something else should be done for other chips. + * + * More could be done here to set up the registers as if a CPU reset had + * occurred; hopefully real BIOSs don't assume much. This is not the + * actual BIOS entry point, anyway (that is at 0xfffffff0). + * + * Most of this work is probably excessive, but it is what is tested. + */ + .code16 +1: + xorl %ecx, %ecx + movl %cr0, %eax + andl $0x00000011, %eax + orl $0x60000000, %eax + movl %eax, %cr0 + movl %ecx, %cr3 + movl %cr0, %edx + andl $0x60000000, %edx /* If no cache bits -> no wbinvd */ + jz 2f + wbinvd +2: + andb $0x10, %al + movl %eax, %cr0 + .byte 0xea /* ljmpw */ +101: .word 0 /* Offset */ +102: .word 0 /* Segment */ + +bios: + ljmpw $0xf000, $0xfff0 + +apm: + movw $0x1000, %ax + movw %ax, %ss + movw $0xf000, %sp + movw $0x5307, %ax + movw $0x0001, %bx + movw $0x0003, %cx + int $0x15 + +END(machine_real_restart_asm) + + .balign 16 + /* These must match <asm/reboot.h */ +dispatch_table: + .word bios - r_base + .word apm - r_base +END(dispatch_table) + + .balign 16 +machine_real_restart_idt: + .word 0xffff /* Length - real mode default value */ + .long 0 /* Base - real mode default value */ +END(machine_real_restart_idt) + + .balign 16 +ENTRY(machine_real_restart_gdt) + .quad 0 /* Self-pointer, filled in by PM code */ + .quad 0 /* 16-bit code segment, filled in by PM code */ + /* + * 16-bit data segment with the selector value 16 = 0x10 and + * base value 0x100; since this is consistent with real mode + * semantics we don't have to reload the segments once CR0.PE = 0. 
+ */ + .quad GDT_ENTRY(0x0093, 0x100, 0xffff) +END(machine_real_restart_gdt) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b176f2b1f45d..5a0484a95ad6 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -294,30 +294,11 @@ static void __init init_gbpages(void) else direct_gbpages = 0; } - -static void __init cleanup_highmap_brk_end(void) -{ - pud_t *pud; - pmd_t *pmd; - - mmu_cr4_features = read_cr4(); - - /* - * _brk_end cannot change anymore, but it and _end may be - * located on different 2M pages. cleanup_highmap(), however, - * can only consider _end when it runs, so destroy any - * mappings beyond _brk_end here. - */ - pud = pud_offset(pgd_offset_k(_brk_end), _brk_end); - pmd = pmd_offset(pud, _brk_end - 1); - while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1)) - pmd_clear(pmd); -} #else static inline void init_gbpages(void) { } -static inline void cleanup_highmap_brk_end(void) +static void __init cleanup_highmap(void) { } #endif @@ -330,8 +311,6 @@ static void __init reserve_brk(void) /* Mark brk area as locked down and no longer taking any new allocations */ _brk_start = 0; - - cleanup_highmap_brk_end(); } #ifdef CONFIG_BLK_DEV_INITRD @@ -640,28 +619,6 @@ void __init reserve_standard_io_resources(void) } -/* - * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by - * is_kdump_kernel() to determine if we are booting after a panic. Hence - * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. - */ - -#ifdef CONFIG_CRASH_DUMP -/* elfcorehdr= specifies the location of elf core header - * stored by the crashed kernel. This option will be passed - * by kexec loader to the capture kernel. - */ -static int __init setup_elfcorehdr(char *arg) -{ - char *end; - if (!arg) - return -EINVAL; - elfcorehdr_addr = memparse(arg, &end); - return end > arg ? 0 : -EINVAL; -} -early_param("elfcorehdr", setup_elfcorehdr); -#endif - static __init void reserve_ibft_region(void) { unsigned long addr, size = 0; @@ -950,6 +907,8 @@ void __init setup_arch(char **cmdline_p) */ reserve_brk(); + cleanup_highmap(); + memblock.current_limit = get_max_mapped(); memblock_x86_fill(); @@ -963,15 +922,8 @@ void __init setup_arch(char **cmdline_p) printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", max_pfn_mapped<<PAGE_SHIFT); - reserve_trampoline_memory(); + setup_trampolines(); -#ifdef CONFIG_ACPI_SLEEP - /* - * Reserve low memory region for sleep support. - * even before init_memory_mapping - */ - acpi_reserve_wakeup_memory(); -#endif init_gbpages(); /* max_pfn_mapped is updated here */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e9efdfd51c8d..c2871d3c71b6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -711,7 +711,7 @@ do_rest: stack_start = c_idle.idle->thread.sp; /* start_ip had better be page-aligned! */ - start_ip = setup_trampoline(); + start_ip = trampoline_address(); /* So we see what's up */ announce_cpu(cpu, apicid); @@ -721,6 +721,8 @@ do_rest: * the targeted processor. */ + printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip); + atomic_set(&init_deasserted, 0); if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { @@ -774,8 +776,8 @@ do_rest: pr_debug("CPU%d: has booted.\n", cpu); else { boot_error = 1; - if (*((volatile unsigned char *)trampoline_base) - == 0xA5) + if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) + == 0xA5A5A5A5) /* trampoline started but...? 
*/ pr_err("CPU%d: Stuck ??\n", cpu); else @@ -801,7 +803,7 @@ do_rest: } /* mark "stuck" area as not stuck */ - *((volatile unsigned long *)trampoline_base) = 0; + *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0; if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { /* diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 938c8e10a19a..6515733a289d 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -73,7 +73,7 @@ static const struct stacktrace_ops save_stack_ops_nosched = { */ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, &save_stack_ops, trace); + dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } @@ -81,14 +81,14 @@ EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) { - dump_trace(current, regs, NULL, &save_stack_ops, trace); + dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); + dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 58de45ee08b6..7977f0cfe339 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block) * Make sure block stepping (BTF) is not enabled unless it should be. * Note that we don't try to worry about any is_setting_trap_flag() * instructions after the first when using block stepping. - * So noone should try to use debugger block stepping in a program + * So no one should try to use debugger block stepping in a program * that uses user-mode single stepping itself. */ if (enable_single_step(child) && block) { diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 5f181742e8f9..abce34d5c79d 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -343,3 +343,4 @@ ENTRY(sys_call_table) .long sys_name_to_handle_at .long sys_open_by_handle_at .long sys_clock_adjtime + .long sys_syncfs diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 7e4515957a1c..8927486a4649 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -39,7 +39,7 @@ int __ref arch_register_cpu(int num) /* * CPU0 cannot be offlined due to several * restrictions and assumptions in kernel. This basically - * doesnt add a control file, one cannot attempt to offline + * doesn't add a control file, one cannot attempt to offline * BSP. 
* * Also certain PCI quirks require not to enable hotplug control diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index a375616d77f7..a91ae7709b49 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -2,39 +2,41 @@ #include <linux/memblock.h> #include <asm/trampoline.h> +#include <asm/cacheflush.h> #include <asm/pgtable.h> -#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) -#define __trampinit -#define __trampinitdata -#else -#define __trampinit __cpuinit -#define __trampinitdata __cpuinitdata -#endif +unsigned char *x86_trampoline_base; -/* ready for x86_64 and x86 */ -unsigned char *__trampinitdata trampoline_base; - -void __init reserve_trampoline_memory(void) +void __init setup_trampolines(void) { phys_addr_t mem; + size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); /* Has to be in very low memory so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); + mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); if (mem == MEMBLOCK_ERROR) panic("Cannot allocate trampoline\n"); - trampoline_base = __va(mem); - memblock_x86_reserve_range(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); + x86_trampoline_base = __va(mem); + memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE"); + + printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", + x86_trampoline_base, (unsigned long long)mem, size); + + memcpy(x86_trampoline_base, x86_trampoline_start, size); } /* - * Currently trivial. Write the real->protected mode - * bootstrap into the page concerned. The caller - * has made sure it's suitably aligned. + * setup_trampolines() gets called very early, to guarantee the + * availability of low memory. This is before the proper kernel page + * tables are set up, so we cannot set page permissions in that + * function. Thus, we use an arch_initcall instead. */ -unsigned long __trampinit setup_trampoline(void) +static int __init configure_trampolines(void) { - memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); - return virt_to_phys(trampoline_base); + size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start); + + set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT); + return 0; } +arch_initcall(configure_trampolines); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 8508237e8e43..451c0a7ef7fd 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -32,9 +32,11 @@ #include <asm/segment.h> #include <asm/page_types.h> -/* We can free up trampoline after bootup if cpu hotplug is not supported. */ -__CPUINITRODATA -.code16 +#ifdef CONFIG_SMP + + .section ".x86_trampoline","a" + .balign PAGE_SIZE + .code16 ENTRY(trampoline_data) r_base = . @@ -44,7 +46,7 @@ r_base = . 
cli # We should be safe anyway - movl $0xA5A5A5A5, trampoline_data - r_base + movl $0xA5A5A5A5, trampoline_status - r_base # write marker for master knows we're running /* GDT tables in non default location kernel can be beyond 16MB and @@ -72,5 +74,10 @@ boot_idt_descr: .word 0 # idt limit = 0 .long 0 # idt base = 0L +ENTRY(trampoline_status) + .long 0 + .globl trampoline_end trampoline_end: + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 075d130efcf9..09ff51799e96 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -32,13 +32,9 @@ #include <asm/segment.h> #include <asm/processor-flags.h> -#ifdef CONFIG_ACPI_SLEEP -.section .rodata, "a", @progbits -#else -/* We can free up the trampoline after bootup if cpu hotplug is not supported. */ -__CPUINITRODATA -#endif -.code16 + .section ".x86_trampoline","a" + .balign PAGE_SIZE + .code16 ENTRY(trampoline_data) r_base = . @@ -50,7 +46,7 @@ r_base = . mov %ax, %ss - movl $0xA5A5A5A5, trampoline_data - r_base + movl $0xA5A5A5A5, trampoline_status - r_base # write marker for master knows we're running # Setup stack @@ -64,10 +60,13 @@ r_base = . movzx %ax, %esi # Find the 32bit trampoline location shll $4, %esi - # Fixup the vectors - addl %esi, startup_32_vector - r_base - addl %esi, startup_64_vector - r_base - addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer + # Fixup the absolute vectors + leal (startup_32 - r_base)(%esi), %eax + movl %eax, startup_32_vector - r_base + leal (startup_64 - r_base)(%esi), %eax + movl %eax, startup_64_vector - r_base + leal (tgdt - r_base)(%esi), %eax + movl %eax, (tgdt + 2 - r_base) /* * GDT tables in non default location kernel can be beyond 16MB and @@ -129,6 +128,7 @@ no_longmode: jmp no_longmode #include "verify_cpu.S" + .balign 4 # Careful these need to be in the same 64K segment as the above; tidt: .word 0 # idt limit = 0 @@ -156,6 +156,10 @@ startup_64_vector: .long startup_64 - r_base .word __KERNEL_CS, 0 + .balign 4 +ENTRY(trampoline_status) + .long 0 + trampoline_stack: .org 0x1000 trampoline_stack_end: diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ffe5755caa8b..9335bf7dd2e7 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -427,7 +427,7 @@ unsigned long native_calibrate_tsc(void) * the delta to the previous read. We keep track of the min * and max values of that delta. The delta is mostly defined * by the IO time of the PIT access, so we can detect when a - * SMI/SMM disturbance happend between the two reads. If the + * SMI/SMM disturbance happened between the two reads. If the * maximum time is significantly larger than the minimum time, * then we discard the result and have another try. * @@ -900,7 +900,7 @@ static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); * timer based, instead of loop based, we don't block the boot * process while this longer calibration is done. * - * If there are any calibration anomolies (too many SMIs, etc), + * If there are any calibration anomalies (too many SMIs, etc), * or the refined calibration is off by 1% of the fast early * calibration, we throw out the new calibration and use the * early calibration. diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index 0edefc19a113..b9242bacbe59 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -18,7 +18,7 @@ * This file is expected to run in 32bit code. 
Currently: * * arch/x86/boot/compressed/head_64.S: Boot cpu verification - * arch/x86/kernel/trampoline_64.S: secondary processor verfication + * arch/x86/kernel/trampoline_64.S: secondary processor verification * arch/x86/kernel/head_32.S: processor startup * * verify_cpu, returns the status of longmode and SSE in register %eax. diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0381e1f3baed..624a2016198e 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -231,7 +231,7 @@ SECTIONS * output PHDR, so the next output section - .init.text - should * start another segment - init. */ - PERCPU_VADDR(0, :percpu) + PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu) #endif INIT_TEXT_SECTION(PAGE_SIZE) @@ -241,6 +241,18 @@ SECTIONS INIT_DATA_SECTION(16) + /* + * Code and data for a variety of lowlevel trampolines, to be + * copied into base memory (< 1 MiB) during initialization. + * Since it is copied early, the main copy can be discarded + * afterwards. + */ + .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) { + x86_trampoline_start = .; + *(.x86_trampoline) + x86_trampoline_end = .; + } + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; *(.x86_cpu_dev.init) @@ -292,6 +304,7 @@ SECTIONS *(.iommu_table) __iommu_table_end = .; } + . = ALIGN(8); /* * .exit.text is discard at runtime, not link time, to deal with @@ -306,7 +319,7 @@ SECTIONS } #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU(PAGE_SIZE) + PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE) #endif . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 547128546cc3..a3911343976b 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -53,7 +53,7 @@ void __sanitize_i387_state(struct task_struct *tsk) /* * None of the feature bits are in init state. So nothing else - * to do for us, as the memory layout is upto date. + * to do for us, as the memory layout is up to date. */ if ((xstate_bv & pcntxt_mask) == pcntxt_mask) return; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index caf966781d25..0ad47b819a8b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -76,6 +76,7 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ /* Misc flags */ +#define VendorSpecific (1<<22) /* Vendor specific instruction */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ @@ -877,7 +878,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, if (selector & 1 << 2) { struct desc_struct desc; memset (dt, 0, sizeof *dt); - if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) + if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR, + ctxt->vcpu)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? 
*/ @@ -929,6 +931,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; } +/* Does not support long mode */ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 selector, int seg) @@ -1040,7 +1043,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, } load: ops->set_segment_selector(selector, seg, ctxt->vcpu); - ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); + ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu); return X86EMUL_CONTINUE; exception: emulate_exception(ctxt, err_vec, err_code, true); @@ -1560,7 +1563,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct desc_struct *ss) { memset(cs, 0, sizeof(struct desc_struct)); - ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); + ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu); memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ @@ -1607,9 +1610,9 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.d = 0; cs.l = 1; } - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->regs[VCPU_REGS_RCX] = c->eip; @@ -1679,9 +1682,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.l = 1; } - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); @@ -1736,9 +1739,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel |= SELECTOR_RPL_MASK; ss_sel |= SELECTOR_RPL_MASK; - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->eip = c->regs[VCPU_REGS_RDX]; @@ -1764,24 +1767,28 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, u16 port, u16 len) { struct desc_struct tr_seg; + u32 base3; int r; - u16 io_bitmap_ptr; - u8 perm, bit_idx = port & 0x7; + u16 io_bitmap_ptr, perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; + unsigned long base; - ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); + ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu); if (!tr_seg.p) return false; if (desc_limit_scaled(&tr_seg) < 103) return false; - r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, - ctxt->vcpu, NULL); + base = get_desc_base(&tr_seg); +#ifdef CONFIG_X86_64 + base |= ((u64)base3) << 32; +#endif + r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(get_desc_base(&tr_seg) + 
io_bitmap_ptr + port/8, - &perm, 1, ctxt->vcpu, NULL); + r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu, + NULL); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -2126,7 +2133,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); - ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); + ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu); ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); if (has_error_code) { @@ -2365,7 +2372,8 @@ static struct group_dual group7 = { { D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv | NoAccess), }, { - D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | Priv | VendorSpecific), N, + N, D(SrcNone | ModRM | Priv | VendorSpecific), D(SrcNone | ModRM | DstMem | Mov), N, D(SrcMem16 | ModRM | Mov | Priv), N, } }; @@ -2489,7 +2497,7 @@ static struct opcode opcode_table[256] = { static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ N, GD(0, &group7), N, N, - N, D(ImplicitOps), D(ImplicitOps | Priv), N, + N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, N, D(ImplicitOps | ModRM), N, N, /* 0x10 - 0x1F */ @@ -2502,7 +2510,8 @@ static struct opcode twobyte_table[256] = { /* 0x30 - 0x3F */ D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), D(ImplicitOps | Priv), N, - D(ImplicitOps), D(ImplicitOps | Priv), N, N, + D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), + N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ X16(D(DstReg | SrcMem | ModRM | Mov)), @@ -2741,6 +2750,9 @@ done_prefixes: if (c->d == 0 || (c->d & Undefined)) return -1; + if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn) + return -1; + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) c->op_bytes = 8; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 3cece05e4ac4..19fe855e7953 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -62,9 +62,6 @@ static void pic_unlock(struct kvm_pic *s) } if (!found) - found = s->kvm->bsp_vcpu; - - if (!found) return; kvm_make_request(KVM_REQ_EVENT, found); @@ -75,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s) static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); - s->isr_ack |= (1 << irq); if (s != &s->pics_state->pics[0]) irq += 8; /* @@ -89,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) pic_lock(s->pics_state); } -void kvm_pic_clear_isr_ack(struct kvm *kvm) -{ - struct kvm_pic *s = pic_irqchip(kvm); - - pic_lock(s); - s->pics[0].isr_ack = 0xff; - s->pics[1].isr_ack = 0xff; - pic_unlock(s); -} - /* * set irq level. 
If an edge is detected, then the IRR is set to 1 */ @@ -281,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s) s->irr = 0; s->imr = 0; s->isr = 0; - s->isr_ack = 0xff; s->priority_add = 0; s->irq_base = 0; s->read_reg_select = 0; @@ -545,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this, */ static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm_vcpu *vcpu = kvm->bsp_vcpu; struct kvm_pic *s = pic_irqchip(kvm); - int irq = pic_get_irq(&s->pics[0]); - s->output = level; - if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { - s->pics[0].isr_ack &= ~(1 << irq); + if (!s->output) s->wakeup_needed = true; - } + s->output = level; } static const struct kvm_io_device_ops picdev_ops = { @@ -575,8 +556,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s->pics[1].elcr_mask = 0xde; s->pics[0].pics_state = s; s->pics[1].pics_state = s; - s->pics[0].isr_ack = 0xff; - s->pics[1].isr_ack = 0xff; /* * Initialize PIO device diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93cf9d0d3653..2b2255b1f04b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -417,10 +417,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_INIT: if (level) { result = 1; - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) - printk(KERN_DEBUG - "INIT on a runnable vcpu %d\n", - vcpu->vcpu_id); vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); @@ -875,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); - if (vcpu->arch.apic->regs_page) - __free_page(vcpu->arch.apic->regs_page); + if (vcpu->arch.apic->regs) + free_page((unsigned long)vcpu->arch.apic->regs); kfree(vcpu->arch.apic); } @@ -1065,13 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) vcpu->arch.apic = apic; - apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (apic->regs_page == NULL) { + apic->regs = (void *)get_zeroed_page(GFP_KERNEL); + if (!apic->regs) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); goto nomem_free_apic; } - apic->regs = page_address(apic->regs_page); apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f5fe32c5edad..52c9e6b9e725 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -13,7 +13,6 @@ struct kvm_lapic { u32 divide_count; struct kvm_vcpu *vcpu; bool irr_pending; - struct page *regs_page; void *regs; gpa_t vapic_addr; struct page *vapic_page; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f02b8edc3d44..22fae7593ee7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -111,9 +111,6 @@ module_param(oos_shadow, bool, 0644); #define PT64_LEVEL_SHIFT(level) \ (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) -#define PT64_LEVEL_MASK(level) \ - (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) - #define PT64_INDEX(address, level)\ (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) @@ -123,8 +120,6 @@ module_param(oos_shadow, bool, 0644); #define PT32_LEVEL_SHIFT(level) \ (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) -#define PT32_LEVEL_MASK(level) \ - (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) #define PT32_LVL_OFFSET_MASK(level) \ (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) @@ -379,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, static int 
mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, int min) { - struct page *page; + void *page; if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = alloc_page(GFP_KERNEL); + page = (void *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; - cache->objects[cache->nobjs++] = page_address(page); + cache->objects[cache->nobjs++] = page; } return 0; } @@ -554,13 +549,23 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static struct kvm_memory_slot * +gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, + bool no_dirty_log) { struct kvm_memory_slot *slot; - slot = gfn_to_memslot(vcpu->kvm, large_gfn); - if (slot && slot->dirty_bitmap) - return true; - return false; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID || + (no_dirty_log && slot->dirty_bitmap)) + slot = NULL; + + return slot; +} + +static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); } static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) @@ -1032,9 +1037,9 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) ASSERT(is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); - __free_page(virt_to_page(sp->spt)); + free_page((unsigned long)sp->spt); if (!sp->role.direct) - __free_page(virt_to_page(sp->gfns)); + free_page((unsigned long)sp->gfns); kmem_cache_free(mmu_page_header_cache, sp); kvm_mod_used_mmu_pages(kvm, -1); } @@ -1199,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { } +static void nonpaging_update_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + const void *pte, unsigned long mmu_seq) +{ + WARN_ON(1); +} + #define KVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { @@ -2150,26 +2162,13 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static struct kvm_memory_slot * -pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(vcpu->kvm, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) - slot = NULL; - - return slot; -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; unsigned long hva; - slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) { get_page(bad_page); return page_to_pfn(bad_page); @@ -2190,7 +2189,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, gfn_t gfn; gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); - if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) + if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) return -1; ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); @@ -2804,6 +2803,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; + context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -2933,6 +2933,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->prefetch_page = paging64_prefetch_page; 
context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; + context->update_pte = paging64_update_pte; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; @@ -2961,6 +2962,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->prefetch_page = paging32_prefetch_page; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; + context->update_pte = paging32_update_pte; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -2985,6 +2987,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; + context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; context->direct_map = true; @@ -3089,8 +3092,6 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) static int init_kvm_mmu(struct kvm_vcpu *vcpu) { - vcpu->arch.update_pte.pfn = bad_pfn; - if (mmu_is_nested(vcpu)) return init_kvm_nested_mmu(vcpu); else if (tdp_enabled) @@ -3164,7 +3165,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, - const void *new) + const void *new, unsigned long mmu_seq) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { ++vcpu->kvm->stat.mmu_pde_zapped; @@ -3172,10 +3173,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } ++vcpu->kvm->stat.mmu_pte_updated; - if (!sp->role.cr4_pae) - paging32_update_pte(vcpu, sp, spte, new); - else - paging64_update_pte(vcpu, sp, spte, new); + vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq); } static bool need_remote_flush(u64 old, u64 new) @@ -3210,28 +3208,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) return !!(spte && (*spte & shadow_accessed_mask)); } -static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - u64 gpte) -{ - gfn_t gfn; - pfn_t pfn; - - if (!is_present_gpte(gpte)) - return; - gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - - vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - pfn = gfn_to_pfn(vcpu->kvm, gfn); - - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } - vcpu->arch.update_pte.gfn = gfn; - vcpu->arch.update_pte.pfn = pfn; -} - static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) { u64 *spte = vcpu->arch.last_pte_updated; @@ -3253,21 +3229,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_mmu_page *sp; struct hlist_node *node; LIST_HEAD(invalid_list); - u64 entry, gentry; - u64 *spte; - unsigned offset = offset_in_page(gpa); - unsigned pte_size; - unsigned page_offset; - unsigned misaligned; - unsigned quadrant; - int level; - int flooded = 0; - int npte; - int r; - int invlpg_counter; + unsigned long mmu_seq; + u64 entry, gentry, *spte; + unsigned pte_size, page_offset, misaligned, quadrant, offset; + int level, npte, invlpg_counter, r, flooded = 0; bool remote_flush, local_flush, zap_page; zap_page = remote_flush = local_flush = false; + offset = offset_in_page(gpa); pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -3275,9 +3244,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, /* * Assume that the pte write on a page table of the same type - * as the current vcpu paging mode. 
This is nearly always true - * (might be false while changing modes). Note it is verified later - * by update_pte(). + * as the current vcpu paging mode since we update the sptes only + * when they have the same mode. */ if ((is_pae(vcpu) && bytes == 4) || !new) { /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ @@ -3303,15 +3271,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, break; } - mmu_guess_page_from_pte_write(vcpu, gpa, gentry); + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + spin_lock(&vcpu->kvm->mmu_lock); if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) gentry = 0; - kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); if (guest_initiated) { + kvm_mmu_access_page(vcpu, gfn); if (gfn == vcpu->arch.last_pt_write_gfn && !last_updated_pte_accessed(vcpu)) { ++vcpu->arch.last_pt_write_count; @@ -3375,7 +3345,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) & mask.word)) - mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); + mmu_pte_write_new_pte(vcpu, sp, spte, &gentry, + mmu_seq); if (!remote_flush && need_remote_flush(entry, *spte)) remote_flush = true; ++spte; @@ -3385,10 +3356,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); - if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { - kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); - vcpu->arch.update_pte.pfn = bad_pfn; - } } int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) @@ -3538,14 +3505,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (!test_bit(slot, sp->slot_bitmap)) continue; - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - continue; - pt = sp->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (!is_shadow_present_pte(pt[i]) || + !is_last_spte(pt[i], sp->role.level)) + continue; + + if (is_large_pte(pt[i])) { + drop_spte(kvm, &pt[i], + shadow_trap_nonpresent_pte); + --kvm->stat.lpages; + continue; + } + /* avoid RMW */ if (is_writable_pte(pt[i])) update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); + } } kvm_flush_remote_tlbs(kvm); } @@ -3583,7 +3559,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) if (nr_to_scan == 0) goto out; - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx, freed_pages; @@ -3606,7 +3582,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); out: return percpu_counter_read_positive(&kvm_total_used_mmu_pages); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6bccc24c4181..c6397795d865 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -31,7 +31,6 @@ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_BITS PT64_LEVEL_BITS #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 @@ -48,7 +47,6 @@ #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) 
#define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 #define CMPXCHG cmpxchg @@ -327,7 +325,7 @@ no_present: } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) + u64 *spte, const void *pte, unsigned long mmu_seq) { pt_element_t gpte; unsigned pte_access; @@ -339,16 +337,16 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); - if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) + pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); return; - pfn = vcpu->arch.update_pte.pfn; - if (is_error_pfn(pfn)) - return; - if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) + } + if (mmu_notifier_retry(vcpu, mmu_seq)) return; - kvm_get_pfn(pfn); + /* - * we call mmu_set_spte() with host_writable = true beacuse that + * we call mmu_set_spte() with host_writable = true because that * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). */ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, @@ -829,7 +827,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -#undef PT_LEVEL_MASK #undef PT_LVL_ADDR_MASK #undef PT_LVL_OFFSET_MASK #undef PT_LEVEL_BITS diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 63fec1531e89..6bb15d583e47 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -135,6 +135,8 @@ struct vcpu_svm { u32 *msrpm; + ulong nmi_iret_rip; + struct nested_state nested; bool nmi_singlestep; @@ -1153,8 +1155,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); load_gs_index(svm->host.gs); #else +#ifdef CONFIG_X86_32_LAZY_GS loadsegment(gs, svm->host.gs); #endif +#endif for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } @@ -2653,6 +2657,7 @@ static int iret_interception(struct vcpu_svm *svm) ++svm->vcpu.stat.nmi_window_exits; clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; + svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); return 1; } @@ -3474,7 +3479,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm->int3_injected = 0; - if (svm->vcpu.arch.hflags & HF_IRET_MASK) { + /* + * If we've made progress since setting HF_IRET_MASK, we've + * executed an IRET and can allow NMI injection. 
+ */ + if ((svm->vcpu.arch.hflags & HF_IRET_MASK) + && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) { svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); } @@ -3641,19 +3651,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else loadsegment(fs, svm->host.fs); +#ifndef CONFIG_X86_32_LAZY_GS + loadsegment(gs, svm->host.gs); +#endif #endif reload_tss(vcpu); local_irq_disable(); - stgi(); - vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) + kvm_before_handle_nmi(&svm->vcpu); + + stgi(); + + /* Any pending NMI will happen here */ + + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) + kvm_after_handle_nmi(&svm->vcpu); + sync_cr8_to_lapic(vcpu); svm->next_rip = 0; diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index fc7a101c4a35..abd86e865be3 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -25,7 +25,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer) /* * There is a race window between reading and incrementing, but we do - * not care about potentially loosing timer events in the !reinject + * not care about potentially losing timer events in the !reinject * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked * in vcpu_enter_guest. */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bf89ec2cfb82..5b4cdcbd154c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -93,14 +93,14 @@ module_param(yield_on_hlt, bool, S_IRUGO); * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive * executions of PAUSE in a loop. Also indicate if ple enabled. - * According to test, this time is usually small than 41 cycles. + * According to test, this time is usually smaller than 128 cycles. * ple_window: upper bound on the amount of time a guest is allowed to execute * in a PAUSE loop. Tests indicate that most spinlocks are held for * less than 2^12 cycles * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. 
*/ -#define KVM_VMX_DEFAULT_PLE_GAP 41 +#define KVM_VMX_DEFAULT_PLE_GAP 128 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; module_param(ple_gap, int, S_IRUGO); @@ -176,11 +176,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1333,19 +1333,25 @@ static __init int vmx_disabled_by_bios(void) rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); if (msr & FEATURE_CONTROL_LOCKED) { + /* launched w/ TXT and VMX disabled */ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) && tboot_enabled()) return 1; + /* launched w/o TXT and VMX only enabled w/ TXT */ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) && !tboot_enabled()) { printk(KERN_WARNING "kvm: disable TXT in the BIOS or " - " activate TXT before enabling KVM\n"); + "activate TXT before enabling KVM\n"); return 1; } + /* launched w/o TXT and VMX disabled */ + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && !tboot_enabled()) + return 1; } return 0; - /* locked but not enabled */ } static void kvm_cpu_vmxon(u64 addr) @@ -1683,6 +1689,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 0; + vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); @@ -1756,6 +1763,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; + /* + * Very old userspace does not call KVM_SET_TSS_ADDR before entering + * vcpu. Call it here with phys address pointing 16M below 4G. 
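Aside (sketch, not part of the patch): the reworked vmx_disabled_by_bios() above distinguishes three outcomes once IA32_FEATURE_CONTROL is locked. Restated as a tiny stand-alone predicate; the bit positions are assumptions used only for illustration:

#include <stdbool.h>

/* Assumed IA32_FEATURE_CONTROL bit layout (illustration only). */
#define FC_LOCKED      (1u << 0)
#define FC_VMX_IN_SMX  (1u << 1)   /* VMX allowed inside a TXT launch */
#define FC_VMX_PLAIN   (1u << 2)   /* VMX allowed outside SMX         */

/* Restates the three locked-MSR cases handled above. */
bool vmx_disabled_sketch(unsigned int msr, bool tboot_launched)
{
    if (!(msr & FC_LOCKED))
        return false;                          /* unlocked: not disabled */
    if (tboot_launched && !(msr & FC_VMX_IN_SMX))
        return true;                           /* TXT launch, VMX off    */
    if (!tboot_launched && !(msr & FC_VMX_PLAIN) && (msr & FC_VMX_IN_SMX))
        return true;                           /* only enabled with TXT  */
    if (!tboot_launched && !(msr & FC_VMX_PLAIN))
        return true;                           /* VMX disabled outright  */
    return false;
}

int main(void) { return vmx_disabled_sketch(FC_LOCKED, false) ? 0 : 1; }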
+ */ + if (!vcpu->kvm->arch.tss_addr) { + printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " + "called before entering vcpu\n"); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + vmx_set_tss_addr(vcpu->kvm, 0xfeffd000); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + } + + vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); @@ -1794,7 +1814,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) continue_rmode: kvm_mmu_reset_context(vcpu); - init_rmode(vcpu->kvm); } static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -2030,23 +2049,40 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(GUEST_CR4, hw_cr4); } -static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - return vmcs_readl(sf->base); -} - static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { + struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + struct kvm_save_segment *save; u32 ar; + if (vmx->rmode.vm86_active + && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES + || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS + || seg == VCPU_SREG_GS) + && !emulate_invalid_guest_state) { + switch (seg) { + case VCPU_SREG_TR: save = &vmx->rmode.tr; break; + case VCPU_SREG_ES: save = &vmx->rmode.es; break; + case VCPU_SREG_DS: save = &vmx->rmode.ds; break; + case VCPU_SREG_FS: save = &vmx->rmode.fs; break; + case VCPU_SREG_GS: save = &vmx->rmode.gs; break; + default: BUG(); + } + var->selector = save->selector; + var->base = save->base; + var->limit = save->limit; + ar = save->ar; + if (seg == VCPU_SREG_TR + || var->selector == vmcs_read16(sf->selector)) + goto use_saved_rmode_seg; + } var->base = vmcs_readl(sf->base); var->limit = vmcs_read32(sf->limit); var->selector = vmcs_read16(sf->selector); ar = vmcs_read32(sf->ar_bytes); +use_saved_rmode_seg: if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) ar = 0; var->type = ar & 15; @@ -2060,6 +2096,18 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->unusable = (ar >> 16) & 1; } +static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + struct kvm_segment s; + + if (to_vmx(vcpu)->rmode.vm86_active) { + vmx_get_segment(vcpu, &s, seg); + return s.base; + } + return vmcs_readl(sf->base); +} + static int vmx_get_cpl(struct kvm_vcpu *vcpu) { if (!is_protmode(vcpu)) @@ -2101,6 +2149,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, u32 ar; if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { + vmcs_write16(sf->selector, var->selector); vmx->rmode.tr.selector = var->selector; vmx->rmode.tr.base = var->base; vmx->rmode.tr.limit = var->limit; @@ -2361,11 +2410,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu) static int init_rmode_tss(struct kvm *kvm) { - gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; + gfn_t fn; u16 data = 0; - int ret = 0; - int r; + int r, idx, ret = 0; + idx = srcu_read_lock(&kvm->srcu); + fn = rmode_tss_base(kvm) >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2389,12 +2439,13 @@ static int init_rmode_tss(struct kvm *kvm) ret = 1; out: + srcu_read_unlock(&kvm->srcu, idx); return ret; } static int init_rmode_identity_map(struct kvm *kvm) { - int i, r, ret; + int i, idx, r, ret; pfn_t identity_map_pfn; u32 tmp; @@ 
-2409,6 +2460,7 @@ static int init_rmode_identity_map(struct kvm *kvm) return 1; ret = 0; identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; + idx = srcu_read_lock(&kvm->srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2424,6 +2476,7 @@ static int init_rmode_identity_map(struct kvm *kvm) kvm->arch.ept_identity_pagetable_done = true; ret = 1; out: + srcu_read_unlock(&kvm->srcu, idx); return ret; } @@ -2699,22 +2752,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) return 0; } -static int init_rmode(struct kvm *kvm) -{ - int idx, ret = 0; - - idx = srcu_read_lock(&kvm->srcu); - if (!init_rmode_tss(kvm)) - goto exit; - if (!init_rmode_identity_map(kvm)) - goto exit; - - ret = 1; -exit: - srcu_read_unlock(&kvm->srcu, idx); - return ret; -} - static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2722,10 +2759,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) int ret; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - if (!init_rmode(vmx->vcpu.kvm)) { - ret = -ENOMEM; - goto out; - } vmx->rmode.vm86_active = 0; @@ -2805,7 +2838,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); if (vm_need_tpr_shadow(vmx->vcpu.kvm)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->vcpu.arch.apic->regs_page)); + __pa(vmx->vcpu.arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); } @@ -2971,6 +3004,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) if (ret) return ret; kvm->arch.tss_addr = addr; + if (!init_rmode_tss(kvm)) + return -ENOMEM; + return 0; } @@ -3962,7 +3998,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu) #define Q "l" #endif -static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3991,6 +4027,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" + "push %%"R"cx \n\t" /* placeholder for guest rcx */ "push %%"R"cx \n\t" "cmp %%"R"sp, %c[host_rsp](%0) \n\t" "je 1f \n\t" @@ -4032,10 +4069,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ - "xchg %0, (%%"R"sp) \n\t" + "mov %0, %c[wordsize](%%"R"sp) \n\t" + "pop %0 \n\t" "mov %%"R"ax, %c[rax](%0) \n\t" "mov %%"R"bx, %c[rbx](%0) \n\t" - "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" + "pop"Q" %c[rcx](%0) \n\t" "mov %%"R"dx, %c[rdx](%0) \n\t" "mov %%"R"si, %c[rsi](%0) \n\t" "mov %%"R"di, %c[rdi](%0) \n\t" @@ -4053,7 +4091,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%cr2, %%"R"ax \n\t" "mov %%"R"ax, %c[cr2](%0) \n\t" - "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" + "pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, launched)), @@ -4076,7 +4114,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), #endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) + [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), + [wordsize]"i"(sizeof(ulong)) : "cc", "memory" , R"ax", R"bx", R"di", R"si" #ifdef CONFIG_X86_64 @@ -4183,8 +4222,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!kvm->arch.ept_identity_map_addr) 
kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + err = -ENOMEM; if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; + if (!init_rmode_identity_map(kvm)) + goto free_vmcs; } return &vmx->vcpu; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bcc0efce85bf..58f517b59645 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -81,9 +81,10 @@ * - enable LME and LMA per default on 64 bit KVM */ #ifdef CONFIG_X86_64 -static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; +static +u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); #else -static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; +static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM @@ -360,8 +361,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) void kvm_inject_nmi(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_NMI, vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.nmi_pending = 1; } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -525,8 +526,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); - if ((cr0 ^ old_cr0) & X86_CR0_PG) + if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + } if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); @@ -1017,7 +1020,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) unsigned long flags; s64 sdiff; - spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = data - native_read_tsc(); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; @@ -1028,7 +1031,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) /* * Special case: close write to TSC within 5 seconds of * another CPU is interpreted as an attempt to synchronize - * The 5 seconds is to accomodate host load / swapping as + * The 5 seconds is to accommodate host load / swapping as * well as any reset of TSC during the boot process. 
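Aside (not part of the patch): the efer_reserved_bits rewrite near the start of the x86.c diff above can be checked by hand. With the architectural EFER bit positions (SCE bit 0, LME bit 8, LMA bit 10, assumed here), the named-bit expressions reproduce the old magic constants exactly:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Architectural EFER bit positions assumed for this check. */
#define EFER_SCE (1ULL << 0)
#define EFER_LME (1ULL << 8)
#define EFER_LMA (1ULL << 10)

int main(void)
{
    uint64_t bits64 = ~(uint64_t)(EFER_SCE | EFER_LME | EFER_LMA);
    uint64_t bits32 = ~(uint64_t)EFER_SCE;

    /* Same values the old magic constants encoded. */
    assert(bits64 == 0xfffffffffffffafeULL);
    assert(bits32 == 0xfffffffffffffffeULL);
    printf("64-bit: %#llx, 32-bit: %#llx\n",
           (unsigned long long)bits64, (unsigned long long)bits32);
    return 0;
}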
* * In that case, for a reliable TSC, we can match TSC offsets, @@ -1050,7 +1053,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_offset = offset; kvm_x86_ops->write_tsc_offset(vcpu, offset); - spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); /* Reset of TSC must disable overshoot protection below */ vcpu->arch.hv_clock.tsc_timestamp = 0; @@ -1453,6 +1456,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } +static void kvmclock_reset(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1510,10 +1521,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + kvmclock_reset(vcpu); vcpu->arch.time = data; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -1592,6 +1600,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } else return set_msr_hyperv(vcpu, msr, data); break; + case MSR_IA32_BBL_CR_CTL3: + /* Drop writes to this legacy MSR -- see rdmsr + * counterpart for further detail. + */ + pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1846,6 +1860,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) } else return get_msr_hyperv(vcpu, msr, pdata); break; + case MSR_IA32_BBL_CR_CTL3: + /* This legacy MSR exists but isn't fully documented in current + * silicon. It is however accessed by winxp in very narrow + * scenarios where it sets bit #19, itself documented as + * a "reserved" bit. 
Best effort attempt to source coherent + * read data here should the balance of the register be + * interpreted by the guest: + * + * L2 cache control register 3: 64GB range, 256KB size, + * enabled, latency 0x1, configured + */ + data = 0xbe702111; + break; default: if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); @@ -2100,8 +2127,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (check_tsc_unstable()) { kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); vcpu->arch.tsc_catchup = 1; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; @@ -2575,9 +2602,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { - printk(KERN_DEBUG "kvm: set_mce: " - "injects mce exception while " - "previous one is in progress!\n"); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } @@ -2648,8 +2672,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.pending = events->interrupt.injected; vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; - if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) kvm_x86_ops->set_interrupt_shadow(vcpu, events->interrupt.shadow); @@ -4140,8 +4162,8 @@ static unsigned long emulator_get_cached_segment_base(int seg, return get_segment_base(vcpu, seg); } -static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3, + int seg, struct kvm_vcpu *vcpu) { struct kvm_segment var; @@ -4154,6 +4176,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, var.limit >>= 12; set_desc_limit(desc, var.limit); set_desc_base(desc, (unsigned long)var.base); +#ifdef CONFIG_X86_64 + if (base3) + *base3 = var.base >> 32; +#endif desc->type = var.type; desc->s = var.s; desc->dpl = var.dpl; @@ -4166,8 +4192,8 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, return true; } -static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3, + int seg, struct kvm_vcpu *vcpu) { struct kvm_segment var; @@ -4175,6 +4201,9 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, kvm_get_segment(vcpu, &var, seg); var.base = get_desc_base(desc); +#ifdef CONFIG_X86_64 + var.base |= ((u64)base3) << 32; +#endif var.limit = get_desc_limit(desc); if (desc->g) var.limit = (var.limit << 12) | 0xfff; @@ -4390,41 +4419,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.have_exception = false; vcpu->arch.emulate_ctxt.perm_ok = false; + vcpu->arch.emulate_ctxt.only_vendor_specific_insn + = emulation_type & EMULTYPE_TRAP_UD; + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); - if (r == X86EMUL_PROPAGATE_FAULT) - goto done; trace_kvm_emulate_insn_start(vcpu); - - /* Only allow emulation of specific instructions on #UD - * (namely VMMCALL, sysenter, sysexit, syscall)*/ - if (emulation_type & EMULTYPE_TRAP_UD) { - if (!c->twobyte) - return EMULATE_FAIL; - switch (c->b) { - case 0x01: /* VMMCALL */ - if 
(c->modrm_mod != 3 || c->modrm_rm != 1) - return EMULATE_FAIL; - break; - case 0x34: /* sysenter */ - case 0x35: /* sysexit */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return EMULATE_FAIL; - break; - case 0x05: /* syscall */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return EMULATE_FAIL; - break; - default: - return EMULATE_FAIL; - } - - if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) - return EMULATE_FAIL; - } - ++vcpu->stat.insn_emulation; if (r) { + if (emulation_type & EMULTYPE_TRAP_UD) + return EMULATE_FAIL; if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) @@ -4452,7 +4456,6 @@ restart: return handle_emulation_failure(vcpu); } -done: if (vcpu->arch.emulate_ctxt.have_exception) { inject_emulated_exception(vcpu); r = EMULATE_DONE; @@ -4562,7 +4565,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) @@ -4572,7 +4575,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va send_ipi = 1; } } - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* @@ -5185,6 +5188,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 1; goto out; } + if (kvm_check_request(KVM_REQ_NMI, vcpu)) + vcpu->arch.nmi_pending = true; } r = kvm_mmu_reload(vcpu); @@ -5213,14 +5218,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_guest_fpu(vcpu); kvm_load_guest_xcr0(vcpu); - atomic_set(&vcpu->guest_mode, 1); - smp_wmb(); + vcpu->mode = IN_GUEST_MODE; + + /* We should set ->mode before check ->requests, + * see the comment in make_all_cpus_request. 
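Aside (rough sketch, not the KVM API): the full barrier added just below this comment pairs the vcpu->mode store with the subsequent vcpu->requests check, while the requester side does the mirror-image store/fence/load before deciding whether to kick. In portable C11 terms the shape is roughly:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int mode;      /* stands in for vcpu->mode     */
static atomic_int requests;  /* stands in for vcpu->requests */

/* vCPU side: publish IN_GUEST_MODE, full fence, then look for requests. */
static bool enter_guest(void)
{
    atomic_store_explicit(&mode, 1, memory_order_relaxed);
    atomic_thread_fence(memory_order_seq_cst);      /* the smp_mb() above  */
    if (atomic_load_explicit(&requests, memory_order_relaxed))
        return false;                               /* service the request */
    return true;                                    /* safe to enter guest */
}

/* Requester side: post the request, full fence, then decide on the kick. */
static bool make_request_needs_kick(void)
{
    atomic_store_explicit(&requests, 1, memory_order_relaxed);
    atomic_thread_fence(memory_order_seq_cst);
    return atomic_load_explicit(&mode, memory_order_relaxed) == 1;
}

int main(void)
{
    /* With both fences in place, enter_guest() missing the request and
     * make_request_needs_kick() missing the mode change cannot both
     * happen, so a concurrent request is never lost. */
    printf("%d %d\n", make_request_needs_kick(), enter_guest());
    return 0;
}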
+ */ + smp_mb(); local_irq_disable(); - if (!atomic_read(&vcpu->guest_mode) || vcpu->requests + if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests || need_resched() || signal_pending(current)) { - atomic_set(&vcpu->guest_mode, 0); + vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); preempt_enable(); @@ -5256,7 +5265,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); - atomic_set(&vcpu->guest_mode, 0); + vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); @@ -5574,7 +5583,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { int mmu_reset_needed = 0; - int pending_vec, max_bits; + int pending_vec, max_bits, idx; struct desc_ptr dt; dt.size = sregs->idt.limit; @@ -5603,10 +5612,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (sregs->cr4 & X86_CR4_OSXSAVE) update_cpuid(vcpu); + + idx = srcu_read_lock(&vcpu->kvm->srcu); if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } + srcu_read_unlock(&vcpu->kvm->srcu, idx); if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); @@ -5617,8 +5629,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (pending_vec < max_bits) { kvm_queue_interrupt(vcpu, pending_vec, false); pr_debug("Set back pending irq %d\n", pending_vec); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); @@ -5814,10 +5824,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + kvmclock_reset(vcpu); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fx_free(vcpu); @@ -5878,6 +5885,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; + kvmclock_reset(vcpu); + kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; @@ -6005,7 +6014,7 @@ int kvm_arch_init_vm(struct kvm *kvm) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - spin_lock_init(&kvm->arch.tsc_write_lock); + raw_spin_lock_init(&kvm->arch.tsc_write_lock); return 0; } @@ -6103,7 +6112,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, int user_alloc) { - int npages = mem->memory_size >> PAGE_SHIFT; + int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; if (!user_alloc && !old.user_alloc && old.rmap && !npages) { int ret; @@ -6118,12 +6127,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, "failed to munmap memory\n"); } + if (!kvm->arch.n_requested_mmu_pages) + nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + spin_lock(&kvm->mmu_lock); - if (!kvm->arch.n_requested_mmu_pages) { - unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - } - kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); } @@ -6157,7 +6166,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (atomic_xchg(&vcpu->guest_mode, 0)) + if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE) smp_send_reschedule(cpu); put_cpu(); } diff --git a/arch/x86/lguest/boot.c 
b/arch/x86/lguest/boot.c index b9ec1c74943c..1cd608973ce5 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -397,7 +397,7 @@ static void lguest_load_tr_desc(void) * instead we just use the real "cpuid" instruction. Then I pretty much turned * off feature bits until the Guest booted. (Don't say that: you'll damage * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is - * hardly future proof.) Noone's listening! They don't like you anyway, + * hardly future proof.) No one's listening! They don't like you anyway, * parenthetic weirdo! * * Replacing the cpuid so we can turn features off is great for the kernel, but diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index e10cf070ede0..f2479f19ddde 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -42,4 +42,5 @@ else lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o + lib-y += cmpxchg16b_emu.o endif diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S new file mode 100644 index 000000000000..3e8b08a6de2b --- /dev/null +++ b/arch/x86/lib/cmpxchg16b_emu.S @@ -0,0 +1,59 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ +#include <linux/linkage.h> +#include <asm/alternative-asm.h> +#include <asm/frame.h> +#include <asm/dwarf2.h> + +.text + +/* + * Inputs: + * %rsi : memory location to compare + * %rax : low 64 bits of old value + * %rdx : high 64 bits of old value + * %rbx : low 64 bits of new value + * %rcx : high 64 bits of new value + * %al : Operation successful + */ +ENTRY(this_cpu_cmpxchg16b_emu) +CFI_STARTPROC + +# +# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not +# via the ZF. Caller will access %al to get result. +# +# Note that this is only useful for a cpuops operation. Meaning that we +# do *not* have a fully atomic operation but just an operation that is +# *atomic* on a single cpu (as provided by the this_cpu_xx class of +# macros). +# +this_cpu_cmpxchg16b_emu: + pushf + cli + + cmpq %gs:(%rsi), %rax + jne not_same + cmpq %gs:8(%rsi), %rdx + jne not_same + + movq %rbx, %gs:(%rsi) + movq %rcx, %gs:8(%rsi) + + popf + mov $1, %al + ret + + not_same: + popf + xor %al,%al + ret + +CFI_ENDPROC + +ENDPROC(this_cpu_cmpxchg16b_emu) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index a460158b5ac5..99e482615195 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -117,7 +117,7 @@ ENDPROC(bad_from_user) * rdx count * * Output: - * eax uncopied bytes or 0 if successfull. + * eax uncopied bytes or 0 if successful. */ ENTRY(copy_user_generic_unrolled) CFI_STARTPROC diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index f0dba36578ea..fb903b758da8 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -1,6 +1,6 @@ /* - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - * + * Copyright 2002, 2003 Andi Kleen, SuSE Labs. + * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive * for more details. No warranty for anything given at all. @@ -11,82 +11,82 @@ /* * Checksum copy with exception handling. 
- * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the + * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the * destination is zeroed. - * + * * Input * rdi source * rsi destination * edx len (32bit) - * ecx sum (32bit) + * ecx sum (32bit) * r8 src_err_ptr (int) * r9 dst_err_ptr (int) * * Output * eax 64bit sum. undefined in case of exception. - * - * Wrappers need to take care of valid exception sum and zeroing. + * + * Wrappers need to take care of valid exception sum and zeroing. * They also should align source or destination to 8 bytes. */ .macro source 10: - .section __ex_table,"a" + .section __ex_table, "a" .align 8 - .quad 10b,.Lbad_source + .quad 10b, .Lbad_source .previous .endm - + .macro dest 20: - .section __ex_table,"a" + .section __ex_table, "a" .align 8 - .quad 20b,.Lbad_dest + .quad 20b, .Lbad_dest .previous .endm - + .macro ignore L=.Lignore 30: - .section __ex_table,"a" + .section __ex_table, "a" .align 8 - .quad 30b,\L + .quad 30b, \L .previous .endm - - + + ENTRY(csum_partial_copy_generic) CFI_STARTPROC - cmpl $3*64,%edx - jle .Lignore + cmpl $3*64, %edx + jle .Lignore -.Lignore: - subq $7*8,%rsp +.Lignore: + subq $7*8, %rsp CFI_ADJUST_CFA_OFFSET 7*8 - movq %rbx,2*8(%rsp) + movq %rbx, 2*8(%rsp) CFI_REL_OFFSET rbx, 2*8 - movq %r12,3*8(%rsp) + movq %r12, 3*8(%rsp) CFI_REL_OFFSET r12, 3*8 - movq %r14,4*8(%rsp) + movq %r14, 4*8(%rsp) CFI_REL_OFFSET r14, 4*8 - movq %r13,5*8(%rsp) + movq %r13, 5*8(%rsp) CFI_REL_OFFSET r13, 5*8 - movq %rbp,6*8(%rsp) + movq %rbp, 6*8(%rsp) CFI_REL_OFFSET rbp, 6*8 - movq %r8,(%rsp) - movq %r9,1*8(%rsp) - - movl %ecx,%eax - movl %edx,%ecx + movq %r8, (%rsp) + movq %r9, 1*8(%rsp) - xorl %r9d,%r9d - movq %rcx,%r12 + movl %ecx, %eax + movl %edx, %ecx - shrq $6,%r12 - jz .Lhandle_tail /* < 64 */ + xorl %r9d, %r9d + movq %rcx, %r12 + + shrq $6, %r12 + jz .Lhandle_tail /* < 64 */ clc - + /* main loop. 
clear in 64 byte blocks */ /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ /* r11: temp3, rdx: temp4, r12 loopcnt */ @@ -94,156 +94,156 @@ ENTRY(csum_partial_copy_generic) .p2align 4 .Lloop: source - movq (%rdi),%rbx + movq (%rdi), %rbx source - movq 8(%rdi),%r8 + movq 8(%rdi), %r8 source - movq 16(%rdi),%r11 + movq 16(%rdi), %r11 source - movq 24(%rdi),%rdx + movq 24(%rdi), %rdx source - movq 32(%rdi),%r10 + movq 32(%rdi), %r10 source - movq 40(%rdi),%rbp + movq 40(%rdi), %rbp source - movq 48(%rdi),%r14 + movq 48(%rdi), %r14 source - movq 56(%rdi),%r13 - + movq 56(%rdi), %r13 + ignore 2f prefetcht0 5*64(%rdi) -2: - adcq %rbx,%rax - adcq %r8,%rax - adcq %r11,%rax - adcq %rdx,%rax - adcq %r10,%rax - adcq %rbp,%rax - adcq %r14,%rax - adcq %r13,%rax +2: + adcq %rbx, %rax + adcq %r8, %rax + adcq %r11, %rax + adcq %rdx, %rax + adcq %r10, %rax + adcq %rbp, %rax + adcq %r14, %rax + adcq %r13, %rax decl %r12d - + dest - movq %rbx,(%rsi) + movq %rbx, (%rsi) dest - movq %r8,8(%rsi) + movq %r8, 8(%rsi) dest - movq %r11,16(%rsi) + movq %r11, 16(%rsi) dest - movq %rdx,24(%rsi) + movq %rdx, 24(%rsi) dest - movq %r10,32(%rsi) + movq %r10, 32(%rsi) dest - movq %rbp,40(%rsi) + movq %rbp, 40(%rsi) dest - movq %r14,48(%rsi) + movq %r14, 48(%rsi) dest - movq %r13,56(%rsi) - + movq %r13, 56(%rsi) + 3: - - leaq 64(%rdi),%rdi - leaq 64(%rsi),%rsi - jnz .Lloop + leaq 64(%rdi), %rdi + leaq 64(%rsi), %rsi - adcq %r9,%rax + jnz .Lloop - /* do last upto 56 bytes */ + adcq %r9, %rax + + /* do last up to 56 bytes */ .Lhandle_tail: /* ecx: count */ - movl %ecx,%r10d - andl $63,%ecx - shrl $3,%ecx - jz .Lfold + movl %ecx, %r10d + andl $63, %ecx + shrl $3, %ecx + jz .Lfold clc .p2align 4 -.Lloop_8: +.Lloop_8: source - movq (%rdi),%rbx - adcq %rbx,%rax + movq (%rdi), %rbx + adcq %rbx, %rax decl %ecx dest - movq %rbx,(%rsi) - leaq 8(%rsi),%rsi /* preserve carry */ - leaq 8(%rdi),%rdi + movq %rbx, (%rsi) + leaq 8(%rsi), %rsi /* preserve carry */ + leaq 8(%rdi), %rdi jnz .Lloop_8 - adcq %r9,%rax /* add in carry */ + adcq %r9, %rax /* add in carry */ .Lfold: /* reduce checksum to 32bits */ - movl %eax,%ebx - shrq $32,%rax - addl %ebx,%eax - adcl %r9d,%eax + movl %eax, %ebx + shrq $32, %rax + addl %ebx, %eax + adcl %r9d, %eax - /* do last upto 6 bytes */ + /* do last up to 6 bytes */ .Lhandle_7: - movl %r10d,%ecx - andl $7,%ecx - shrl $1,%ecx + movl %r10d, %ecx + andl $7, %ecx + shrl $1, %ecx jz .Lhandle_1 - movl $2,%edx - xorl %ebx,%ebx - clc + movl $2, %edx + xorl %ebx, %ebx + clc .p2align 4 -.Lloop_1: +.Lloop_1: source - movw (%rdi),%bx - adcl %ebx,%eax + movw (%rdi), %bx + adcl %ebx, %eax decl %ecx dest - movw %bx,(%rsi) - leaq 2(%rdi),%rdi - leaq 2(%rsi),%rsi + movw %bx, (%rsi) + leaq 2(%rdi), %rdi + leaq 2(%rsi), %rsi jnz .Lloop_1 - adcl %r9d,%eax /* add in carry */ - + adcl %r9d, %eax /* add in carry */ + /* handle last odd byte */ .Lhandle_1: - testl $1,%r10d + testl $1, %r10d jz .Lende - xorl %ebx,%ebx + xorl %ebx, %ebx source - movb (%rdi),%bl + movb (%rdi), %bl dest - movb %bl,(%rsi) - addl %ebx,%eax - adcl %r9d,%eax /* carry */ - + movb %bl, (%rsi) + addl %ebx, %eax + adcl %r9d, %eax /* carry */ + CFI_REMEMBER_STATE .Lende: - movq 2*8(%rsp),%rbx + movq 2*8(%rsp), %rbx CFI_RESTORE rbx - movq 3*8(%rsp),%r12 + movq 3*8(%rsp), %r12 CFI_RESTORE r12 - movq 4*8(%rsp),%r14 + movq 4*8(%rsp), %r14 CFI_RESTORE r14 - movq 5*8(%rsp),%r13 + movq 5*8(%rsp), %r13 CFI_RESTORE r13 - movq 6*8(%rsp),%rbp + movq 6*8(%rsp), %rbp CFI_RESTORE rbp - addq $7*8,%rsp + addq $7*8, %rsp CFI_ADJUST_CFA_OFFSET -7*8 ret 
CFI_RESTORE_STATE /* Exception handlers. Very simple, zeroing is done in the wrappers */ .Lbad_source: - movq (%rsp),%rax - testq %rax,%rax + movq (%rsp), %rax + testq %rax, %rax jz .Lende - movl $-EFAULT,(%rax) + movl $-EFAULT, (%rax) jmp .Lende - + .Lbad_dest: - movq 8(%rsp),%rax - testq %rax,%rax - jz .Lende - movl $-EFAULT,(%rax) + movq 8(%rsp), %rax + testq %rax, %rax + jz .Lende + movl $-EFAULT, (%rax) jmp .Lende CFI_ENDPROC ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c index bf51144d97e1..9845371c5c36 100644 --- a/arch/x86/lib/csum-partial_64.c +++ b/arch/x86/lib/csum-partial_64.c @@ -84,7 +84,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len) count64--; } - /* last upto 7 8byte blocks */ + /* last up to 7 8byte blocks */ count %= 8; while (count) { asm("addq %1,%0\n\t" diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 069ce7c37c01..d4203988504a 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -326,7 +326,7 @@ try_again: if (mm->free_area_cache < len) goto fail; - /* either no address requested or cant fit in requested address hole */ + /* either no address requested or can't fit in requested address hole */ addr = (mm->free_area_cache - len) & huge_page_mask(h); do { /* diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 73ad7ebd6e9c..80088f994193 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -917,7 +917,7 @@ static void mark_nxdata_nx(void) { /* * When this called, init has already been executed and released, - * so everything past _etext sould be NX. + * so everything past _etext should be NX. */ unsigned long start = PFN_ALIGN(_etext); /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a08a62cb136e..794233587287 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -51,6 +51,8 @@ #include <asm/numa.h> #include <asm/cacheflush.h> #include <asm/init.h> +#include <asm/uv/uv.h> +#include <asm/setup.h> static int __init parse_direct_gbpages_off(char *arg) { @@ -293,18 +295,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) * to the compile time generated pmds. This results in invalid pmds up * to the point where we hit the physaddr 0 mapping. * - * We limit the mappings to the region from _text to _end. _end is - * rounded up to the 2MB boundary. This catches the invalid pmds as + * We limit the mappings to the region from _text to _brk_end. _brk_end + * is rounded up to the 2MB boundary. 
This catches the invalid pmds as * well, as they are located before _text: */ void __init cleanup_highmap(void) { unsigned long vaddr = __START_KERNEL_map; - unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; + unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT); + unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; pmd_t *pmd = level2_kernel_pgt; - pmd_t *last_pmd = pmd + PTRS_PER_PMD; - for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { + for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) { if (pmd_none(*pmd)) continue; if (vaddr < (unsigned long) _text || vaddr > end) @@ -860,18 +862,18 @@ static struct vm_area_struct gate_vma = { .vm_flags = VM_READ | VM_EXEC }; -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(tsk, TIF_IA32)) + if (!mm || mm->context.ia32_compat) return NULL; #endif return &gate_vma; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = get_gate_vma(task); + struct vm_area_struct *vma = get_gate_vma(mm); if (!vma) return 0; @@ -880,11 +882,11 @@ int in_gate_area(struct task_struct *task, unsigned long addr) } /* - * Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives: + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. */ -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } @@ -898,6 +900,19 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } +#ifdef CONFIG_X86_UV +#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS) + +unsigned long memory_block_size_bytes(void) +{ + if (is_uv_system()) { + printk(KERN_INFO "UV: memory block size 2GB\n"); + return 2UL * 1024 * 1024 * 1024; + } + return MIN_MEMORY_BLOCK_SIZE; +} +#endif + #ifdef CONFIG_SPARSEMEM_VMEMMAP /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 9ec0f209a6a4..e8c00cc72033 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -446,7 +446,7 @@ static int __init numa_alloc_distance(void) * @distance: NUMA distance * * Set the distance from node @from to @to to @distance. If distance table - * doesn't exist, one which is large enough to accomodate all the currently + * doesn't exist, one which is large enough to accommodate all the currently * known nodes will be created. * * If such table cannot be allocated, a warning is printed and further diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 90825f2eb0f4..f9e526742fa1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -310,7 +310,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, * these shared mappings are made of small page mappings. * Thus this don't enforce !RW mapping for small page kernel * text mapping logic will help Linux Xen parvirt guest boot - * aswell. + * as well. 
*/ if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) pgprot_val(forbidden) |= _PAGE_RW; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 0113d19c8aa6..8573b83a63d0 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -168,8 +168,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... */ - if (mm == current->active_mm) - write_cr3(read_cr3()); + flush_tlb_mm(mm); } #else /* !CONFIG_X86_PAE */ diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 72cbec14d783..2d49d4e19a36 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) if (!user_mode_vm(regs)) { unsigned long stack = kernel_stack_pointer(regs); if (depth) - dump_trace(NULL, regs, (unsigned long *)stack, + dump_trace(NULL, regs, (unsigned long *)stack, 0, &backtrace_ops, &depth); return; } diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 9fadec074142..98ab13058f89 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -50,7 +50,7 @@ static inline void setup_num_counters(void) #endif } -static int inline addr_increment(void) +static inline int addr_increment(void) { #ifdef CONFIG_SMP return smp_num_siblings == 2 ? 2 : 1; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index b1805b78842f..494f2e7ea2b4 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -241,7 +241,7 @@ void __init pcibios_resource_survey(void) e820_reserve_resources_late(); /* * Insert the IO APIC resources after PCI initialization has - * occured to handle IO APICS that are mapped in on a BAR in + * occurred to handle IO APICS that are mapped in on a BAR in * PCI space, but before trying to assign unassigned pci res. */ ioapic_insert_resources(); @@ -304,7 +304,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, /* * ioremap() and ioremap_nocache() defaults to UC MINUS for now. * To avoid attribute conflicts, request UC MINUS here - * aswell. + * as well. 
*/ prot |= _PAGE_CACHE_UC_MINUS; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 87e6c8323117..8201165bae28 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -597,21 +597,18 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) { + if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX) + || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX) + || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) { - r->name = "PIIX/ICH"; - r->get = pirq_piix_get; - r->set = pirq_piix_set; - return 1; - } return 0; } diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index 8c4085a95ef1..e37b407a0ee8 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -50,7 +50,7 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, name = "ioapic-level"; } - irq = xen_map_pirq_gsi(map_irq.pirq, gsi, shareable, name); + irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name); printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); @@ -237,6 +237,7 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) { int rc; int share = 1; + int pirq; u8 gsi; rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi); @@ -246,13 +247,21 @@ static int xen_pcifront_enable_irq(struct pci_dev *dev) return rc; } + rc = xen_allocate_pirq_gsi(gsi); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n", + gsi, rc); + return rc; + } + pirq = rc; + if (gsi < NR_IRQS_LEGACY) share = 0; - rc = xen_allocate_pirq(gsi, share, "pcifront"); + rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront"); if (rc < 0) { - dev_warn(&dev->dev, "Xen PCI: failed to register GSI%d: %d\n", - gsi, rc); + dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n", + gsi, pirq, rc); return rc; } @@ -309,7 +318,7 @@ int __init pci_xen_hvm_init(void) #ifdef CONFIG_XEN_DOM0 static int xen_register_pirq(u32 gsi, int triggering) { - int rc, irq; + int rc, pirq, irq = -1; struct physdev_map_pirq map_irq; int shareable = 0; char *name; @@ -325,17 +334,20 @@ static int xen_register_pirq(u32 gsi, int triggering) name = "ioapic-level"; } - irq = xen_allocate_pirq(gsi, shareable, name); - - printk(KERN_DEBUG "xen: --> irq=%d\n", irq); + pirq = xen_allocate_pirq_gsi(gsi); + if (pirq < 0) + goto out; + irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name); if (irq < 0) goto out; + printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d\n", pirq, irq); + map_irq.domid = DOMID_SELF; map_irq.type = MAP_PIRQ_TYPE_GSI; map_irq.index = gsi; - map_irq.pirq = irq; + map_irq.pirq = pirq; rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); if (rc) { @@ -422,13 +434,18 @@ static int __init pci_xen_initial_domain(void) void __init xen_setup_pirqs(void) { - int irq; + int pirq, irq; pci_xen_initial_domain(); if (0 == nr_ioapics) { - for (irq = 0; irq < NR_IRQS_LEGACY; irq++) - xen_allocate_pirq(irq, 0, "xt-pic"); + for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { + pirq = xen_allocate_pirq_gsi(irq); + if (WARN(pirq < 0, + "Could not allocate PIRQ for legacy 
interrupt\n")) + break; + irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic"); + } return; } diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c index 127775696d6c..99513642a0e6 100644 --- a/arch/x86/platform/olpc/olpc-xo1.c +++ b/arch/x86/platform/olpc/olpc-xo1.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/platform_device.h> #include <linux/pm.h> +#include <linux/mfd/core.h> #include <asm/io.h> #include <asm/olpc.h> @@ -56,25 +57,24 @@ static void xo1_power_off(void) static int __devinit olpc_xo1_probe(struct platform_device *pdev) { struct resource *res; + int err; /* don't run on non-XOs */ if (!machine_is_olpc()) return -ENODEV; + err = mfd_cell_enable(pdev); + if (err) + return err; + res = platform_get_resource(pdev, IORESOURCE_IO, 0); if (!res) { dev_err(&pdev->dev, "can't fetch device resource info\n"); return -EIO; } - - if (!request_region(res->start, resource_size(res), DRV_NAME)) { - dev_err(&pdev->dev, "can't request region\n"); - return -EIO; - } - - if (strcmp(pdev->name, "cs5535-pms") == 0) + if (strcmp(pdev->name, "olpc-xo1-pms") == 0) pms_base = res->start; - else if (strcmp(pdev->name, "cs5535-acpi") == 0) + else if (strcmp(pdev->name, "olpc-xo1-ac-acpi") == 0) acpi_base = res->start; /* If we have both addresses, we can override the poweroff hook */ @@ -88,14 +88,11 @@ static int __devinit olpc_xo1_probe(struct platform_device *pdev) static int __devexit olpc_xo1_remove(struct platform_device *pdev) { - struct resource *r; - - r = platform_get_resource(pdev, IORESOURCE_IO, 0); - release_region(r->start, resource_size(r)); + mfd_cell_disable(pdev); - if (strcmp(pdev->name, "cs5535-pms") == 0) + if (strcmp(pdev->name, "olpc-xo1-pms") == 0) pms_base = 0; - else if (strcmp(pdev->name, "cs5535-acpi") == 0) + else if (strcmp(pdev->name, "olpc-xo1-acpi") == 0) acpi_base = 0; pm_power_off = NULL; @@ -104,7 +101,7 @@ static int __devexit olpc_xo1_remove(struct platform_device *pdev) static struct platform_driver cs5535_pms_drv = { .driver = { - .name = "cs5535-pms", + .name = "olpc-xo1-pms", .owner = THIS_MODULE, }, .probe = olpc_xo1_probe, @@ -113,7 +110,7 @@ static struct platform_driver cs5535_pms_drv = { static struct platform_driver cs5535_acpi_drv = { .driver = { - .name = "cs5535-acpi", + .name = "olpc-xo1-acpi", .owner = THIS_MODULE, }, .probe = olpc_xo1_probe, @@ -124,26 +121,27 @@ static int __init olpc_xo1_init(void) { int r; - r = platform_driver_register(&cs5535_pms_drv); + r = mfd_shared_platform_driver_register(&cs5535_pms_drv, "cs5535-pms"); if (r) return r; - r = platform_driver_register(&cs5535_acpi_drv); + r = mfd_shared_platform_driver_register(&cs5535_acpi_drv, + "cs5535-acpi"); if (r) - platform_driver_unregister(&cs5535_pms_drv); + mfd_shared_platform_driver_unregister(&cs5535_pms_drv); return r; } static void __exit olpc_xo1_exit(void) { - platform_driver_unregister(&cs5535_acpi_drv); - platform_driver_unregister(&cs5535_pms_drv); + mfd_shared_platform_driver_unregister(&cs5535_acpi_drv); + mfd_shared_platform_driver_unregister(&cs5535_pms_drv); } MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>"); MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:olpc-xo1"); +MODULE_ALIAS("platform:cs5535-pms"); module_init(olpc_xo1_init); module_exit(olpc_xo1_exit); diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 36df991985b2..468d591dde31 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -417,24 +417,25 @@ const char *arch_vma_name(struct vm_area_struct *vma) return 
NULL; } -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { - struct mm_struct *mm = tsk->mm; - - /* Check to see if this task was created in compat vdso mode */ + /* + * Check to see if the corresponding task was created in compat vdso + * mode. + */ if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) return &gate_vma; return NULL; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - const struct vm_area_struct *vma = get_gate_vma(task); + const struct vm_area_struct *vma = get_gate_vma(mm); return vma && addr >= vma->vm_start && addr < vma->vm_end; } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index e4343fe488ed..1c7121ba18ff 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -38,7 +38,7 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool - depends on XEN && PM + depends on XEN default y config XEN_DEBUG_FS diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3f6f3347aa17..c82df6c9c0f0 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -79,8 +79,7 @@ /* * Protects atomic reservation decrease/increase against concurrent increases. - * Also protects non-atomic updates of current_pages and driver_pages, and - * balloon lists. + * Also protects non-atomic updates of current_pages and balloon lists. */ DEFINE_SPINLOCK(xen_reservation_lock); @@ -1488,10 +1487,12 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) /* * If the new pfn is within the range of the newly allocated * kernel pagetable, and it isn't being mapped into an - * early_ioremap fixmap slot, make sure it is RO. + * early_ioremap fixmap slot as a freshly allocated page, make sure + * it is RO. */ - if (!is_early_ioremap_ptep(ptep) && - pfn >= pgt_buf_start && pfn < pgt_buf_end) + if (((!is_early_ioremap_ptep(ptep) && + pfn >= pgt_buf_start && pfn < pgt_buf_end)) || + (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) pte = pte_wrprotect(pte); return pte; @@ -1701,9 +1702,6 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { pte_t pte; - if (pfn > max_pfn_mapped) - max_pfn_mapped = pfn; - if (!pte_none(pte_page[pteidx])) continue; @@ -1745,7 +1743,7 @@ static void convert_pfn_mfn(void *v) } /* - * Set up the inital kernel pagetable. + * Set up the initial kernel pagetable. * * We can construct this by grafting the Xen provided pagetable into * head_64.S's preconstructed pagetables. We copy the Xen L2's into @@ -1761,6 +1759,12 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, pud_t *l3; pmd_t *l2; + /* max_pfn_mapped is the last pfn mapped in the initial memory + * mappings. Considering that on Xen after the kernel mappings we + * have the mappings of some pages that don't exist in pfn space, we + * set max_pfn_mapped to the last real pfn mapped. 
*/ + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); @@ -1865,9 +1869,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, initial_kernel_pmd = extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + - xen_start_info->nr_pt_frames * PAGE_SIZE + - 512*1024); + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
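Closing aside, looking back at the new arch/x86/lib/cmpxchg16b_emu.S earlier in this series: it implements a cpu-local 16-byte compare-and-exchange by blocking interrupts rather than using a locked CMPXCHG16B. A rough C restatement of its logic (helper names are hypothetical stand-ins, not kernel API):

#include <stdbool.h>
#include <stdint.h>

struct u128 { uint64_t lo, hi; };

/* Hypothetical stand-ins for the pushf/cli and popf in the asm. */
static void fake_pushf_cli(unsigned long *flags) { *flags = 0; }
static void fake_popf(unsigned long flags) { (void)flags; }

/* Like this_cpu_cmpxchg16b_emu: atomic only against code on the same CPU
 * (interrupts are blocked around the update), not against other CPUs. */
static bool cmpxchg16b_local_emu(struct u128 *mem, struct u128 old,
                                 struct u128 new)
{
    unsigned long flags;
    bool ok = false;

    fake_pushf_cli(&flags);        /* pushf; cli             */
    if (mem->lo == old.lo && mem->hi == old.hi) {
        mem->lo = new.lo;          /* movq %rbx, %gs:(%rsi)  */
        mem->hi = new.hi;          /* movq %rcx, %gs:8(%rsi) */
        ok = true;                 /* mov $1, %al            */
    }
    fake_popf(flags);              /* popf                   */
    return ok;                     /* result in %al, not ZF  */
}

int main(void)
{
    struct u128 slot = { 1, 2 };
    return cmpxchg16b_local_emu(&slot, (struct u128){ 1, 2 },
                                (struct u128){ 3, 4 }) ? 0 : 1;
}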