diff options
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/Kconfig | 2 | ||||
-rw-r--r-- | arch/x86/xen/Makefile | 1 | ||||
-rw-r--r-- | arch/x86/xen/apic.c | 2 | ||||
-rw-r--r-- | arch/x86/xen/enlighten.c | 281 | ||||
-rw-r--r-- | arch/x86/xen/mmu.c | 423 | ||||
-rw-r--r-- | arch/x86/xen/mmu.h | 1 | ||||
-rw-r--r-- | arch/x86/xen/pci-swiotlb-xen.c | 4 | ||||
-rw-r--r-- | arch/x86/xen/platform-pci-unplug.c | 4 | ||||
-rw-r--r-- | arch/x86/xen/setup.c | 103 | ||||
-rw-r--r-- | arch/x86/xen/smp.c | 81 | ||||
-rw-r--r-- | arch/x86/xen/smp.h | 8 | ||||
-rw-r--r-- | arch/x86/xen/spinlock.c | 19 | ||||
-rw-r--r-- | arch/x86/xen/time.c | 4 | ||||
-rw-r--r-- | arch/x86/xen/xen-head.S | 62 | ||||
-rw-r--r-- | arch/x86/xen/xen-ops.h | 1 | ||||
-rw-r--r-- | arch/x86/xen/xen-pvh.S | 161 |
16 files changed, 646 insertions, 511 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index c7b15f3e2cf3..76b6dbd627df 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -53,5 +53,5 @@ config XEN_DEBUG_FS config XEN_PVH bool "Support for running as a PVH guest" - depends on X86_64 && XEN && XEN_PVHVM + depends on XEN && XEN_PVHVM && ACPI def_bool n diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index e47e52787d32..cb0164aee156 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o obj-$(CONFIG_XEN_DOM0) += vga.o obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o obj-$(CONFIG_XEN_EFI) += efi.o +obj-$(CONFIG_XEN_PVH) += xen-pvh.o diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 44c88ad1841a..bcea81f36fc5 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -145,7 +145,7 @@ static void xen_silent_inquire(int apicid) static int xen_cpu_present_to_apicid(int cpu) { if (cpu_present(cpu)) - return xen_get_apic_id(xen_apic_read(APIC_ID)); + return cpu_data(cpu).apicid; else return BAD_APICID; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 51ef95232725..30822e8e64ac 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -45,6 +45,7 @@ #include <xen/interface/memory.h> #include <xen/interface/nmi.h> #include <xen/interface/xen-mca.h> +#include <xen/interface/hvm/start_info.h> #include <xen/features.h> #include <xen/page.h> #include <xen/hvm.h> @@ -75,6 +76,7 @@ #include <asm/mwait.h> #include <asm/pci_x86.h> #include <asm/cpu.h> +#include <asm/e820/api.h> #ifdef CONFIG_ACPI #include <linux/acpi.h> @@ -176,6 +178,20 @@ struct tls_descs { */ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); +#ifdef CONFIG_XEN_PVH +/* + * PVH variables. + * + * xen_pvh and pvh_bootparams need to live in data segment since they + * are used after startup_{32|64}, which clear .bss, are invoked. + */ +bool xen_pvh __attribute__((section(".data"))) = 0; +struct boot_params pvh_bootparams __attribute__((section(".data"))); + +struct hvm_start_info pvh_start_info; +unsigned int pvh_start_info_sz = sizeof(pvh_start_info); +#endif + static void clamp_max_cpus(void) { #ifdef CONFIG_SMP @@ -695,7 +711,7 @@ static void load_TLS_descriptor(struct thread_struct *t, *shadow = t->tls_array[i]; - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); mc = __xen_mc_entry(0); @@ -1138,10 +1154,11 @@ void xen_setup_vcpu_info_placement(void) xen_vcpu_setup(cpu); } - /* xen_vcpu_setup managed to place the vcpu_info within the - * percpu area for all cpus, so make use of it. Note that for - * PVH we want to use native IRQ mechanism. */ - if (have_vcpu_info_placement && !xen_pvh_domain()) { + /* + * xen_vcpu_setup managed to place the vcpu_info within the + * percpu area for all cpus, so make use of it. + */ + if (have_vcpu_info_placement) { pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); @@ -1413,49 +1430,9 @@ static void __init xen_boot_params_init_edd(void) * Set up the GDT and segment registers for -fstack-protector. Until * we do this, we have to be careful not to call any stack-protected * function, which is most of the kernel. - * - * Note, that it is __ref because the only caller of this after init - * is PVH which is not going to use xen_load_gdt_boot or other - * __init functions. */ -static void __ref xen_setup_gdt(int cpu) +static void xen_setup_gdt(int cpu) { - if (xen_feature(XENFEAT_auto_translated_physmap)) { -#ifdef CONFIG_X86_64 - unsigned long dummy; - - load_percpu_segment(cpu); /* We need to access per-cpu area */ - switch_to_new_gdt(cpu); /* GDT and GS set */ - - /* We are switching of the Xen provided GDT to our HVM mode - * GDT. The new GDT has __KERNEL_CS with CS.L = 1 - * and we are jumping to reload it. - */ - asm volatile ("pushq %0\n" - "leaq 1f(%%rip),%0\n" - "pushq %0\n" - "lretq\n" - "1:\n" - : "=&r" (dummy) : "0" (__KERNEL_CS)); - - /* - * While not needed, we also set the %es, %ds, and %fs - * to zero. We don't care about %ss as it is NULL. - * Strictly speaking this is not needed as Xen zeros those - * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE) - * - * Linux zeros them in cpu_init() and in secondary_startup_64 - * (for BSP). - */ - loadsegment(es, 0); - loadsegment(ds, 0); - loadsegment(fs, 0); -#else - /* PVH: TODO Implement. */ - BUG(); -#endif - return; /* PVH does not need any PV GDT ops. */ - } pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; pv_cpu_ops.load_gdt = xen_load_gdt_boot; @@ -1466,59 +1443,6 @@ static void __ref xen_setup_gdt(int cpu) pv_cpu_ops.load_gdt = xen_load_gdt; } -#ifdef CONFIG_XEN_PVH -/* - * A PV guest starts with default flags that are not set for PVH, set them - * here asap. - */ -static void xen_pvh_set_cr_flags(int cpu) -{ - - /* Some of these are setup in 'secondary_startup_64'. The others: - * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests - * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */ - write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM); - - if (!cpu) - return; - /* - * For BSP, PSE PGE are set in probe_page_size_mask(), for APs - * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu(). - */ - if (boot_cpu_has(X86_FEATURE_PSE)) - cr4_set_bits_and_update_boot(X86_CR4_PSE); - - if (boot_cpu_has(X86_FEATURE_PGE)) - cr4_set_bits_and_update_boot(X86_CR4_PGE); -} - -/* - * Note, that it is ref - because the only caller of this after init - * is PVH which is not going to use xen_load_gdt_boot or other - * __init functions. - */ -void __ref xen_pvh_secondary_vcpu_init(int cpu) -{ - xen_setup_gdt(cpu); - xen_pvh_set_cr_flags(cpu); -} - -static void __init xen_pvh_early_guest_init(void) -{ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - return; - - BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector)); - - xen_pvh_early_cpu_init(0, false); - xen_pvh_set_cr_flags(0); - -#ifdef CONFIG_X86_32 - BUG(); /* PVH: Implement proper support. */ -#endif -} -#endif /* CONFIG_XEN_PVH */ - static void __init xen_dom0_set_legacy_features(void) { x86_platform.legacy.rtc = 1; @@ -1555,24 +1479,17 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; xen_setup_features(); -#ifdef CONFIG_XEN_PVH - xen_pvh_early_guest_init(); -#endif + xen_setup_machphys_mapping(); /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; - if (!xen_pvh_domain()) { - pv_cpu_ops = xen_cpu_ops; + pv_cpu_ops = xen_cpu_ops; - x86_platform.get_nmi_reason = xen_get_nmi_reason; - } + x86_platform.get_nmi_reason = xen_get_nmi_reason; - if (xen_feature(XENFEAT_auto_translated_physmap)) - x86_init.resources.memory_setup = xen_auto_xlated_memory_setup; - else - x86_init.resources.memory_setup = xen_memory_setup; + x86_init.resources.memory_setup = xen_memory_setup; x86_init.oem.arch_setup = xen_arch_setup; x86_init.oem.banner = xen_banner; @@ -1665,24 +1582,20 @@ asmlinkage __visible void __init xen_start_kernel(void) /* set the limit of our address space */ xen_reserve_top(); - /* PVH: runs at default kernel iopl of 0 */ - if (!xen_pvh_domain()) { - /* - * We used to do this in xen_arch_setup, but that is too late - * on AMD were early_cpu_init (run before ->arch_setup()) calls - * early_amd_init which pokes 0xcf8 port. - */ - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - xen_raw_printk("physdev_op failed %d\n", rc); - } + /* + * We used to do this in xen_arch_setup, but that is too late + * on AMD were early_cpu_init (run before ->arch_setup()) calls + * early_amd_init which pokes 0xcf8 port. + */ + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + xen_raw_printk("physdev_op failed %d\n", rc); #ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); - new_cpu_data.wp_works_ok = 1; new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); #endif @@ -1758,6 +1671,100 @@ asmlinkage __visible void __init xen_start_kernel(void) #endif } +#ifdef CONFIG_XEN_PVH + +static void xen_pvh_arch_setup(void) +{ +#ifdef CONFIG_ACPI + /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ + if (nr_ioapics == 0) + acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; +#endif +} + +static void __init init_pvh_bootparams(void) +{ + struct xen_memory_map memmap; + unsigned int i; + int rc; + + memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); + + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc) { + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); + BUG(); + } + + if (memmap.nr_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) { + pvh_bootparams.e820_table[memmap.nr_entries].addr = + ISA_START_ADDRESS; + pvh_bootparams.e820_table[memmap.nr_entries].size = + ISA_END_ADDRESS - ISA_START_ADDRESS; + pvh_bootparams.e820_table[memmap.nr_entries].type = + E820_TYPE_RESERVED; + memmap.nr_entries++; + } else + xen_raw_printk("Warning: Can fit ISA range into e820\n"); + + pvh_bootparams.e820_entries = memmap.nr_entries; + for (i = 0; i < pvh_bootparams.e820_entries; i++) + e820__range_add(pvh_bootparams.e820_table[i].addr, + pvh_bootparams.e820_table[i].size, + pvh_bootparams.e820_table[i].type); + + e820__update_table(e820_table); + + pvh_bootparams.hdr.cmd_line_ptr = + pvh_start_info.cmdline_paddr; + + /* The first module is always ramdisk. */ + if (pvh_start_info.nr_modules) { + struct hvm_modlist_entry *modaddr = + __va(pvh_start_info.modlist_paddr); + pvh_bootparams.hdr.ramdisk_image = modaddr->paddr; + pvh_bootparams.hdr.ramdisk_size = modaddr->size; + } + + /* + * See Documentation/x86/boot.txt. + * + * Version 2.12 supports Xen entry point but we will use default x86/PC + * environment (i.e. hardware_subarch 0). + */ + pvh_bootparams.hdr.version = 0x212; + pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ +} + +/* + * This routine (and those that it might call) should not use + * anything that lives in .bss since that segment will be cleared later. + */ +void __init xen_prepare_pvh(void) +{ + u32 msr; + u64 pfn; + + if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { + xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", + pvh_start_info.magic); + BUG(); + } + + xen_pvh = 1; + + msr = cpuid_ebx(xen_cpuid_base() + 2); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + init_pvh_bootparams(); + + x86_init.oem.arch_setup = xen_pvh_arch_setup; +} +#endif + void __ref xen_hvm_init_shared_info(void) { int cpu; @@ -1797,20 +1804,29 @@ void __ref xen_hvm_init_shared_info(void) static void __init init_hvm_pv_info(void) { int major, minor; - uint32_t eax, ebx, ecx, edx, pages, msr, base; - u64 pfn; + uint32_t eax, ebx, ecx, edx, base; base = xen_cpuid_base(); - cpuid(base + 1, &eax, &ebx, &ecx, &edx); + eax = cpuid_eax(base + 1); major = eax >> 16; minor = eax & 0xffff; printk(KERN_INFO "Xen version %d.%d.\n", major, minor); - cpuid(base + 2, &pages, &msr, &ecx, &edx); + xen_domain_type = XEN_HVM_DOMAIN; - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + /* PVH set up hypercall page in xen_prepare_pvh(). */ + if (xen_pvh_domain()) + pv_info.name = "Xen PVH"; + else { + u64 pfn; + uint32_t msr; + + pv_info.name = "Xen HVM"; + msr = cpuid_ebx(base + 2); + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + } xen_setup_features(); @@ -1819,10 +1835,6 @@ static void __init init_hvm_pv_info(void) this_cpu_write(xen_vcpu_id, ebx); else this_cpu_write(xen_vcpu_id, smp_processor_id()); - - pv_info.name = "Xen HVM"; - - xen_domain_type = XEN_HVM_DOMAIN; } #endif @@ -1910,6 +1922,9 @@ static void __init xen_hvm_guest_init(void) x86_init.irqs.intr_init = xen_init_IRQ; xen_hvm_init_time_ops(); xen_hvm_init_mmu_ops(); + + if (xen_pvh_domain()) + machine_ops.emergency_restart = xen_emergency_restart; #ifdef CONFIG_KEXEC_CORE machine_ops.shutdown = xen_hvm_shutdown; machine_ops.crash_shutdown = xen_hvm_crash_shutdown; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 7d5afdb417cc..f226038a39ca 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -38,7 +38,7 @@ * * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 */ -#include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/highmem.h> #include <linux/debugfs.h> #include <linux/bug.h> @@ -58,7 +58,7 @@ #include <asm/mmu_context.h> #include <asm/setup.h> #include <asm/paravirt.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/linkage.h> #include <asm/page.h> #include <asm/init.h> @@ -535,40 +535,41 @@ static pgd_t *xen_get_user_pgd(pgd_t *pgd) return user_ptr; } -static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) { struct mmu_update u; u.ptr = virt_to_machine(ptr).maddr; - u.val = pgd_val_ma(val); + u.val = p4d_val_ma(val); xen_extend_mmu_update(&u); } /* - * Raw hypercall-based set_pgd, intended for in early boot before + * Raw hypercall-based set_p4d, intended for in early boot before * there's a page structure. This implies: * 1. The only existing pagetable is the kernel's * 2. It is always pinned * 3. It has no user pagetable attached to it */ -static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) { preempt_disable(); xen_mc_batch(); - __xen_set_pgd_hyper(ptr, val); + __xen_set_p4d_hyper(ptr, val); xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); } -static void xen_set_pgd(pgd_t *ptr, pgd_t val) +static void xen_set_p4d(p4d_t *ptr, p4d_t val) { - pgd_t *user_ptr = xen_get_user_pgd(ptr); + pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr); + pgd_t pgd_val; - trace_xen_mmu_set_pgd(ptr, user_ptr, val); + trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val); /* If page is not pinned, we can just update the entry directly */ @@ -576,7 +577,8 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) *ptr = val; if (user_ptr) { WARN_ON(xen_page_pinned(user_ptr)); - *user_ptr = val; + pgd_val.pgd = p4d_val_ma(val); + *user_ptr = pgd_val; } return; } @@ -585,14 +587,72 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) user updates together. */ xen_mc_batch(); - __xen_set_pgd_hyper(ptr, val); + __xen_set_p4d_hyper(ptr, val); if (user_ptr) - __xen_set_pgd_hyper(user_ptr, val); + __xen_set_p4d_hyper((p4d_t *)user_ptr, val); xen_mc_issue(PARAVIRT_LAZY_MMU); } #endif /* CONFIG_PGTABLE_LEVELS == 4 */ +static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD; + for (i = 0; i < nr; i++) { + if (!pmd_none(pmd[i])) + flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE); + } + return flush; +} + +static int xen_pud_walk(struct mm_struct *mm, pud_t *pud, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD; + for (i = 0; i < nr; i++) { + pmd_t *pmd; + + if (pud_none(pud[i])) + continue; + + pmd = pmd_offset(&pud[i], 0); + if (PTRS_PER_PMD > 1) + flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); + flush |= xen_pmd_walk(mm, pmd, func, + last && i == nr - 1, limit); + } + return flush; +} + +static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D; + for (i = 0; i < nr; i++) { + pud_t *pud; + + if (p4d_none(p4d[i])) + continue; + + pud = pud_offset(&p4d[i], 0); + if (PTRS_PER_PUD > 1) + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); + flush |= xen_pud_walk(mm, pud, func, + last && i == nr - 1, limit); + } + return flush; +} + /* * (Yet another) pagetable walker. This one is intended for pinning a * pagetable. This means that it walks a pagetable and calls the @@ -613,10 +673,8 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, enum pt_level), unsigned long limit) { - int flush = 0; + int i, nr, flush = 0; unsigned hole_low, hole_high; - unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; - unsigned pgdidx, pudidx, pmdidx; /* The limit is the last byte to be touched */ limit--; @@ -633,65 +691,22 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, hole_low = pgd_index(USER_LIMIT); hole_high = pgd_index(PAGE_OFFSET); - pgdidx_limit = pgd_index(limit); -#if PTRS_PER_PUD > 1 - pudidx_limit = pud_index(limit); -#else - pudidx_limit = 0; -#endif -#if PTRS_PER_PMD > 1 - pmdidx_limit = pmd_index(limit); -#else - pmdidx_limit = 0; -#endif - - for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { - pud_t *pud; + nr = pgd_index(limit) + 1; + for (i = 0; i < nr; i++) { + p4d_t *p4d; - if (pgdidx >= hole_low && pgdidx < hole_high) + if (i >= hole_low && i < hole_high) continue; - if (!pgd_val(pgd[pgdidx])) + if (pgd_none(pgd[i])) continue; - pud = pud_offset(&pgd[pgdidx], 0); - - if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(mm, virt_to_page(pud), PT_PUD); - - for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { - pmd_t *pmd; - - if (pgdidx == pgdidx_limit && - pudidx > pudidx_limit) - goto out; - - if (pud_none(pud[pudidx])) - continue; - - pmd = pmd_offset(&pud[pudidx], 0); - - if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); - - for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { - struct page *pte; - - if (pgdidx == pgdidx_limit && - pudidx == pudidx_limit && - pmdidx > pmdidx_limit) - goto out; - - if (pmd_none(pmd[pmdidx])) - continue; - - pte = pmd_page(pmd[pmdidx]); - flush |= (*func)(mm, pte, PT_PTE); - } - } + p4d = p4d_offset(&pgd[i], 0); + if (PTRS_PER_P4D > 1) + flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); + flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); } -out: /* Do the top level last, so that the callbacks can use it as a cue to do final things like tlb flushes. */ flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); @@ -1150,57 +1165,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) xen_free_ro_pages(pa, PAGE_SIZE); } +static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) +{ + unsigned long pa; + pte_t *pte_tbl; + int i; + + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PMD_SIZE); + return; + } + + pte_tbl = pte_offset_kernel(pmd, 0); + for (i = 0; i < PTRS_PER_PTE; i++) { + if (pte_none(pte_tbl[i])) + continue; + pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT; + xen_free_ro_pages(pa, PAGE_SIZE); + } + set_pmd(pmd, __pmd(0)); + xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin); +} + +static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) +{ + unsigned long pa; + pmd_t *pmd_tbl; + int i; + + if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PUD_SIZE); + return; + } + + pmd_tbl = pmd_offset(pud, 0); + for (i = 0; i < PTRS_PER_PMD; i++) { + if (pmd_none(pmd_tbl[i])) + continue; + xen_cleanmfnmap_pmd(pmd_tbl + i, unpin); + } + set_pud(pud, __pud(0)); + xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin); +} + +static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) +{ + unsigned long pa; + pud_t *pud_tbl; + int i; + + if (p4d_large(*p4d)) { + pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, P4D_SIZE); + return; + } + + pud_tbl = pud_offset(p4d, 0); + for (i = 0; i < PTRS_PER_PUD; i++) { + if (pud_none(pud_tbl[i])) + continue; + xen_cleanmfnmap_pud(pud_tbl + i, unpin); + } + set_p4d(p4d, __p4d(0)); + xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin); +} + /* * Since it is well isolated we can (and since it is perhaps large we should) * also free the page tables mapping the initial P->M table. */ static void __init xen_cleanmfnmap(unsigned long vaddr) { - unsigned long va = vaddr & PMD_MASK; - unsigned long pa; - pgd_t *pgd = pgd_offset_k(va); - pud_t *pud_page = pud_offset(pgd, 0); - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pgd_t *pgd; + p4d_t *p4d; unsigned int i; bool unpin; unpin = (vaddr == 2 * PGDIR_SIZE); - set_pgd(pgd, __pgd(0)); - do { - pud = pud_page + pud_index(va); - if (pud_none(*pud)) { - va += PUD_SIZE; - } else if (pud_large(*pud)) { - pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; - xen_free_ro_pages(pa, PUD_SIZE); - va += PUD_SIZE; - } else { - pmd = pmd_offset(pud, va); - if (pmd_large(*pmd)) { - pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; - xen_free_ro_pages(pa, PMD_SIZE); - } else if (!pmd_none(*pmd)) { - pte = pte_offset_kernel(pmd, va); - set_pmd(pmd, __pmd(0)); - for (i = 0; i < PTRS_PER_PTE; ++i) { - if (pte_none(pte[i])) - break; - pa = pte_pfn(pte[i]) << PAGE_SHIFT; - xen_free_ro_pages(pa, PAGE_SIZE); - } - xen_cleanmfnmap_free_pgtbl(pte, unpin); - } - va += PMD_SIZE; - if (pmd_index(va)) - continue; - set_pud(pud, __pud(0)); - xen_cleanmfnmap_free_pgtbl(pmd, unpin); - } - - } while (pud_index(va) || pmd_index(va)); - xen_cleanmfnmap_free_pgtbl(pud_page, unpin); + vaddr &= PMD_MASK; + pgd = pgd_offset_k(vaddr); + p4d = p4d_offset(pgd, 0); + for (i = 0; i < PTRS_PER_P4D; i++) { + if (p4d_none(p4d[i])) + continue; + xen_cleanmfnmap_p4d(p4d + i, unpin); + } + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_pgd(pgd, __pgd(0)); + xen_cleanmfnmap_free_pgtbl(p4d, unpin); + } } static void __init xen_pagetable_p2m_free(void) @@ -1538,7 +1593,6 @@ static int xen_pgd_alloc(struct mm_struct *mm) BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); } #endif - return ret; } @@ -1730,7 +1784,7 @@ static void xen_release_pmd(unsigned long pfn) xen_release_ptpage(pfn, PT_PMD); } -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PUD); @@ -1792,10 +1846,6 @@ static void __init set_page_prot_flags(void *addr, pgprot_t prot, unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - /* For PVH no need to set R/O or R/W to pin them or unpin them. */ - if (xen_feature(XENFEAT_auto_translated_physmap)) - return; - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) BUG(); } @@ -1902,8 +1952,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, * level2_ident_pgt, and level2_kernel_pgt. This means that only the * kernel has a physical mapping to start with - but that's enough to * get __va working. We need to fill in the rest of the physical - * mapping once some sort of allocator has been set up. NOTE: for - * PVH, the page tables are native. + * mapping once some sort of allocator has been set up. */ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { @@ -2076,21 +2125,27 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) */ void __init xen_relocate_p2m(void) { - phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; - int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; + int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; pte_t *pt; pmd_t *pmd; pud_t *pud; + p4d_t *p4d = NULL; pgd_t *pgd; unsigned long *new_p2m; + int save_pud; size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; - n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; - n_frames = n_pte + n_pt + n_pmd + n_pud; + n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; + if (PTRS_PER_P4D > 1) + n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; + else + n_p4d = 0; + n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; new_area = xen_find_free_area(PFN_PHYS(n_frames)); if (!new_area) { @@ -2106,55 +2161,76 @@ void __init xen_relocate_p2m(void) * To avoid any possible virtual address collision, just use * 2 * PUD_SIZE for the new area. */ - pud_phys = new_area; + p4d_phys = new_area; + pud_phys = p4d_phys + PFN_PHYS(n_p4d); pmd_phys = pud_phys + PFN_PHYS(n_pud); pt_phys = pmd_phys + PFN_PHYS(n_pmd); p2m_pfn = PFN_DOWN(pt_phys) + n_pt; pgd = __va(read_cr3()); new_p2m = (unsigned long *)(2 * PGDIR_SIZE); - for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { - pud = early_memremap(pud_phys, PAGE_SIZE); - clear_page(pud); - for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); - idx_pmd++) { - pmd = early_memremap(pmd_phys, PAGE_SIZE); - clear_page(pmd); - for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); - idx_pt++) { - pt = early_memremap(pt_phys, PAGE_SIZE); - clear_page(pt); - for (idx_pte = 0; - idx_pte < min(n_pte, PTRS_PER_PTE); - idx_pte++) { - set_pte(pt + idx_pte, - pfn_pte(p2m_pfn, PAGE_KERNEL)); - p2m_pfn++; + idx_p4d = 0; + save_pud = n_pud; + do { + if (n_p4d > 0) { + p4d = early_memremap(p4d_phys, PAGE_SIZE); + clear_page(p4d); + n_pud = min(save_pud, PTRS_PER_P4D); + } + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; + } + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; } - n_pte -= PTRS_PER_PTE; - early_memunmap(pt, PAGE_SIZE); - make_lowmem_page_readonly(__va(pt_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, - PFN_DOWN(pt_phys)); - set_pmd(pmd + idx_pt, - __pmd(_PAGE_TABLE | pt_phys)); - pt_phys += PAGE_SIZE; + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; } - n_pt -= PTRS_PER_PMD; - early_memunmap(pmd, PAGE_SIZE); - make_lowmem_page_readonly(__va(pmd_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, - PFN_DOWN(pmd_phys)); - set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); - pmd_phys += PAGE_SIZE; + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + if (n_p4d > 0) + set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); + else + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; } - n_pmd -= PTRS_PER_PUD; - early_memunmap(pud, PAGE_SIZE); - make_lowmem_page_readonly(__va(pud_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); - set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); - pud_phys += PAGE_SIZE; - } + if (n_p4d > 0) { + save_pud -= PTRS_PER_P4D; + early_memunmap(p4d, PAGE_SIZE); + make_lowmem_page_readonly(__va(p4d_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); + set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); + p4d_phys += PAGE_SIZE; + } + } while (++idx_p4d < n_p4d); /* Now copy the old p2m info to the new area. */ memcpy(new_p2m, xen_p2m_addr, size); @@ -2331,6 +2407,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: + case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: /* All local page mappings */ pte = pfn_pte(phys, prot); break; @@ -2383,8 +2460,8 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; -#if CONFIG_PGTABLE_LEVELS == 4 - pv_mmu_ops.set_pgd = xen_set_pgd; +#if CONFIG_PGTABLE_LEVELS >= 4 + pv_mmu_ops.set_p4d = xen_set_p4d; #endif /* This will work as long as patching hasn't happened yet @@ -2393,7 +2470,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.alloc_pmd = xen_alloc_pmd; pv_mmu_ops.release_pte = xen_release_pte; pv_mmu_ops.release_pmd = xen_release_pmd; -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 pv_mmu_ops.alloc_pud = xen_alloc_pud; pv_mmu_ops.release_pud = xen_release_pud; #endif @@ -2459,10 +2536,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 .pud_val = PV_CALLEE_SAVE(xen_pud_val), .make_pud = PV_CALLEE_SAVE(xen_make_pud), - .set_pgd = xen_set_pgd_hyper, + .set_p4d = xen_set_p4d_hyper, .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, @@ -2812,16 +2889,6 @@ static int do_remap_gfn(struct vm_area_struct *vma, BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO))); - if (xen_feature(XENFEAT_auto_translated_physmap)) { -#ifdef CONFIG_XEN_PVH - /* We need to update the local page tables and the xen HAP */ - return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr, - prot, domid, pages); -#else - return -EINVAL; -#endif - } - rmd.mfn = gfn; rmd.prot = prot; /* We use the err_ptr to indicate if there we are doing a contiguous @@ -2915,10 +2982,6 @@ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma, if (!pages || !xen_feature(XENFEAT_auto_translated_physmap)) return 0; -#ifdef CONFIG_XEN_PVH - return xen_xlate_unmap_gfn_range(vma, numpgs, pages); -#else return -EINVAL; -#endif } EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range); diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 73809bb951b4..3fe2b3292915 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -5,6 +5,7 @@ enum pt_level { PT_PGD, + PT_P4D, PT_PUD, PT_PMD, PT_PTE diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index a9fafb5c8738..42b08f8fc2ca 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -18,7 +18,7 @@ int xen_swiotlb __read_mostly; -static struct dma_map_ops xen_swiotlb_dma_ops = { +static const struct dma_map_ops xen_swiotlb_dma_ops = { .alloc = xen_swiotlb_alloc_coherent, .free = xen_swiotlb_free_coherent, .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, @@ -48,7 +48,7 @@ int __init pci_xen_swiotlb_detect(void) * activate this IOMMU. If running as PV privileged, activate it * irregardless. */ - if ((xen_initial_domain() || swiotlb || swiotlb_force)) + if (xen_initial_domain() || swiotlb || swiotlb_force == SWIOTLB_FORCE) xen_swiotlb = 1; /* If we are running under Xen, we MUST disable the native SWIOTLB. diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 90d1b83cf35f..33a783c77d96 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -73,8 +73,8 @@ bool xen_has_pv_devices(void) if (!xen_domain()) return false; - /* PV domains always have them. */ - if (xen_pv_domain()) + /* PV and PVH domains always have them. */ + if (xen_pv_domain() || xen_pvh_domain()) return true; /* And user has xen_platform_pci=0 set in guest config as diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 8c394e30e5fe..a5bf7c451435 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -14,7 +14,7 @@ #include <asm/elf.h> #include <asm/vdso.h> -#include <asm/e820.h> +#include <asm/e820/api.h> #include <asm/setup.h> #include <asm/acpi.h> #include <asm/numa.h> @@ -41,8 +41,7 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; unsigned long xen_released_pages; /* E820 map used during setting up memory. */ -static struct e820entry xen_e820_map[E820_X_MAX] __initdata; -static u32 xen_e820_map_entries __initdata; +static struct e820_table xen_e820_table __initdata; /* * Buffer used to remap identity mapped pages. We only need the virtual space. @@ -198,15 +197,15 @@ void __init xen_inv_extra_mem(void) */ static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn) { - const struct e820entry *entry = xen_e820_map; + const struct e820_entry *entry = xen_e820_table.entries; unsigned int i; unsigned long done = 0; - for (i = 0; i < xen_e820_map_entries; i++, entry++) { + for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) { unsigned long s_pfn; unsigned long e_pfn; - if (entry->type != E820_RAM) + if (entry->type != E820_TYPE_RAM) continue; e_pfn = PFN_DOWN(entry->addr + entry->size); @@ -457,7 +456,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, { phys_addr_t start = 0; unsigned long ret_val = 0; - const struct e820entry *entry = xen_e820_map; + const struct e820_entry *entry = xen_e820_table.entries; int i; /* @@ -471,13 +470,13 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, * example) the DMI tables in a reserved region that begins on * a non-page boundary. */ - for (i = 0; i < xen_e820_map_entries; i++, entry++) { + for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) { phys_addr_t end = entry->addr + entry->size; - if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) { + if (entry->type == E820_TYPE_RAM || i == xen_e820_table.nr_entries - 1) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); - if (entry->type == E820_RAM) + if (entry->type == E820_TYPE_RAM) end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) @@ -591,28 +590,28 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start, phys_addr_t end = start + size; /* Align RAM regions to page boundaries. */ - if (type == E820_RAM) { + if (type == E820_TYPE_RAM) { start = PAGE_ALIGN(start); end &= ~((phys_addr_t)PAGE_SIZE - 1); } - e820_add_region(start, end - start, type); + e820__range_add(start, end - start, type); } static void __init xen_ignore_unusable(void) { - struct e820entry *entry = xen_e820_map; + struct e820_entry *entry = xen_e820_table.entries; unsigned int i; - for (i = 0; i < xen_e820_map_entries; i++, entry++) { - if (entry->type == E820_UNUSABLE) - entry->type = E820_RAM; + for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) { + if (entry->type == E820_TYPE_UNUSABLE) + entry->type = E820_TYPE_RAM; } } bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) { - struct e820entry *entry; + struct e820_entry *entry; unsigned mapcnt; phys_addr_t end; @@ -620,10 +619,10 @@ bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) return false; end = start + size; - entry = xen_e820_map; + entry = xen_e820_table.entries; - for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) { - if (entry->type == E820_RAM && entry->addr <= start && + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + if (entry->type == E820_TYPE_RAM && entry->addr <= start && (entry->addr + entry->size) >= end) return false; @@ -645,10 +644,10 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) { unsigned mapcnt; phys_addr_t addr, start; - struct e820entry *entry = xen_e820_map; + struct e820_entry *entry = xen_e820_table.entries; - for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) { - if (entry->type != E820_RAM || entry->size < size) + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++, entry++) { + if (entry->type != E820_TYPE_RAM || entry->size < size) continue; start = entry->addr; for (addr = start; addr < start + size; addr += PAGE_SIZE) { @@ -713,10 +712,9 @@ static void __init xen_reserve_xen_mfnlist(void) size = PFN_PHYS(xen_start_info->nr_p2m_frames); } - if (!xen_is_e820_reserved(start, size)) { - memblock_reserve(start, size); + memblock_reserve(start, size); + if (!xen_is_e820_reserved(start, size)) return; - } #ifdef CONFIG_X86_32 /* @@ -727,6 +725,7 @@ static void __init xen_reserve_xen_mfnlist(void) BUG(); #else xen_relocate_p2m(); + memblock_free(start, size); #endif } @@ -750,8 +749,8 @@ char * __init xen_memory_setup(void) max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); - memmap.nr_entries = ARRAY_SIZE(xen_e820_map); - set_xen_guest_handle(memmap.buffer, xen_e820_map); + memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries); + set_xen_guest_handle(memmap.buffer, xen_e820_table.entries); op = xen_initial_domain() ? XENMEM_machine_memory_map : @@ -760,16 +759,16 @@ char * __init xen_memory_setup(void) if (rc == -ENOSYS) { BUG_ON(xen_initial_domain()); memmap.nr_entries = 1; - xen_e820_map[0].addr = 0ULL; - xen_e820_map[0].size = mem_end; + xen_e820_table.entries[0].addr = 0ULL; + xen_e820_table.entries[0].size = mem_end; /* 8MB slack (to balance backend allocations). */ - xen_e820_map[0].size += 8ULL << 20; - xen_e820_map[0].type = E820_RAM; + xen_e820_table.entries[0].size += 8ULL << 20; + xen_e820_table.entries[0].type = E820_TYPE_RAM; rc = 0; } BUG_ON(rc); BUG_ON(memmap.nr_entries == 0); - xen_e820_map_entries = memmap.nr_entries; + xen_e820_table.nr_entries = memmap.nr_entries; /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE @@ -783,8 +782,7 @@ char * __init xen_memory_setup(void) xen_ignore_unusable(); /* Make sure the Xen-supplied memory map is well-ordered. */ - sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), - &xen_e820_map_entries); + e820__update_table(&xen_e820_table); max_pages = xen_get_max_pages(); @@ -811,15 +809,15 @@ char * __init xen_memory_setup(void) extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages, max_pages - max_pfn); i = 0; - addr = xen_e820_map[0].addr; - size = xen_e820_map[0].size; - while (i < xen_e820_map_entries) { + addr = xen_e820_table.entries[0].addr; + size = xen_e820_table.entries[0].size; + while (i < xen_e820_table.nr_entries) { bool discard = false; chunk_size = size; - type = xen_e820_map[i].type; + type = xen_e820_table.entries[i].type; - if (type == E820_RAM) { + if (type == E820_TYPE_RAM) { if (addr < mem_end) { chunk_size = min(size, mem_end - addr); } else if (extra_pages) { @@ -840,9 +838,9 @@ char * __init xen_memory_setup(void) size -= chunk_size; if (size == 0) { i++; - if (i < xen_e820_map_entries) { - addr = xen_e820_map[i].addr; - size = xen_e820_map[i].size; + if (i < xen_e820_table.nr_entries) { + addr = xen_e820_table.entries[i].addr; + size = xen_e820_table.entries[i].size; } } } @@ -858,10 +856,9 @@ char * __init xen_memory_setup(void) * reserve ISA memory anyway because too many things poke * about in there. */ - e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, - E820_RESERVED); + e820__range_add(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_TYPE_RESERVED); - sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map); + e820__update_table(e820_table); /* * Check whether the kernel itself conflicts with the target E820 map. @@ -923,21 +920,19 @@ char * __init xen_auto_xlated_memory_setup(void) int i; int rc; - memmap.nr_entries = ARRAY_SIZE(xen_e820_map); - set_xen_guest_handle(memmap.buffer, xen_e820_map); + memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries); + set_xen_guest_handle(memmap.buffer, xen_e820_table.entries); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); if (rc < 0) panic("No memory map (%d)\n", rc); - xen_e820_map_entries = memmap.nr_entries; + xen_e820_table.nr_entries = memmap.nr_entries; - sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map), - &xen_e820_map_entries); + e820__update_table(&xen_e820_table); - for (i = 0; i < xen_e820_map_entries; i++) - e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size, - xen_e820_map[i].type); + for (i = 0; i < xen_e820_table.nr_entries; i++) + e820__range_add(xen_e820_table.entries[i].addr, xen_e820_table.entries[i].size, xen_e820_table.entries[i].type); /* Remove p2m info, it is not needed. */ xen_start_info->mfn_list = 0; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 311acad7dad2..eaa36162ed4a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -18,6 +18,7 @@ #include <linux/smp.h> #include <linux/irq_work.h> #include <linux/tick.h> +#include <linux/nmi.h> #include <asm/paravirt.h> #include <asm/desc.h> @@ -99,18 +100,8 @@ static void cpu_bringup(void) local_irq_enable(); } -/* - * Note: cpu parameter is only relevant for PVH. The reason for passing it - * is we can't do smp_processor_id until the percpu segments are loaded, for - * which we need the cpu number! So we pass it in rdi as first parameter. - */ -asmlinkage __visible void cpu_bringup_and_idle(int cpu) +asmlinkage __visible void cpu_bringup_and_idle(void) { -#ifdef CONFIG_XEN_PVH - if (xen_feature(XENFEAT_auto_translated_physmap) && - xen_feature(XENFEAT_supervisor_mode_kernel)) - xen_pvh_secondary_vcpu_init(cpu); -#endif cpu_bringup(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } @@ -401,64 +392,50 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) if (ctxt == NULL) return -ENOMEM; - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); #ifdef CONFIG_X86_32 - /* Note: PVH is not yet supported on x86_32. */ ctxt->user_regs.fs = __KERNEL_PERCPU; ctxt->user_regs.gs = __KERNEL_STACK_CANARY; #endif memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ - ctxt->user_regs.ds = __USER_DS; - ctxt->user_regs.es = __USER_DS; - ctxt->user_regs.ss = __KERNEL_DS; + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; + ctxt->flags = VGCF_IN_KERNEL; + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ + ctxt->user_regs.ds = __USER_DS; + ctxt->user_regs.es = __USER_DS; + ctxt->user_regs.ss = __KERNEL_DS; - xen_copy_trap_info(ctxt->trap_ctxt); + xen_copy_trap_info(ctxt->trap_ctxt); - ctxt->ldt_ents = 0; + ctxt->ldt_ents = 0; - BUG_ON((unsigned long)gdt & ~PAGE_MASK); + BUG_ON((unsigned long)gdt & ~PAGE_MASK); - gdt_mfn = arbitrary_virt_to_mfn(gdt); - make_lowmem_page_readonly(gdt); - make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); + gdt_mfn = arbitrary_virt_to_mfn(gdt); + make_lowmem_page_readonly(gdt); + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); - ctxt->gdt_frames[0] = gdt_mfn; - ctxt->gdt_ents = GDT_ENTRIES; + ctxt->gdt_frames[0] = gdt_mfn; + ctxt->gdt_ents = GDT_ENTRIES; - ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.sp0; + ctxt->kernel_ss = __KERNEL_DS; + ctxt->kernel_sp = idle->thread.sp0; #ifdef CONFIG_X86_32 - ctxt->event_callback_cs = __KERNEL_CS; - ctxt->failsafe_callback_cs = __KERNEL_CS; + ctxt->event_callback_cs = __KERNEL_CS; + ctxt->failsafe_callback_cs = __KERNEL_CS; #else - ctxt->gs_base_kernel = per_cpu_offset(cpu); -#endif - ctxt->event_callback_eip = - (unsigned long)xen_hypervisor_callback; - ctxt->failsafe_callback_eip = - (unsigned long)xen_failsafe_callback; - ctxt->user_regs.cs = __KERNEL_CS; - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); - } -#ifdef CONFIG_XEN_PVH - else { - /* - * The vcpu comes on kernel page tables which have the NX pte - * bit set. This means before DS/SS is touched, NX in - * EFER must be set. Hence the following assembly glue code. - */ - ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init; - ctxt->user_regs.rdi = cpu; - ctxt->user_regs.rsi = true; /* entry == true */ - } + ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif + ctxt->event_callback_eip = + (unsigned long)xen_hypervisor_callback; + ctxt->failsafe_callback_eip = + (unsigned long)xen_failsafe_callback; + ctxt->user_regs.cs = __KERNEL_CS; + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); + ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h index c5c16dc4f694..9beef333584a 100644 --- a/arch/x86/xen/smp.h +++ b/arch/x86/xen/smp.h @@ -21,12 +21,4 @@ static inline int xen_smp_intr_init(unsigned int cpu) static inline void xen_smp_intr_free(unsigned int cpu) {} #endif /* CONFIG_SMP */ -#ifdef CONFIG_XEN_PVH -extern void xen_pvh_early_cpu_init(int cpu, bool entry); -#else -static inline void xen_pvh_early_cpu_init(int cpu, bool entry) -{ -} -#endif - #endif diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index e8a9ea7d7a21..25a7c4302ce7 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -141,25 +141,6 @@ void __init xen_init_spinlocks(void) pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); } -/* - * While the jump_label init code needs to happend _after_ the jump labels are - * enabled and before SMP is started. Hence we use pre-SMP initcall level - * init. We cannot do it in xen_init_spinlocks as that is done before - * jump labels are activated. - */ -static __init int xen_init_spinlocks_jump(void) -{ - if (!xen_pvspin) - return 0; - - if (!xen_domain()) - return 0; - - static_key_slow_inc(¶virt_ticketlocks_enabled); - return 0; -} -early_initcall(xen_init_spinlocks_jump); - static __init int xen_parse_nopvspin(char *arg) { xen_pvspin = false; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 1e69956d7852..7a3089285c59 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -209,7 +209,9 @@ static const struct clock_event_device xen_timerop_clockevent = { .features = CLOCK_EVT_FEAT_ONESHOT, .max_delta_ns = 0xffffffff, + .max_delta_ticks = 0xffffffff, .min_delta_ns = TIMER_SLOP, + .min_delta_ticks = TIMER_SLOP, .mult = 1, .shift = 0, @@ -268,7 +270,9 @@ static const struct clock_event_device xen_vcpuop_clockevent = { .features = CLOCK_EVT_FEAT_ONESHOT, .max_delta_ns = 0xffffffff, + .max_delta_ticks = 0xffffffff, .min_delta_ns = TIMER_SLOP, + .min_delta_ticks = TIMER_SLOP, .mult = 1, .shift = 0, diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7f8d8abf4c1a..37794e42b67d 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -16,25 +16,6 @@ #include <xen/interface/xen-mca.h> #include <asm/xen/interface.h> -#ifdef CONFIG_XEN_PVH -#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel" -/* Note the lack of 'hvm_callback_vector'. Older hypervisor will - * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in - * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore. - */ -#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \ - (1 << XENFEAT_auto_translated_physmap) | \ - (1 << XENFEAT_supervisor_mode_kernel) | \ - (1 << XENFEAT_hvm_callback_vector)) -/* The XENFEAT_writable_page_tables is not stricly necessary as we set that - * up regardless whether this CONFIG option is enabled or not, but it - * clarifies what the right flags need to be. - */ -#else -#define PVH_FEATURES_STR "" -#define PVH_FEATURES (0) -#endif - __INIT ENTRY(startup_xen) cld @@ -54,41 +35,6 @@ ENTRY(startup_xen) __FINIT -#ifdef CONFIG_XEN_PVH -/* - * xen_pvh_early_cpu_init() - early PVH VCPU initialization - * @cpu: this cpu number (%rdi) - * @entry: true if this is a secondary vcpu coming up on this entry - * point, false if this is the boot CPU being initialized for - * the first time (%rsi) - * - * Note: This is called as a function on the boot CPU, and is the entry point - * on the secondary CPU. - */ -ENTRY(xen_pvh_early_cpu_init) - mov %rsi, %r11 - - /* Gather features to see if NX implemented. */ - mov $0x80000001, %eax - cpuid - mov %edx, %esi - - mov $MSR_EFER, %ecx - rdmsr - bts $_EFER_SCE, %eax - - bt $20, %esi - jnc 1f /* No NX, skip setting it */ - bts $_EFER_NX, %eax -1: wrmsr -#ifdef CONFIG_SMP - cmp $0, %r11b - jne cpu_bringup_and_idle -#endif - ret - -#endif /* CONFIG_XEN_PVH */ - .pushsection .text .balign PAGE_SIZE ENTRY(hypercall_page) @@ -114,10 +60,10 @@ ENTRY(hypercall_page) #endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR) - ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) | - (1 << XENFEAT_writable_page_tables) | - (1 << XENFEAT_dom0)) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, + .ascii "!writable_page_tables|pae_pgdir_above_4gb") + ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, + .long (1 << XENFEAT_writable_page_tables) | (1 << XENFEAT_dom0)) ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index ac0a2b0f9e62..f6a41c41ebc7 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -146,5 +146,4 @@ __visible void xen_adjust_exception_frame(void); extern int xen_panic_handler_init(void); -void xen_pvh_secondary_vcpu_init(int cpu); #endif /* XEN_OPS_H */ diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S new file mode 100644 index 000000000000..5e246716d58f --- /dev/null +++ b/arch/x86/xen/xen-pvh.S @@ -0,0 +1,161 @@ +/* + * Copyright C 2016, Oracle and/or its affiliates. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + .code32 + .text +#define _pa(x) ((x) - __START_KERNEL_map) + +#include <linux/elfnote.h> +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/segment.h> +#include <asm/asm.h> +#include <asm/boot.h> +#include <asm/processor-flags.h> +#include <asm/msr.h> +#include <xen/interface/elfnote.h> + + __HEAD + +/* + * Entry point for PVH guests. + * + * Xen ABI specifies the following register state when we come here: + * + * - `ebx`: contains the physical memory address where the loader has placed + * the boot start info structure. + * - `cr0`: bit 0 (PE) must be set. All the other writeable bits are cleared. + * - `cr4`: all bits are cleared. + * - `cs `: must be a 32-bit read/execute code segment with a base of ‘0’ + * and a limit of ‘0xFFFFFFFF’. The selector value is unspecified. + * - `ds`, `es`: must be a 32-bit read/write data segment with a base of + * ‘0’ and a limit of ‘0xFFFFFFFF’. The selector values are all + * unspecified. + * - `tr`: must be a 32-bit TSS (active) with a base of '0' and a limit + * of '0x67'. + * - `eflags`: bit 17 (VM) must be cleared. Bit 9 (IF) must be cleared. + * Bit 8 (TF) must be cleared. Other bits are all unspecified. + * + * All other processor registers and flag bits are unspecified. The OS is in + * charge of setting up it's own stack, GDT and IDT. + */ + +ENTRY(pvh_start_xen) + cld + + lgdt (_pa(gdt)) + + mov $(__BOOT_DS),%eax + mov %eax,%ds + mov %eax,%es + mov %eax,%ss + + /* Stash hvm_start_info. */ + mov $_pa(pvh_start_info), %edi + mov %ebx, %esi + mov _pa(pvh_start_info_sz), %ecx + shr $2,%ecx + rep + movsl + + mov $_pa(early_stack_end), %esp + + /* Enable PAE mode. */ + mov %cr4, %eax + orl $X86_CR4_PAE, %eax + mov %eax, %cr4 + +#ifdef CONFIG_X86_64 + /* Enable Long mode. */ + mov $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Enable pre-constructed page tables. */ + mov $_pa(init_level4_pgt), %eax + mov %eax, %cr3 + mov $(X86_CR0_PG | X86_CR0_PE), %eax + mov %eax, %cr0 + + /* Jump to 64-bit mode. */ + ljmp $__KERNEL_CS, $_pa(1f) + + /* 64-bit entry point. */ + .code64 +1: + call xen_prepare_pvh + + /* startup_64 expects boot_params in %rsi. */ + mov $_pa(pvh_bootparams), %rsi + mov $_pa(startup_64), %rax + jmp *%rax + +#else /* CONFIG_X86_64 */ + + call mk_early_pgtbl_32 + + mov $_pa(initial_page_table), %eax + mov %eax, %cr3 + + mov %cr0, %eax + or $(X86_CR0_PG | X86_CR0_PE), %eax + mov %eax, %cr0 + + ljmp $__BOOT_CS, $1f +1: + call xen_prepare_pvh + mov $_pa(pvh_bootparams), %esi + + /* startup_32 doesn't expect paging and PAE to be on. */ + ljmp $__BOOT_CS, $_pa(2f) +2: + mov %cr0, %eax + and $~X86_CR0_PG, %eax + mov %eax, %cr0 + mov %cr4, %eax + and $~X86_CR4_PAE, %eax + mov %eax, %cr4 + + ljmp $__BOOT_CS, $_pa(startup_32) +#endif +END(pvh_start_xen) + + .section ".init.data","aw" + .balign 8 +gdt: + .word gdt_end - gdt_start + .long _pa(gdt_start) + .word 0 +gdt_start: + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x0000000000000000 /* reserved */ +#ifdef CONFIG_X86_64 + .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* __KERNEL_CS */ +#else + .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* __KERNEL_CS */ +#endif + .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* __KERNEL_DS */ +gdt_end: + + .balign 4 +early_stack: + .fill 256, 1, 0 +early_stack_end: + + ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, + _ASM_PTR (pvh_start_xen - __START_KERNEL_map)) |