summaryrefslogtreecommitdiff
path: root/arch/x86/xen
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/Kconfig2
-rw-r--r--arch/x86/xen/Makefile1
-rw-r--r--arch/x86/xen/apic.c2
-rw-r--r--arch/x86/xen/enlighten.c281
-rw-r--r--arch/x86/xen/mmu.c423
-rw-r--r--arch/x86/xen/mmu.h1
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c4
-rw-r--r--arch/x86/xen/platform-pci-unplug.c4
-rw-r--r--arch/x86/xen/setup.c103
-rw-r--r--arch/x86/xen/smp.c81
-rw-r--r--arch/x86/xen/smp.h8
-rw-r--r--arch/x86/xen/spinlock.c19
-rw-r--r--arch/x86/xen/time.c4
-rw-r--r--arch/x86/xen/xen-head.S62
-rw-r--r--arch/x86/xen/xen-ops.h1
-rw-r--r--arch/x86/xen/xen-pvh.S161
16 files changed, 646 insertions, 511 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c7b15f3e2cf3..76b6dbd627df 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -53,5 +53,5 @@ config XEN_DEBUG_FS
config XEN_PVH
bool "Support for running as a PVH guest"
- depends on X86_64 && XEN && XEN_PVHVM
+ depends on XEN && XEN_PVHVM && ACPI
def_bool n
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index e47e52787d32..cb0164aee156 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o
obj-$(CONFIG_XEN_DOM0) += vga.o
obj-$(CONFIG_SWIOTLB_XEN) += pci-swiotlb-xen.o
obj-$(CONFIG_XEN_EFI) += efi.o
+obj-$(CONFIG_XEN_PVH) += xen-pvh.o
diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 44c88ad1841a..bcea81f36fc5 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -145,7 +145,7 @@ static void xen_silent_inquire(int apicid)
static int xen_cpu_present_to_apicid(int cpu)
{
if (cpu_present(cpu))
- return xen_get_apic_id(xen_apic_read(APIC_ID));
+ return cpu_data(cpu).apicid;
else
return BAD_APICID;
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 51ef95232725..30822e8e64ac 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -45,6 +45,7 @@
#include <xen/interface/memory.h>
#include <xen/interface/nmi.h>
#include <xen/interface/xen-mca.h>
+#include <xen/interface/hvm/start_info.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvm.h>
@@ -75,6 +76,7 @@
#include <asm/mwait.h>
#include <asm/pci_x86.h>
#include <asm/cpu.h>
+#include <asm/e820/api.h>
#ifdef CONFIG_ACPI
#include <linux/acpi.h>
@@ -176,6 +178,20 @@ struct tls_descs {
*/
static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+#ifdef CONFIG_XEN_PVH
+/*
+ * PVH variables.
+ *
+ * xen_pvh and pvh_bootparams need to live in data segment since they
+ * are used after startup_{32|64}, which clear .bss, are invoked.
+ */
+bool xen_pvh __attribute__((section(".data"))) = 0;
+struct boot_params pvh_bootparams __attribute__((section(".data")));
+
+struct hvm_start_info pvh_start_info;
+unsigned int pvh_start_info_sz = sizeof(pvh_start_info);
+#endif
+
static void clamp_max_cpus(void)
{
#ifdef CONFIG_SMP
@@ -695,7 +711,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
*shadow = t->tls_array[i];
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
mc = __xen_mc_entry(0);
@@ -1138,10 +1154,11 @@ void xen_setup_vcpu_info_placement(void)
xen_vcpu_setup(cpu);
}
- /* xen_vcpu_setup managed to place the vcpu_info within the
- * percpu area for all cpus, so make use of it. Note that for
- * PVH we want to use native IRQ mechanism. */
- if (have_vcpu_info_placement && !xen_pvh_domain()) {
+ /*
+ * xen_vcpu_setup managed to place the vcpu_info within the
+ * percpu area for all cpus, so make use of it.
+ */
+ if (have_vcpu_info_placement) {
pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -1413,49 +1430,9 @@ static void __init xen_boot_params_init_edd(void)
* Set up the GDT and segment registers for -fstack-protector. Until
* we do this, we have to be careful not to call any stack-protected
* function, which is most of the kernel.
- *
- * Note, that it is __ref because the only caller of this after init
- * is PVH which is not going to use xen_load_gdt_boot or other
- * __init functions.
*/
-static void __ref xen_setup_gdt(int cpu)
+static void xen_setup_gdt(int cpu)
{
- if (xen_feature(XENFEAT_auto_translated_physmap)) {
-#ifdef CONFIG_X86_64
- unsigned long dummy;
-
- load_percpu_segment(cpu); /* We need to access per-cpu area */
- switch_to_new_gdt(cpu); /* GDT and GS set */
-
- /* We are switching of the Xen provided GDT to our HVM mode
- * GDT. The new GDT has __KERNEL_CS with CS.L = 1
- * and we are jumping to reload it.
- */
- asm volatile ("pushq %0\n"
- "leaq 1f(%%rip),%0\n"
- "pushq %0\n"
- "lretq\n"
- "1:\n"
- : "=&r" (dummy) : "0" (__KERNEL_CS));
-
- /*
- * While not needed, we also set the %es, %ds, and %fs
- * to zero. We don't care about %ss as it is NULL.
- * Strictly speaking this is not needed as Xen zeros those
- * out (and also MSR_FS_BASE, MSR_GS_BASE, MSR_KERNEL_GS_BASE)
- *
- * Linux zeros them in cpu_init() and in secondary_startup_64
- * (for BSP).
- */
- loadsegment(es, 0);
- loadsegment(ds, 0);
- loadsegment(fs, 0);
-#else
- /* PVH: TODO Implement. */
- BUG();
-#endif
- return; /* PVH does not need any PV GDT ops. */
- }
pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
pv_cpu_ops.load_gdt = xen_load_gdt_boot;
@@ -1466,59 +1443,6 @@ static void __ref xen_setup_gdt(int cpu)
pv_cpu_ops.load_gdt = xen_load_gdt;
}
-#ifdef CONFIG_XEN_PVH
-/*
- * A PV guest starts with default flags that are not set for PVH, set them
- * here asap.
- */
-static void xen_pvh_set_cr_flags(int cpu)
-{
-
- /* Some of these are setup in 'secondary_startup_64'. The others:
- * X86_CR0_TS, X86_CR0_PE, X86_CR0_ET are set by Xen for HVM guests
- * (which PVH shared codepaths), while X86_CR0_PG is for PVH. */
- write_cr0(read_cr0() | X86_CR0_MP | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM);
-
- if (!cpu)
- return;
- /*
- * For BSP, PSE PGE are set in probe_page_size_mask(), for APs
- * set them here. For all, OSFXSR OSXMMEXCPT are set in fpu__init_cpu().
- */
- if (boot_cpu_has(X86_FEATURE_PSE))
- cr4_set_bits_and_update_boot(X86_CR4_PSE);
-
- if (boot_cpu_has(X86_FEATURE_PGE))
- cr4_set_bits_and_update_boot(X86_CR4_PGE);
-}
-
-/*
- * Note, that it is ref - because the only caller of this after init
- * is PVH which is not going to use xen_load_gdt_boot or other
- * __init functions.
- */
-void __ref xen_pvh_secondary_vcpu_init(int cpu)
-{
- xen_setup_gdt(cpu);
- xen_pvh_set_cr_flags(cpu);
-}
-
-static void __init xen_pvh_early_guest_init(void)
-{
- if (!xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
- BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector));
-
- xen_pvh_early_cpu_init(0, false);
- xen_pvh_set_cr_flags(0);
-
-#ifdef CONFIG_X86_32
- BUG(); /* PVH: Implement proper support. */
-#endif
-}
-#endif /* CONFIG_XEN_PVH */
-
static void __init xen_dom0_set_legacy_features(void)
{
x86_platform.legacy.rtc = 1;
@@ -1555,24 +1479,17 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_domain_type = XEN_PV_DOMAIN;
xen_setup_features();
-#ifdef CONFIG_XEN_PVH
- xen_pvh_early_guest_init();
-#endif
+
xen_setup_machphys_mapping();
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
- if (!xen_pvh_domain()) {
- pv_cpu_ops = xen_cpu_ops;
+ pv_cpu_ops = xen_cpu_ops;
- x86_platform.get_nmi_reason = xen_get_nmi_reason;
- }
+ x86_platform.get_nmi_reason = xen_get_nmi_reason;
- if (xen_feature(XENFEAT_auto_translated_physmap))
- x86_init.resources.memory_setup = xen_auto_xlated_memory_setup;
- else
- x86_init.resources.memory_setup = xen_memory_setup;
+ x86_init.resources.memory_setup = xen_memory_setup;
x86_init.oem.arch_setup = xen_arch_setup;
x86_init.oem.banner = xen_banner;
@@ -1665,24 +1582,20 @@ asmlinkage __visible void __init xen_start_kernel(void)
/* set the limit of our address space */
xen_reserve_top();
- /* PVH: runs at default kernel iopl of 0 */
- if (!xen_pvh_domain()) {
- /*
- * We used to do this in xen_arch_setup, but that is too late
- * on AMD were early_cpu_init (run before ->arch_setup()) calls
- * early_amd_init which pokes 0xcf8 port.
- */
- set_iopl.iopl = 1;
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
- if (rc != 0)
- xen_raw_printk("physdev_op failed %d\n", rc);
- }
+ /*
+ * We used to do this in xen_arch_setup, but that is too late
+ * on AMD where early_cpu_init (run before ->arch_setup()) calls
+ * early_amd_init which pokes 0xcf8 port.
+ */
+ set_iopl.iopl = 1;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+ if (rc != 0)
+ xen_raw_printk("physdev_op failed %d\n", rc);
#ifdef CONFIG_X86_32
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
- new_cpu_data.wp_works_ok = 1;
new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
#endif
@@ -1758,6 +1671,100 @@ asmlinkage __visible void __init xen_start_kernel(void)
#endif
}
+#ifdef CONFIG_XEN_PVH
+
+/*
+ * arch_setup hook for PVH guests (installed by xen_prepare_pvh()).
+ *
+ * With ACPI enabled and no IO-APIC counted in nr_ioapics, pick
+ * ACPI_IRQ_MODEL_PLATFORM so the default ACPI_IRQ_MODEL_PIC is not used.
+ * Without CONFIG_ACPI this is a no-op.
+ */
+static void xen_pvh_arch_setup(void)
+{
+#ifdef CONFIG_ACPI
+	if (!nr_ioapics)
+		acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM;
+#endif
+}
+
+/*
+ * Populate pvh_bootparams for a PVH guest.
+ *
+ * The memory map is fetched from Xen with XENMEM_memory_map directly into
+ * pvh_bootparams.e820_table; failure of that hypercall is fatal (BUG()).
+ * A reserved entry covering the ISA range is appended when there is room,
+ * otherwise a warning is printed. Every entry is then handed to
+ * e820__range_add() and the table is cleaned up with e820__update_table().
+ * Finally the command line, the ramdisk (first module, if any) and the
+ * boot protocol header fields are filled in from pvh_start_info.
+ */
+static void __init init_pvh_bootparams(void)
+{
+	struct xen_memory_map memmap;
+	unsigned int i;
+	int rc;
+
+	memset(&pvh_bootparams, 0, sizeof(pvh_bootparams));
+
+	memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table);
+	set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table);
+	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
+	if (rc) {
+		xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc);
+		BUG();
+	}
+
+	/* Append a reserved entry for the ISA range if a slot is left. */
+	if (memmap.nr_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) {
+		pvh_bootparams.e820_table[memmap.nr_entries].addr =
+			ISA_START_ADDRESS;
+		pvh_bootparams.e820_table[memmap.nr_entries].size =
+			ISA_END_ADDRESS - ISA_START_ADDRESS;
+		pvh_bootparams.e820_table[memmap.nr_entries].type =
+			E820_TYPE_RESERVED;
+		memmap.nr_entries++;
+	} else {
+		/* No free slot: the ISA range is left out of the map. */
+		xen_raw_printk("Warning: Can't fit ISA range into e820\n");
+	}
+
+	pvh_bootparams.e820_entries = memmap.nr_entries;
+	for (i = 0; i < pvh_bootparams.e820_entries; i++)
+		e820__range_add(pvh_bootparams.e820_table[i].addr,
+				pvh_bootparams.e820_table[i].size,
+				pvh_bootparams.e820_table[i].type);
+
+	e820__update_table(e820_table);
+
+	pvh_bootparams.hdr.cmd_line_ptr =
+		pvh_start_info.cmdline_paddr;
+
+	/* The first module is always ramdisk. */
+	if (pvh_start_info.nr_modules) {
+		struct hvm_modlist_entry *modaddr =
+			__va(pvh_start_info.modlist_paddr);
+		pvh_bootparams.hdr.ramdisk_image = modaddr->paddr;
+		pvh_bootparams.hdr.ramdisk_size = modaddr->size;
+	}
+
+	/*
+	 * See Documentation/x86/boot.txt.
+	 *
+	 * Version 2.12 supports Xen entry point but we will use default x86/PC
+	 * environment (i.e. hardware_subarch 0).
+	 */
+	pvh_bootparams.hdr.version = 0x212;
+	pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */
+}
+
+/*
+ * Early preparation for booting as a PVH guest.
+ *
+ * Checks pvh_start_info.magic against XEN_HVM_START_MAGIC_VALUE (BUG() on
+ * mismatch), sets xen_pvh, writes the physical address of hypercall_page
+ * to the MSR whose index is returned in EBX of CPUID leaf
+ * xen_cpuid_base() + 2, builds pvh_bootparams via init_pvh_bootparams()
+ * and installs xen_pvh_arch_setup() as the arch_setup hook.
+ *
+ * This routine (and those that it might call) should not use
+ * anything that lives in .bss since that segment will be cleared later.
+ */
+void __init xen_prepare_pvh(void)
+{
+ u32 msr;
+ u64 pfn; /* holds __pa(hypercall_page), i.e. an address, despite the name */
+
+ if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) {
+ xen_raw_printk("Error: Unexpected magic value (0x%08x)\n",
+ pvh_start_info.magic);
+ BUG();
+ }
+
+ xen_pvh = 1;
+
+ msr = cpuid_ebx(xen_cpuid_base() + 2);
+ pfn = __pa(hypercall_page);
+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); /* low 32 bits, then high 32 bits */
+
+ init_pvh_bootparams();
+
+ x86_init.oem.arch_setup = xen_pvh_arch_setup;
+}
+#endif
+
void __ref xen_hvm_init_shared_info(void)
{
int cpu;
@@ -1797,20 +1804,29 @@ void __ref xen_hvm_init_shared_info(void)
static void __init init_hvm_pv_info(void)
{
int major, minor;
- uint32_t eax, ebx, ecx, edx, pages, msr, base;
- u64 pfn;
+ uint32_t eax, ebx, ecx, edx, base;
base = xen_cpuid_base();
- cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+ eax = cpuid_eax(base + 1);
major = eax >> 16;
minor = eax & 0xffff;
printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
- cpuid(base + 2, &pages, &msr, &ecx, &edx);
+ xen_domain_type = XEN_HVM_DOMAIN;
- pfn = __pa(hypercall_page);
- wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+ /* PVH set up hypercall page in xen_prepare_pvh(). */
+ if (xen_pvh_domain())
+ pv_info.name = "Xen PVH";
+ else {
+ u64 pfn;
+ uint32_t msr;
+
+ pv_info.name = "Xen HVM";
+ msr = cpuid_ebx(base + 2);
+ pfn = __pa(hypercall_page);
+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+ }
xen_setup_features();
@@ -1819,10 +1835,6 @@ static void __init init_hvm_pv_info(void)
this_cpu_write(xen_vcpu_id, ebx);
else
this_cpu_write(xen_vcpu_id, smp_processor_id());
-
- pv_info.name = "Xen HVM";
-
- xen_domain_type = XEN_HVM_DOMAIN;
}
#endif
@@ -1910,6 +1922,9 @@ static void __init xen_hvm_guest_init(void)
x86_init.irqs.intr_init = xen_init_IRQ;
xen_hvm_init_time_ops();
xen_hvm_init_mmu_ops();
+
+ if (xen_pvh_domain())
+ machine_ops.emergency_restart = xen_emergency_restart;
#ifdef CONFIG_KEXEC_CORE
machine_ops.shutdown = xen_hvm_shutdown;
machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 7d5afdb417cc..f226038a39ca 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -38,7 +38,7 @@
*
* Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
*/
-#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/highmem.h>
#include <linux/debugfs.h>
#include <linux/bug.h>
@@ -58,7 +58,7 @@
#include <asm/mmu_context.h>
#include <asm/setup.h>
#include <asm/paravirt.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/linkage.h>
#include <asm/page.h>
#include <asm/init.h>
@@ -535,40 +535,41 @@ static pgd_t *xen_get_user_pgd(pgd_t *pgd)
return user_ptr;
}
-static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{
struct mmu_update u;
u.ptr = virt_to_machine(ptr).maddr;
- u.val = pgd_val_ma(val);
+ u.val = p4d_val_ma(val);
xen_extend_mmu_update(&u);
}
/*
- * Raw hypercall-based set_pgd, intended for in early boot before
+ * Raw hypercall-based set_p4d, intended for in early boot before
* there's a page structure. This implies:
* 1. The only existing pagetable is the kernel's
* 2. It is always pinned
* 3. It has no user pagetable attached to it
*/
-static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{
preempt_disable();
xen_mc_batch();
- __xen_set_pgd_hyper(ptr, val);
+ __xen_set_p4d_hyper(ptr, val);
xen_mc_issue(PARAVIRT_LAZY_MMU);
preempt_enable();
}
-static void xen_set_pgd(pgd_t *ptr, pgd_t val)
+static void xen_set_p4d(p4d_t *ptr, p4d_t val)
{
- pgd_t *user_ptr = xen_get_user_pgd(ptr);
+ pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
+ pgd_t pgd_val;
- trace_xen_mmu_set_pgd(ptr, user_ptr, val);
+ trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);
/* If page is not pinned, we can just update the entry
directly */
@@ -576,7 +577,8 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
*ptr = val;
if (user_ptr) {
WARN_ON(xen_page_pinned(user_ptr));
- *user_ptr = val;
+ pgd_val.pgd = p4d_val_ma(val);
+ *user_ptr = pgd_val;
}
return;
}
@@ -585,14 +587,72 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
user updates together. */
xen_mc_batch();
- __xen_set_pgd_hyper(ptr, val);
+ __xen_set_p4d_hyper(ptr, val);
if (user_ptr)
- __xen_set_pgd_hyper(user_ptr, val);
+ __xen_set_p4d_hyper((p4d_t *)user_ptr, val);
xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif /* CONFIG_PGTABLE_LEVELS == 4 */
+/*
+ * Invoke @func with PT_PTE on the page referenced by every populated
+ * entry of the PMD table @pmd.
+ *
+ * When @last is true only entries up to and including pmd_index(@limit)
+ * are visited; otherwise all PTRS_PER_PMD entries are. Returns the OR of
+ * all callback return values.
+ */
+static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
+		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
+		bool last, unsigned long limit)
+{
+	int idx, count, need_flush = 0;
+
+	count = PTRS_PER_PMD;
+	if (last)
+		count = pmd_index(limit) + 1;
+
+	for (idx = 0; idx < count; idx++) {
+		if (pmd_none(pmd[idx]))
+			continue;
+		need_flush |= func(mm, pmd_page(pmd[idx]), PT_PTE);
+	}
+	return need_flush;
+}
+
+/*
+ * Walk the PUD table @pud. For each populated entry, call @func with
+ * PT_PMD on the PMD table page (only when the PMD level is not folded,
+ * i.e. PTRS_PER_PMD > 1) and then descend via xen_pmd_walk().
+ *
+ * When @last is true only entries up to and including pud_index(@limit)
+ * are visited, and only the final visited entry is walked with the limit
+ * applied. Returns the OR of all callback return values.
+ */
+static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
+		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
+		bool last, unsigned long limit)
+{
+	int idx, count, need_flush = 0;
+
+	count = PTRS_PER_PUD;
+	if (last)
+		count = pud_index(limit) + 1;
+
+	for (idx = 0; idx < count; idx++) {
+		bool final_entry = last && idx == count - 1;
+		pmd_t *pmd_tbl;
+
+		if (pud_none(pud[idx]))
+			continue;
+
+		pmd_tbl = pmd_offset(&pud[idx], 0);
+		if (PTRS_PER_PMD > 1)
+			need_flush |= func(mm, virt_to_page(pmd_tbl), PT_PMD);
+		need_flush |= xen_pmd_walk(mm, pmd_tbl, func, final_entry, limit);
+	}
+	return need_flush;
+}
+
+/*
+ * Walk the P4D table @p4d. For each populated entry, call @func with
+ * PT_PUD on the PUD table page (only when the PUD level is not folded,
+ * i.e. PTRS_PER_PUD > 1) and then descend via xen_pud_walk().
+ *
+ * When @last is true only entries up to and including p4d_index(@limit)
+ * are visited, and only the final visited entry is walked with the limit
+ * applied. Returns the OR of all callback return values.
+ */
+static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
+		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
+		bool last, unsigned long limit)
+{
+	int idx, count, need_flush = 0;
+
+	count = PTRS_PER_P4D;
+	if (last)
+		count = p4d_index(limit) + 1;
+
+	for (idx = 0; idx < count; idx++) {
+		bool final_entry = last && idx == count - 1;
+		pud_t *pud_tbl;
+
+		if (p4d_none(p4d[idx]))
+			continue;
+
+		pud_tbl = pud_offset(&p4d[idx], 0);
+		if (PTRS_PER_PUD > 1)
+			need_flush |= func(mm, virt_to_page(pud_tbl), PT_PUD);
+		need_flush |= xen_pud_walk(mm, pud_tbl, func, final_entry, limit);
+	}
+	return need_flush;
+}
+
/*
* (Yet another) pagetable walker. This one is intended for pinning a
* pagetable. This means that it walks a pagetable and calls the
@@ -613,10 +673,8 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
enum pt_level),
unsigned long limit)
{
- int flush = 0;
+ int i, nr, flush = 0;
unsigned hole_low, hole_high;
- unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
- unsigned pgdidx, pudidx, pmdidx;
/* The limit is the last byte to be touched */
limit--;
@@ -633,65 +691,22 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
hole_low = pgd_index(USER_LIMIT);
hole_high = pgd_index(PAGE_OFFSET);
- pgdidx_limit = pgd_index(limit);
-#if PTRS_PER_PUD > 1
- pudidx_limit = pud_index(limit);
-#else
- pudidx_limit = 0;
-#endif
-#if PTRS_PER_PMD > 1
- pmdidx_limit = pmd_index(limit);
-#else
- pmdidx_limit = 0;
-#endif
-
- for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
- pud_t *pud;
+ nr = pgd_index(limit) + 1;
+ for (i = 0; i < nr; i++) {
+ p4d_t *p4d;
- if (pgdidx >= hole_low && pgdidx < hole_high)
+ if (i >= hole_low && i < hole_high)
continue;
- if (!pgd_val(pgd[pgdidx]))
+ if (pgd_none(pgd[i]))
continue;
- pud = pud_offset(&pgd[pgdidx], 0);
-
- if (PTRS_PER_PUD > 1) /* not folded */
- flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
-
- for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
- pmd_t *pmd;
-
- if (pgdidx == pgdidx_limit &&
- pudidx > pudidx_limit)
- goto out;
-
- if (pud_none(pud[pudidx]))
- continue;
-
- pmd = pmd_offset(&pud[pudidx], 0);
-
- if (PTRS_PER_PMD > 1) /* not folded */
- flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
-
- for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
- struct page *pte;
-
- if (pgdidx == pgdidx_limit &&
- pudidx == pudidx_limit &&
- pmdidx > pmdidx_limit)
- goto out;
-
- if (pmd_none(pmd[pmdidx]))
- continue;
-
- pte = pmd_page(pmd[pmdidx]);
- flush |= (*func)(mm, pte, PT_PTE);
- }
- }
+ p4d = p4d_offset(&pgd[i], 0);
+ if (PTRS_PER_P4D > 1)
+ flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
+ flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
}
-out:
/* Do the top level last, so that the callbacks can use it as
a cue to do final things like tlb flushes. */
flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
@@ -1150,57 +1165,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
xen_free_ro_pages(pa, PAGE_SIZE);
}
+/*
+ * Tear down one PMD entry of the initial P->M mapping.
+ *
+ * A large PMD has its PMD_SIZE range passed to xen_free_ro_pages() and
+ * nothing else is done. Otherwise every populated PTE below it has its
+ * page passed to xen_free_ro_pages(), the PMD entry is cleared, and the
+ * PTE table is released through xen_cleanmfnmap_free_pgtbl() with @unpin
+ * forwarded unchanged.
+ */
+static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
+{
+	unsigned long phys;
+	pte_t *ptes;
+	int idx;
+
+	if (pmd_large(*pmd)) {
+		phys = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
+		xen_free_ro_pages(phys, PMD_SIZE);
+		return;
+	}
+
+	ptes = pte_offset_kernel(pmd, 0);
+	for (idx = 0; idx < PTRS_PER_PTE; idx++) {
+		if (!pte_none(ptes[idx])) {
+			phys = pte_pfn(ptes[idx]) << PAGE_SHIFT;
+			xen_free_ro_pages(phys, PAGE_SIZE);
+		}
+	}
+	/* Clear the entry before giving up the table it pointed to. */
+	set_pmd(pmd, __pmd(0));
+	xen_cleanmfnmap_free_pgtbl(ptes, unpin);
+}
+
+/*
+ * Tear down one PUD entry of the initial P->M mapping.
+ *
+ * A large PUD has its PUD_SIZE range passed to xen_free_ro_pages() and
+ * nothing else is done. Otherwise every populated PMD below it is torn
+ * down with xen_cleanmfnmap_pmd(), the PUD entry is cleared, and the PMD
+ * table is released through xen_cleanmfnmap_free_pgtbl() with @unpin
+ * forwarded unchanged.
+ */
+static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
+{
+	unsigned long phys;
+	pmd_t *pmds;
+	int idx;
+
+	if (pud_large(*pud)) {
+		phys = pud_val(*pud) & PHYSICAL_PAGE_MASK;
+		xen_free_ro_pages(phys, PUD_SIZE);
+		return;
+	}
+
+	pmds = pmd_offset(pud, 0);
+	for (idx = 0; idx < PTRS_PER_PMD; idx++) {
+		if (!pmd_none(pmds[idx]))
+			xen_cleanmfnmap_pmd(&pmds[idx], unpin);
+	}
+	/* Clear the entry before giving up the table it pointed to. */
+	set_pud(pud, __pud(0));
+	xen_cleanmfnmap_free_pgtbl(pmds, unpin);
+}
+
+/*
+ * Tear down one P4D entry of the initial P->M mapping.
+ *
+ * A large P4D has its P4D_SIZE range passed to xen_free_ro_pages() and
+ * nothing else is done. Otherwise every populated PUD below it is torn
+ * down with xen_cleanmfnmap_pud(), the P4D entry is cleared, and the PUD
+ * table is released through xen_cleanmfnmap_free_pgtbl() with @unpin
+ * forwarded unchanged.
+ */
+static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
+{
+	unsigned long phys;
+	pud_t *puds;
+	int idx;
+
+	if (p4d_large(*p4d)) {
+		phys = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
+		xen_free_ro_pages(phys, P4D_SIZE);
+		return;
+	}
+
+	puds = pud_offset(p4d, 0);
+	for (idx = 0; idx < PTRS_PER_PUD; idx++) {
+		if (!pud_none(puds[idx]))
+			xen_cleanmfnmap_pud(&puds[idx], unpin);
+	}
+	/* Clear the entry before giving up the table it pointed to. */
+	set_p4d(p4d, __p4d(0));
+	xen_cleanmfnmap_free_pgtbl(puds, unpin);
+}
+
/*
* Since it is well isolated we can (and since it is perhaps large we should)
* also free the page tables mapping the initial P->M table.
*/
static void __init xen_cleanmfnmap(unsigned long vaddr)
{
- unsigned long va = vaddr & PMD_MASK;
- unsigned long pa;
- pgd_t *pgd = pgd_offset_k(va);
- pud_t *pud_page = pud_offset(pgd, 0);
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
+ pgd_t *pgd;
+ p4d_t *p4d;
unsigned int i;
bool unpin;
unpin = (vaddr == 2 * PGDIR_SIZE);
- set_pgd(pgd, __pgd(0));
- do {
- pud = pud_page + pud_index(va);
- if (pud_none(*pud)) {
- va += PUD_SIZE;
- } else if (pud_large(*pud)) {
- pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
- xen_free_ro_pages(pa, PUD_SIZE);
- va += PUD_SIZE;
- } else {
- pmd = pmd_offset(pud, va);
- if (pmd_large(*pmd)) {
- pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
- xen_free_ro_pages(pa, PMD_SIZE);
- } else if (!pmd_none(*pmd)) {
- pte = pte_offset_kernel(pmd, va);
- set_pmd(pmd, __pmd(0));
- for (i = 0; i < PTRS_PER_PTE; ++i) {
- if (pte_none(pte[i]))
- break;
- pa = pte_pfn(pte[i]) << PAGE_SHIFT;
- xen_free_ro_pages(pa, PAGE_SIZE);
- }
- xen_cleanmfnmap_free_pgtbl(pte, unpin);
- }
- va += PMD_SIZE;
- if (pmd_index(va))
- continue;
- set_pud(pud, __pud(0));
- xen_cleanmfnmap_free_pgtbl(pmd, unpin);
- }
-
- } while (pud_index(va) || pmd_index(va));
- xen_cleanmfnmap_free_pgtbl(pud_page, unpin);
+ vaddr &= PMD_MASK;
+ pgd = pgd_offset_k(vaddr);
+ p4d = p4d_offset(pgd, 0);
+ for (i = 0; i < PTRS_PER_P4D; i++) {
+ if (p4d_none(p4d[i]))
+ continue;
+ xen_cleanmfnmap_p4d(p4d + i, unpin);
+ }
+ if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+ set_pgd(pgd, __pgd(0));
+ xen_cleanmfnmap_free_pgtbl(p4d, unpin);
+ }
}
static void __init xen_pagetable_p2m_free(void)
@@ -1538,7 +1593,6 @@ static int xen_pgd_alloc(struct mm_struct *mm)
BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
}
#endif
-
return ret;
}
@@ -1730,7 +1784,7 @@ static void xen_release_pmd(unsigned long pfn)
xen_release_ptpage(pfn, PT_PMD);
}
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -1792,10 +1846,6 @@ static void __init set_page_prot_flags(void *addr, pgprot_t prot,
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
pte_t pte = pfn_pte(pfn, prot);
- /* For PVH no need to set R/O or R/W to pin them or unpin them. */
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
-
if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
BUG();
}
@@ -1902,8 +1952,7 @@ static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
* level2_ident_pgt, and level2_kernel_pgt. This means that only the
* kernel has a physical mapping to start with - but that's enough to
* get __va working. We need to fill in the rest of the physical
- * mapping once some sort of allocator has been set up. NOTE: for
- * PVH, the page tables are native.
+ * mapping once some sort of allocator has been set up.
*/
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
@@ -2076,21 +2125,27 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
*/
void __init xen_relocate_p2m(void)
{
- phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
+ phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
- int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
+ int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
pte_t *pt;
pmd_t *pmd;
pud_t *pud;
+ p4d_t *p4d = NULL;
pgd_t *pgd;
unsigned long *new_p2m;
+ int save_pud;
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
- n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
- n_frames = n_pte + n_pt + n_pmd + n_pud;
+ n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
+ if (PTRS_PER_P4D > 1)
+ n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
+ else
+ n_p4d = 0;
+ n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;
new_area = xen_find_free_area(PFN_PHYS(n_frames));
if (!new_area) {
@@ -2106,55 +2161,76 @@ void __init xen_relocate_p2m(void)
* To avoid any possible virtual address collision, just use
* 2 * PUD_SIZE for the new area.
*/
- pud_phys = new_area;
+ p4d_phys = new_area;
+ pud_phys = p4d_phys + PFN_PHYS(n_p4d);
pmd_phys = pud_phys + PFN_PHYS(n_pud);
pt_phys = pmd_phys + PFN_PHYS(n_pmd);
p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
pgd = __va(read_cr3());
new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
- for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
- pud = early_memremap(pud_phys, PAGE_SIZE);
- clear_page(pud);
- for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
- idx_pmd++) {
- pmd = early_memremap(pmd_phys, PAGE_SIZE);
- clear_page(pmd);
- for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
- idx_pt++) {
- pt = early_memremap(pt_phys, PAGE_SIZE);
- clear_page(pt);
- for (idx_pte = 0;
- idx_pte < min(n_pte, PTRS_PER_PTE);
- idx_pte++) {
- set_pte(pt + idx_pte,
- pfn_pte(p2m_pfn, PAGE_KERNEL));
- p2m_pfn++;
+ idx_p4d = 0;
+ save_pud = n_pud;
+ do {
+ if (n_p4d > 0) {
+ p4d = early_memremap(p4d_phys, PAGE_SIZE);
+ clear_page(p4d);
+ n_pud = min(save_pud, PTRS_PER_P4D);
+ }
+ for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+ pud = early_memremap(pud_phys, PAGE_SIZE);
+ clear_page(pud);
+ for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+ idx_pmd++) {
+ pmd = early_memremap(pmd_phys, PAGE_SIZE);
+ clear_page(pmd);
+ for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+ idx_pt++) {
+ pt = early_memremap(pt_phys, PAGE_SIZE);
+ clear_page(pt);
+ for (idx_pte = 0;
+ idx_pte < min(n_pte, PTRS_PER_PTE);
+ idx_pte++) {
+ set_pte(pt + idx_pte,
+ pfn_pte(p2m_pfn, PAGE_KERNEL));
+ p2m_pfn++;
+ }
+ n_pte -= PTRS_PER_PTE;
+ early_memunmap(pt, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pt_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+ PFN_DOWN(pt_phys));
+ set_pmd(pmd + idx_pt,
+ __pmd(_PAGE_TABLE | pt_phys));
+ pt_phys += PAGE_SIZE;
}
- n_pte -= PTRS_PER_PTE;
- early_memunmap(pt, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pt_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
- PFN_DOWN(pt_phys));
- set_pmd(pmd + idx_pt,
- __pmd(_PAGE_TABLE | pt_phys));
- pt_phys += PAGE_SIZE;
+ n_pt -= PTRS_PER_PMD;
+ early_memunmap(pmd, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pmd_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+ PFN_DOWN(pmd_phys));
+ set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+ pmd_phys += PAGE_SIZE;
}
- n_pt -= PTRS_PER_PMD;
- early_memunmap(pmd, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pmd_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
- PFN_DOWN(pmd_phys));
- set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
- pmd_phys += PAGE_SIZE;
+ n_pmd -= PTRS_PER_PUD;
+ early_memunmap(pud, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(pud_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+ if (n_p4d > 0)
+ set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
+ else
+ set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+ pud_phys += PAGE_SIZE;
}
- n_pmd -= PTRS_PER_PUD;
- early_memunmap(pud, PAGE_SIZE);
- make_lowmem_page_readonly(__va(pud_phys));
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
- set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
- pud_phys += PAGE_SIZE;
- }
+ if (n_p4d > 0) {
+ save_pud -= PTRS_PER_P4D;
+ early_memunmap(p4d, PAGE_SIZE);
+ make_lowmem_page_readonly(__va(p4d_phys));
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
+ set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
+ p4d_phys += PAGE_SIZE;
+ }
+ } while (++idx_p4d < n_p4d);
/* Now copy the old p2m info to the new area. */
memcpy(new_p2m, xen_p2m_addr, size);
@@ -2331,6 +2407,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
#endif
case FIX_TEXT_POKE0:
case FIX_TEXT_POKE1:
+ case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
/* All local page mappings */
pte = pfn_pte(phys, prot);
break;
@@ -2383,8 +2460,8 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.set_pte = xen_set_pte;
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
-#if CONFIG_PGTABLE_LEVELS == 4
- pv_mmu_ops.set_pgd = xen_set_pgd;
+#if CONFIG_PGTABLE_LEVELS >= 4
+ pv_mmu_ops.set_p4d = xen_set_p4d;
#endif
/* This will work as long as patching hasn't happened yet
@@ -2393,7 +2470,7 @@ static void __init xen_post_allocator_init(void)
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
pv_mmu_ops.release_pte = xen_release_pte;
pv_mmu_ops.release_pmd = xen_release_pmd;
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
pv_mmu_ops.alloc_pud = xen_alloc_pud;
pv_mmu_ops.release_pud = xen_release_pud;
#endif
@@ -2459,10 +2536,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
-#if CONFIG_PGTABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS >= 4
.pud_val = PV_CALLEE_SAVE(xen_pud_val),
.make_pud = PV_CALLEE_SAVE(xen_make_pud),
- .set_pgd = xen_set_pgd_hyper,
+ .set_p4d = xen_set_p4d_hyper,
.alloc_pud = xen_alloc_pmd_init,
.release_pud = xen_release_pmd_init,
@@ -2812,16 +2889,6 @@ static int do_remap_gfn(struct vm_area_struct *vma,
BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
- if (xen_feature(XENFEAT_auto_translated_physmap)) {
-#ifdef CONFIG_XEN_PVH
- /* We need to update the local page tables and the xen HAP */
- return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
- prot, domid, pages);
-#else
- return -EINVAL;
-#endif
- }
-
rmd.mfn = gfn;
rmd.prot = prot;
/* We use the err_ptr to indicate if there we are doing a contiguous
@@ -2915,10 +2982,6 @@ int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
return 0;
-#ifdef CONFIG_XEN_PVH
- return xen_xlate_unmap_gfn_range(vma, numpgs, pages);
-#else
return -EINVAL;
-#endif
}
EXPORT_SYMBOL_GPL(xen_unmap_domain_gfn_range);
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 73809bb951b4..3fe2b3292915 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -5,6 +5,7 @@
enum pt_level {
PT_PGD,
+ PT_P4D,
PT_PUD,
PT_PMD,
PT_PTE
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index a9fafb5c8738..42b08f8fc2ca 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -18,7 +18,7 @@
int xen_swiotlb __read_mostly;
-static struct dma_map_ops xen_swiotlb_dma_ops = {
+static const struct dma_map_ops xen_swiotlb_dma_ops = {
.alloc = xen_swiotlb_alloc_coherent,
.free = xen_swiotlb_free_coherent,
.sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu,
@@ -48,7 +48,7 @@ int __init pci_xen_swiotlb_detect(void)
* activate this IOMMU. If running as PV privileged, activate it
* irregardless.
*/
- if ((xen_initial_domain() || swiotlb || swiotlb_force))
+ if (xen_initial_domain() || swiotlb || swiotlb_force == SWIOTLB_FORCE)
xen_swiotlb = 1;
/* If we are running under Xen, we MUST disable the native SWIOTLB.
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 90d1b83cf35f..33a783c77d96 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -73,8 +73,8 @@ bool xen_has_pv_devices(void)
if (!xen_domain())
return false;
- /* PV domains always have them. */
- if (xen_pv_domain())
+ /* PV and PVH domains always have them. */
+ if (xen_pv_domain() || xen_pvh_domain())
return true;
/* And user has xen_platform_pci=0 set in guest config as
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 8c394e30e5fe..a5bf7c451435 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -14,7 +14,7 @@
#include <asm/elf.h>
#include <asm/vdso.h>
-#include <asm/e820.h>
+#include <asm/e820/api.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/numa.h>
@@ -41,8 +41,7 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
unsigned long xen_released_pages;
/* E820 map used during setting up memory. */
-static struct e820entry xen_e820_map[E820_X_MAX] __initdata;
-static u32 xen_e820_map_entries __initdata;
+static struct e820_table xen_e820_table __initdata;
/*
* Buffer used to remap identity mapped pages. We only need the virtual space.
@@ -198,15 +197,15 @@ void __init xen_inv_extra_mem(void)
*/
static unsigned long __init xen_find_pfn_range(unsigned long *min_pfn)
{
- const struct e820entry *entry = xen_e820_map;
+ const struct e820_entry *entry = xen_e820_table.entries;
unsigned int i;
unsigned long done = 0;
- for (i = 0; i < xen_e820_map_entries; i++, entry++) {
+ for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
unsigned long s_pfn;
unsigned long e_pfn;
- if (entry->type != E820_RAM)
+ if (entry->type != E820_TYPE_RAM)
continue;
e_pfn = PFN_DOWN(entry->addr + entry->size);
@@ -457,7 +456,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
{
phys_addr_t start = 0;
unsigned long ret_val = 0;
- const struct e820entry *entry = xen_e820_map;
+ const struct e820_entry *entry = xen_e820_table.entries;
int i;
/*
@@ -471,13 +470,13 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages,
* example) the DMI tables in a reserved region that begins on
* a non-page boundary.
*/
- for (i = 0; i < xen_e820_map_entries; i++, entry++) {
+ for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
phys_addr_t end = entry->addr + entry->size;
- if (entry->type == E820_RAM || i == xen_e820_map_entries - 1) {
+ if (entry->type == E820_TYPE_RAM || i == xen_e820_table.nr_entries - 1) {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
- if (entry->type == E820_RAM)
+ if (entry->type == E820_TYPE_RAM)
end_pfn = PFN_UP(entry->addr);
if (start_pfn < end_pfn)
@@ -591,28 +590,28 @@ static void __init xen_align_and_add_e820_region(phys_addr_t start,
phys_addr_t end = start + size;
/* Align RAM regions to page boundaries. */
- if (type == E820_RAM) {
+ if (type == E820_TYPE_RAM) {
start = PAGE_ALIGN(start);
end &= ~((phys_addr_t)PAGE_SIZE - 1);
}
- e820_add_region(start, end - start, type);
+ e820__range_add(start, end - start, type);
}
static void __init xen_ignore_unusable(void)
{
- struct e820entry *entry = xen_e820_map;
+ struct e820_entry *entry = xen_e820_table.entries;
unsigned int i;
- for (i = 0; i < xen_e820_map_entries; i++, entry++) {
- if (entry->type == E820_UNUSABLE)
- entry->type = E820_RAM;
+ for (i = 0; i < xen_e820_table.nr_entries; i++, entry++) {
+ if (entry->type == E820_TYPE_UNUSABLE)
+ entry->type = E820_TYPE_RAM;
}
}
bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
{
- struct e820entry *entry;
+ struct e820_entry *entry;
unsigned mapcnt;
phys_addr_t end;
@@ -620,10 +619,10 @@ bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size)
return false;
end = start + size;
- entry = xen_e820_map;
+ entry = xen_e820_table.entries;
- for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++) {
- if (entry->type == E820_RAM && entry->addr <= start &&
+ for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) {
+ if (entry->type == E820_TYPE_RAM && entry->addr <= start &&
(entry->addr + entry->size) >= end)
return false;
@@ -645,10 +644,10 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size)
{
unsigned mapcnt;
phys_addr_t addr, start;
- struct e820entry *entry = xen_e820_map;
+ struct e820_entry *entry = xen_e820_table.entries;
- for (mapcnt = 0; mapcnt < xen_e820_map_entries; mapcnt++, entry++) {
- if (entry->type != E820_RAM || entry->size < size)
+ for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++, entry++) {
+ if (entry->type != E820_TYPE_RAM || entry->size < size)
continue;
start = entry->addr;
for (addr = start; addr < start + size; addr += PAGE_SIZE) {
@@ -713,10 +712,9 @@ static void __init xen_reserve_xen_mfnlist(void)
size = PFN_PHYS(xen_start_info->nr_p2m_frames);
}
- if (!xen_is_e820_reserved(start, size)) {
- memblock_reserve(start, size);
+ memblock_reserve(start, size);
+ if (!xen_is_e820_reserved(start, size))
return;
- }
#ifdef CONFIG_X86_32
/*
@@ -727,6 +725,7 @@ static void __init xen_reserve_xen_mfnlist(void)
BUG();
#else
xen_relocate_p2m();
+ memblock_free(start, size);
#endif
}
@@ -750,8 +749,8 @@ char * __init xen_memory_setup(void)
max_pfn = min(max_pfn, xen_start_info->nr_pages);
mem_end = PFN_PHYS(max_pfn);
- memmap.nr_entries = ARRAY_SIZE(xen_e820_map);
- set_xen_guest_handle(memmap.buffer, xen_e820_map);
+ memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
+ set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
op = xen_initial_domain() ?
XENMEM_machine_memory_map :
@@ -760,16 +759,16 @@ char * __init xen_memory_setup(void)
if (rc == -ENOSYS) {
BUG_ON(xen_initial_domain());
memmap.nr_entries = 1;
- xen_e820_map[0].addr = 0ULL;
- xen_e820_map[0].size = mem_end;
+ xen_e820_table.entries[0].addr = 0ULL;
+ xen_e820_table.entries[0].size = mem_end;
/* 8MB slack (to balance backend allocations). */
- xen_e820_map[0].size += 8ULL << 20;
- xen_e820_map[0].type = E820_RAM;
+ xen_e820_table.entries[0].size += 8ULL << 20;
+ xen_e820_table.entries[0].type = E820_TYPE_RAM;
rc = 0;
}
BUG_ON(rc);
BUG_ON(memmap.nr_entries == 0);
- xen_e820_map_entries = memmap.nr_entries;
+ xen_e820_table.nr_entries = memmap.nr_entries;
/*
* Xen won't allow a 1:1 mapping to be created to UNUSABLE
@@ -783,8 +782,7 @@ char * __init xen_memory_setup(void)
xen_ignore_unusable();
/* Make sure the Xen-supplied memory map is well-ordered. */
- sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
- &xen_e820_map_entries);
+ e820__update_table(&xen_e820_table);
max_pages = xen_get_max_pages();
@@ -811,15 +809,15 @@ char * __init xen_memory_setup(void)
extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
extra_pages, max_pages - max_pfn);
i = 0;
- addr = xen_e820_map[0].addr;
- size = xen_e820_map[0].size;
- while (i < xen_e820_map_entries) {
+ addr = xen_e820_table.entries[0].addr;
+ size = xen_e820_table.entries[0].size;
+ while (i < xen_e820_table.nr_entries) {
bool discard = false;
chunk_size = size;
- type = xen_e820_map[i].type;
+ type = xen_e820_table.entries[i].type;
- if (type == E820_RAM) {
+ if (type == E820_TYPE_RAM) {
if (addr < mem_end) {
chunk_size = min(size, mem_end - addr);
} else if (extra_pages) {
@@ -840,9 +838,9 @@ char * __init xen_memory_setup(void)
size -= chunk_size;
if (size == 0) {
i++;
- if (i < xen_e820_map_entries) {
- addr = xen_e820_map[i].addr;
- size = xen_e820_map[i].size;
+ if (i < xen_e820_table.nr_entries) {
+ addr = xen_e820_table.entries[i].addr;
+ size = xen_e820_table.entries[i].size;
}
}
}
@@ -858,10 +856,9 @@ char * __init xen_memory_setup(void)
* reserve ISA memory anyway because too many things poke
* about in there.
*/
- e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
- E820_RESERVED);
+ e820__range_add(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_TYPE_RESERVED);
- sanitize_e820_map(e820->map, ARRAY_SIZE(e820->map), &e820->nr_map);
+ e820__update_table(e820_table);
/*
* Check whether the kernel itself conflicts with the target E820 map.
@@ -923,21 +920,19 @@ char * __init xen_auto_xlated_memory_setup(void)
int i;
int rc;
- memmap.nr_entries = ARRAY_SIZE(xen_e820_map);
- set_xen_guest_handle(memmap.buffer, xen_e820_map);
+ memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
+ set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
if (rc < 0)
panic("No memory map (%d)\n", rc);
- xen_e820_map_entries = memmap.nr_entries;
+ xen_e820_table.nr_entries = memmap.nr_entries;
- sanitize_e820_map(xen_e820_map, ARRAY_SIZE(xen_e820_map),
- &xen_e820_map_entries);
+ e820__update_table(&xen_e820_table);
- for (i = 0; i < xen_e820_map_entries; i++)
- e820_add_region(xen_e820_map[i].addr, xen_e820_map[i].size,
- xen_e820_map[i].type);
+ for (i = 0; i < xen_e820_table.nr_entries; i++)
+ e820__range_add(xen_e820_table.entries[i].addr, xen_e820_table.entries[i].size, xen_e820_table.entries[i].type);
/* Remove p2m info, it is not needed. */
xen_start_info->mfn_list = 0;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 311acad7dad2..eaa36162ed4a 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -18,6 +18,7 @@
#include <linux/smp.h>
#include <linux/irq_work.h>
#include <linux/tick.h>
+#include <linux/nmi.h>
#include <asm/paravirt.h>
#include <asm/desc.h>
@@ -99,18 +100,8 @@ static void cpu_bringup(void)
local_irq_enable();
}
-/*
- * Note: cpu parameter is only relevant for PVH. The reason for passing it
- * is we can't do smp_processor_id until the percpu segments are loaded, for
- * which we need the cpu number! So we pass it in rdi as first parameter.
- */
-asmlinkage __visible void cpu_bringup_and_idle(int cpu)
+asmlinkage __visible void cpu_bringup_and_idle(void)
{
-#ifdef CONFIG_XEN_PVH
- if (xen_feature(XENFEAT_auto_translated_physmap) &&
- xen_feature(XENFEAT_supervisor_mode_kernel))
- xen_pvh_secondary_vcpu_init(cpu);
-#endif
cpu_bringup();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}
@@ -401,64 +392,50 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
if (ctxt == NULL)
return -ENOMEM;
- gdt = get_cpu_gdt_table(cpu);
+ gdt = get_cpu_gdt_rw(cpu);
#ifdef CONFIG_X86_32
- /* Note: PVH is not yet supported on x86_32. */
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
#endif
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
- ctxt->flags = VGCF_IN_KERNEL;
- ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
- ctxt->user_regs.ds = __USER_DS;
- ctxt->user_regs.es = __USER_DS;
- ctxt->user_regs.ss = __KERNEL_DS;
+ ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+ ctxt->flags = VGCF_IN_KERNEL;
+ ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+ ctxt->user_regs.ds = __USER_DS;
+ ctxt->user_regs.es = __USER_DS;
+ ctxt->user_regs.ss = __KERNEL_DS;
- xen_copy_trap_info(ctxt->trap_ctxt);
+ xen_copy_trap_info(ctxt->trap_ctxt);
- ctxt->ldt_ents = 0;
+ ctxt->ldt_ents = 0;
- BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+ BUG_ON((unsigned long)gdt & ~PAGE_MASK);
- gdt_mfn = arbitrary_virt_to_mfn(gdt);
- make_lowmem_page_readonly(gdt);
- make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
+ gdt_mfn = arbitrary_virt_to_mfn(gdt);
+ make_lowmem_page_readonly(gdt);
+ make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
- ctxt->gdt_frames[0] = gdt_mfn;
- ctxt->gdt_ents = GDT_ENTRIES;
+ ctxt->gdt_frames[0] = gdt_mfn;
+ ctxt->gdt_ents = GDT_ENTRIES;
- ctxt->kernel_ss = __KERNEL_DS;
- ctxt->kernel_sp = idle->thread.sp0;
+ ctxt->kernel_ss = __KERNEL_DS;
+ ctxt->kernel_sp = idle->thread.sp0;
#ifdef CONFIG_X86_32
- ctxt->event_callback_cs = __KERNEL_CS;
- ctxt->failsafe_callback_cs = __KERNEL_CS;
+ ctxt->event_callback_cs = __KERNEL_CS;
+ ctxt->failsafe_callback_cs = __KERNEL_CS;
#else
- ctxt->gs_base_kernel = per_cpu_offset(cpu);
-#endif
- ctxt->event_callback_eip =
- (unsigned long)xen_hypervisor_callback;
- ctxt->failsafe_callback_eip =
- (unsigned long)xen_failsafe_callback;
- ctxt->user_regs.cs = __KERNEL_CS;
- per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
- }
-#ifdef CONFIG_XEN_PVH
- else {
- /*
- * The vcpu comes on kernel page tables which have the NX pte
- * bit set. This means before DS/SS is touched, NX in
- * EFER must be set. Hence the following assembly glue code.
- */
- ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init;
- ctxt->user_regs.rdi = cpu;
- ctxt->user_regs.rsi = true; /* entry == true */
- }
+ ctxt->gs_base_kernel = per_cpu_offset(cpu);
#endif
+ ctxt->event_callback_eip =
+ (unsigned long)xen_hypervisor_callback;
+ ctxt->failsafe_callback_eip =
+ (unsigned long)xen_failsafe_callback;
+ ctxt->user_regs.cs = __KERNEL_CS;
+ per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index c5c16dc4f694..9beef333584a 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -21,12 +21,4 @@ static inline int xen_smp_intr_init(unsigned int cpu)
static inline void xen_smp_intr_free(unsigned int cpu) {}
#endif /* CONFIG_SMP */
-#ifdef CONFIG_XEN_PVH
-extern void xen_pvh_early_cpu_init(int cpu, bool entry);
-#else
-static inline void xen_pvh_early_cpu_init(int cpu, bool entry)
-{
-}
-#endif
-
#endif
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index e8a9ea7d7a21..25a7c4302ce7 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -141,25 +141,6 @@ void __init xen_init_spinlocks(void)
pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen);
}
-/*
- * While the jump_label init code needs to happend _after_ the jump labels are
- * enabled and before SMP is started. Hence we use pre-SMP initcall level
- * init. We cannot do it in xen_init_spinlocks as that is done before
- * jump labels are activated.
- */
-static __init int xen_init_spinlocks_jump(void)
-{
- if (!xen_pvspin)
- return 0;
-
- if (!xen_domain())
- return 0;
-
- static_key_slow_inc(&paravirt_ticketlocks_enabled);
- return 0;
-}
-early_initcall(xen_init_spinlocks_jump);
-
static __init int xen_parse_nopvspin(char *arg)
{
xen_pvspin = false;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 1e69956d7852..7a3089285c59 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -209,7 +209,9 @@ static const struct clock_event_device xen_timerop_clockevent = {
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
+ .max_delta_ticks = 0xffffffff,
.min_delta_ns = TIMER_SLOP,
+ .min_delta_ticks = TIMER_SLOP,
.mult = 1,
.shift = 0,
@@ -268,7 +270,9 @@ static const struct clock_event_device xen_vcpuop_clockevent = {
.features = CLOCK_EVT_FEAT_ONESHOT,
.max_delta_ns = 0xffffffff,
+ .max_delta_ticks = 0xffffffff,
.min_delta_ns = TIMER_SLOP,
+ .min_delta_ticks = TIMER_SLOP,
.mult = 1,
.shift = 0,
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7f8d8abf4c1a..37794e42b67d 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -16,25 +16,6 @@
#include <xen/interface/xen-mca.h>
#include <asm/xen/interface.h>
-#ifdef CONFIG_XEN_PVH
-#define PVH_FEATURES_STR "|writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel"
-/* Note the lack of 'hvm_callback_vector'. Older hypervisor will
- * balk at this being part of XEN_ELFNOTE_FEATURES, so we put it in
- * XEN_ELFNOTE_SUPPORTED_FEATURES which older hypervisors will ignore.
- */
-#define PVH_FEATURES ((1 << XENFEAT_writable_page_tables) | \
- (1 << XENFEAT_auto_translated_physmap) | \
- (1 << XENFEAT_supervisor_mode_kernel) | \
- (1 << XENFEAT_hvm_callback_vector))
-/* The XENFEAT_writable_page_tables is not stricly necessary as we set that
- * up regardless whether this CONFIG option is enabled or not, but it
- * clarifies what the right flags need to be.
- */
-#else
-#define PVH_FEATURES_STR ""
-#define PVH_FEATURES (0)
-#endif
-
__INIT
ENTRY(startup_xen)
cld
@@ -54,41 +35,6 @@ ENTRY(startup_xen)
__FINIT
-#ifdef CONFIG_XEN_PVH
-/*
- * xen_pvh_early_cpu_init() - early PVH VCPU initialization
- * @cpu: this cpu number (%rdi)
- * @entry: true if this is a secondary vcpu coming up on this entry
- * point, false if this is the boot CPU being initialized for
- * the first time (%rsi)
- *
- * Note: This is called as a function on the boot CPU, and is the entry point
- * on the secondary CPU.
- */
-ENTRY(xen_pvh_early_cpu_init)
- mov %rsi, %r11
-
- /* Gather features to see if NX implemented. */
- mov $0x80000001, %eax
- cpuid
- mov %edx, %esi
-
- mov $MSR_EFER, %ecx
- rdmsr
- bts $_EFER_SCE, %eax
-
- bt $20, %esi
- jnc 1f /* No NX, skip setting it */
- bts $_EFER_NX, %eax
-1: wrmsr
-#ifdef CONFIG_SMP
- cmp $0, %r11b
- jne cpu_bringup_and_idle
-#endif
- ret
-
-#endif /* CONFIG_XEN_PVH */
-
.pushsection .text
.balign PAGE_SIZE
ENTRY(hypercall_page)
@@ -114,10 +60,10 @@ ENTRY(hypercall_page)
#endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
- ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .ascii "!writable_page_tables|pae_pgdir_above_4gb"; .asciz PVH_FEATURES_STR)
- ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES, .long (PVH_FEATURES) |
- (1 << XENFEAT_writable_page_tables) |
- (1 << XENFEAT_dom0))
+ ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,
+ .ascii "!writable_page_tables|pae_pgdir_above_4gb")
+ ELFNOTE(Xen, XEN_ELFNOTE_SUPPORTED_FEATURES,
+ .long (1 << XENFEAT_writable_page_tables) | (1 << XENFEAT_dom0))
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index ac0a2b0f9e62..f6a41c41ebc7 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -146,5 +146,4 @@ __visible void xen_adjust_exception_frame(void);
extern int xen_panic_handler_init(void);
-void xen_pvh_secondary_vcpu_init(int cpu);
#endif /* XEN_OPS_H */
diff --git a/arch/x86/xen/xen-pvh.S b/arch/x86/xen/xen-pvh.S
new file mode 100644
index 000000000000..5e246716d58f
--- /dev/null
+++ b/arch/x86/xen/xen-pvh.S
@@ -0,0 +1,161 @@
+/*
+ * Copyright C 2016, Oracle and/or its affiliates. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+ .code32
+ .text
+#define _pa(x) ((x) - __START_KERNEL_map)
+
+#include <linux/elfnote.h>
+#include <linux/init.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/asm.h>
+#include <asm/boot.h>
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <xen/interface/elfnote.h>
+
+ __HEAD
+
+/*
+ * Entry point for PVH guests.
+ *
+ * Xen ABI specifies the following register state when we come here:
+ *
+ * - `ebx`: contains the physical memory address where the loader has placed
+ * the boot start info structure.
+ * - `cr0`: bit 0 (PE) must be set. All the other writeable bits are cleared.
+ * - `cr4`: all bits are cleared.
+ * - `cs `: must be a 32-bit read/execute code segment with a base of '0'
+ * and a limit of '0xFFFFFFFF'. The selector value is unspecified.
+ * - `ds`, `es`: must be a 32-bit read/write data segment with a base of
+ * '0' and a limit of '0xFFFFFFFF'. The selector values are all
+ * unspecified.
+ * - `tr`: must be a 32-bit TSS (active) with a base of '0' and a limit
+ * of '0x67'.
+ * - `eflags`: bit 17 (VM) must be cleared. Bit 9 (IF) must be cleared.
+ * Bit 8 (TF) must be cleared. Other bits are all unspecified.
+ *
+ * All other processor registers and flag bits are unspecified. The OS is in
+ * charge of setting up its own stack, GDT and IDT.
+ */
+
+ENTRY(pvh_start_xen)
+ cld
+
+ lgdt (_pa(gdt))
+
+ mov $(__BOOT_DS),%eax
+ mov %eax,%ds
+ mov %eax,%es
+ mov %eax,%ss
+
+ /* Stash hvm_start_info. */
+ mov $_pa(pvh_start_info), %edi
+ mov %ebx, %esi
+ mov _pa(pvh_start_info_sz), %ecx
+ shr $2,%ecx
+ rep
+ movsl
+
+ mov $_pa(early_stack_end), %esp
+
+ /* Enable PAE mode. */
+ mov %cr4, %eax
+ orl $X86_CR4_PAE, %eax
+ mov %eax, %cr4
+
+#ifdef CONFIG_X86_64
+ /* Enable Long mode. */
+ mov $MSR_EFER, %ecx
+ rdmsr
+ btsl $_EFER_LME, %eax
+ wrmsr
+
+ /* Enable pre-constructed page tables. */
+ mov $_pa(init_level4_pgt), %eax
+ mov %eax, %cr3
+ mov $(X86_CR0_PG | X86_CR0_PE), %eax
+ mov %eax, %cr0
+
+ /* Jump to 64-bit mode. */
+ ljmp $__KERNEL_CS, $_pa(1f)
+
+ /* 64-bit entry point. */
+ .code64
+1:
+ call xen_prepare_pvh
+
+ /* startup_64 expects boot_params in %rsi. */
+ mov $_pa(pvh_bootparams), %rsi
+ mov $_pa(startup_64), %rax
+ jmp *%rax
+
+#else /* CONFIG_X86_64 */
+
+ call mk_early_pgtbl_32
+
+ mov $_pa(initial_page_table), %eax
+ mov %eax, %cr3
+
+ mov %cr0, %eax
+ or $(X86_CR0_PG | X86_CR0_PE), %eax
+ mov %eax, %cr0
+
+ ljmp $__BOOT_CS, $1f
+1:
+ call xen_prepare_pvh
+ mov $_pa(pvh_bootparams), %esi
+
+ /* startup_32 doesn't expect paging and PAE to be on. */
+ ljmp $__BOOT_CS, $_pa(2f)
+2:
+ mov %cr0, %eax
+ and $~X86_CR0_PG, %eax
+ mov %eax, %cr0
+ mov %cr4, %eax
+ and $~X86_CR4_PAE, %eax
+ mov %eax, %cr4
+
+ ljmp $__BOOT_CS, $_pa(startup_32)
+#endif
+END(pvh_start_xen)
+
+ .section ".init.data","aw"
+ .balign 8
+gdt:
+ .word gdt_end - gdt_start
+ .long _pa(gdt_start)
+ .word 0
+gdt_start:
+ .quad 0x0000000000000000 /* NULL descriptor */
+ .quad 0x0000000000000000 /* reserved */
+#ifdef CONFIG_X86_64
+ .quad GDT_ENTRY(0xa09a, 0, 0xfffff) /* __KERNEL_CS */
+#else
+ .quad GDT_ENTRY(0xc09a, 0, 0xfffff) /* __KERNEL_CS */
+#endif
+ .quad GDT_ENTRY(0xc092, 0, 0xfffff) /* __KERNEL_DS */
+gdt_end:
+
+ .balign 4
+early_stack:
+ .fill 256, 1, 0
+early_stack_end:
+
+ ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY,
+ _ASM_PTR (pvh_start_xen - __START_KERNEL_map))