diff options
author | Thomas Gleixner <tglx@linutronix.de> | 2007-10-11 11:16:51 +0200 |
---|---|---|
committer | Thomas Gleixner <tglx@linutronix.de> | 2007-10-11 11:16:51 +0200 |
commit | 9702785a747aa27baf46ff504beab6528f21f2dd (patch) | |
tree | ab69d6f802f5b680c33999dc089e44982c74595d /arch/i386 | |
parent | 334e621a01f86d5bc25e4f742e1eaae6e2d2a97a (diff) | |
download | lwn-9702785a747aa27baf46ff504beab6528f21f2dd.tar.gz lwn-9702785a747aa27baf46ff504beab6528f21f2dd.zip |
i386: move xen
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/i386')
-rw-r--r-- | arch/i386/Kconfig | 2 | ||||
-rw-r--r-- | arch/i386/Makefile | 2 | ||||
-rw-r--r-- | arch/i386/kernel/head_32.S | 2 | ||||
-rw-r--r-- | arch/i386/kernel/vsyscall-note_32.S | 2 | ||||
-rw-r--r-- | arch/i386/xen/Kconfig | 11 | ||||
-rw-r--r-- | arch/i386/xen/Makefile | 4 | ||||
-rw-r--r-- | arch/i386/xen/enlighten.c | 1146 | ||||
-rw-r--r-- | arch/i386/xen/events.c | 591 | ||||
-rw-r--r-- | arch/i386/xen/features.c | 29 | ||||
-rw-r--r-- | arch/i386/xen/manage.c | 143 | ||||
-rw-r--r-- | arch/i386/xen/mmu.c | 567 | ||||
-rw-r--r-- | arch/i386/xen/mmu.h | 60 | ||||
-rw-r--r-- | arch/i386/xen/multicalls.c | 90 | ||||
-rw-r--r-- | arch/i386/xen/multicalls.h | 45 | ||||
-rw-r--r-- | arch/i386/xen/setup.c | 111 | ||||
-rw-r--r-- | arch/i386/xen/smp.c | 404 | ||||
-rw-r--r-- | arch/i386/xen/time.c | 593 | ||||
-rw-r--r-- | arch/i386/xen/vdso.h | 4 | ||||
-rw-r--r-- | arch/i386/xen/xen-asm.S | 291 | ||||
-rw-r--r-- | arch/i386/xen/xen-head.S | 38 | ||||
-rw-r--r-- | arch/i386/xen/xen-ops.h | 71 |
21 files changed, 4 insertions, 4202 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 561cc21914b2..0f6186f03034 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -226,7 +226,7 @@ config PARAVIRT However, when run without a hypervisor the kernel is theoretically slower. If in doubt, say N. -source "arch/i386/xen/Kconfig" +source "arch/x86/xen/Kconfig" config VMI bool "VMI Paravirt-ops support" diff --git a/arch/i386/Makefile b/arch/i386/Makefile index c462803a4db2..fc85d6674fcb 100644 --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -94,7 +94,7 @@ mcore-$(CONFIG_X86_ES7000) := arch/x86/mach-default core-$(CONFIG_X86_ES7000) := arch/x86/mach-es7000/ # Xen paravirtualization support -core-$(CONFIG_XEN) += arch/i386/xen/ +core-$(CONFIG_XEN) += arch/x86/xen/ # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default diff --git a/arch/i386/kernel/head_32.S b/arch/i386/kernel/head_32.S index 8f0382161c91..9150ca9b5f80 100644 --- a/arch/i386/kernel/head_32.S +++ b/arch/i386/kernel/head_32.S @@ -537,7 +537,7 @@ fault_msg: .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" .asciz "Stack: %p %p %p %p %p %p %p %p\n" -#include "../xen/xen-head.S" +#include "../../x86/xen/xen-head.S" /* * The IDT and GDT 'descriptors' are a strange 48-bit object diff --git a/arch/i386/kernel/vsyscall-note_32.S b/arch/i386/kernel/vsyscall-note_32.S index 07c0daf78237..fcf376a37f79 100644 --- a/arch/i386/kernel/vsyscall-note_32.S +++ b/arch/i386/kernel/vsyscall-note_32.S @@ -33,7 +33,7 @@ ELFNOTE_END * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. */ -#include "../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ +#include "../../x86/xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ .globl VDSO_NOTE_MASK ELFNOTE_START(GNU, 2, "a") diff --git a/arch/i386/xen/Kconfig b/arch/i386/xen/Kconfig deleted file mode 100644 index 9df99e1885a4..000000000000 --- a/arch/i386/xen/Kconfig +++ /dev/null @@ -1,11 +0,0 @@ -# -# This Kconfig describes xen options -# - -config XEN - bool "Enable support for Xen hypervisor" - depends on PARAVIRT && X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES - help - This is the Linux Xen port. Enabling this will allow the - kernel to boot in a paravirtualized environment under the - Xen hypervisor. diff --git a/arch/i386/xen/Makefile b/arch/i386/xen/Makefile deleted file mode 100644 index 343df246bd3e..000000000000 --- a/arch/i386/xen/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ - events.o time.o manage.o xen-asm.o - -obj-$(CONFIG_SMP) += smp.o diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c deleted file mode 100644 index f01bfcd4bdee..000000000000 --- a/arch/i386/xen/enlighten.c +++ /dev/null @@ -1,1146 +0,0 @@ -/* - * Core of Xen paravirt_ops implementation. - * - * This file contains the xen_paravirt_ops structure itself, and the - * implementations for: - * - privileged instructions - * - interrupt flags - * - segment operations - * - booting and setup - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/preempt.h> -#include <linux/hardirq.h> -#include <linux/percpu.h> -#include <linux/delay.h> -#include <linux/start_kernel.h> -#include <linux/sched.h> -#include <linux/bootmem.h> -#include <linux/module.h> -#include <linux/mm.h> -#include <linux/page-flags.h> -#include <linux/highmem.h> -#include <linux/smp.h> - -#include <xen/interface/xen.h> -#include <xen/interface/physdev.h> -#include <xen/interface/vcpu.h> -#include <xen/interface/sched.h> -#include <xen/features.h> -#include <xen/page.h> - -#include <asm/paravirt.h> -#include <asm/page.h> -#include <asm/xen/hypercall.h> -#include <asm/xen/hypervisor.h> -#include <asm/fixmap.h> -#include <asm/processor.h> -#include <asm/setup.h> -#include <asm/desc.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/reboot.h> - -#include "xen-ops.h" -#include "mmu.h" -#include "multicalls.h" - -EXPORT_SYMBOL_GPL(hypercall_page); - -DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); - -DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); -DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); -DEFINE_PER_CPU(unsigned long, xen_cr3); - -struct start_info *xen_start_info; -EXPORT_SYMBOL_GPL(xen_start_info); - -static /* __initdata */ struct shared_info dummy_shared_info; - -/* - * Point at some empty memory to start with. We map the real shared_info - * page as soon as fixmap is up and running. - */ -struct shared_info *HYPERVISOR_shared_info = (void *)&dummy_shared_info; - -/* - * Flag to determine whether vcpu info placement is available on all - * VCPUs. We assume it is to start with, and then set it to zero on - * the first failure. This is because it can succeed on some VCPUs - * and not others, since it can involve hypervisor memory allocation, - * or because the guest failed to guarantee all the appropriate - * constraints on all VCPUs (ie buffer can't cross a page boundary). - * - * Note that any particular CPU may be using a placed vcpu structure, - * but we can only optimise if the all are. - * - * 0: not available, 1: available - */ -static int have_vcpu_info_placement = 1; - -static void __init xen_vcpu_setup(int cpu) -{ - struct vcpu_register_vcpu_info info; - int err; - struct vcpu_info *vcpup; - - per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; - - if (!have_vcpu_info_placement) - return; /* already tested, not available */ - - vcpup = &per_cpu(xen_vcpu_info, cpu); - - info.mfn = virt_to_mfn(vcpup); - info.offset = offset_in_page(vcpup); - - printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %x, offset %d\n", - cpu, vcpup, info.mfn, info.offset); - - /* Check to see if the hypervisor will put the vcpu_info - structure where we want it, which allows direct access via - a percpu-variable. */ - err = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info); - - if (err) { - printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); - have_vcpu_info_placement = 0; - } else { - /* This cpu is using the registered vcpu info, even if - later ones fail to. */ - per_cpu(xen_vcpu, cpu) = vcpup; - - printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", - cpu, vcpup); - } -} - -static void __init xen_banner(void) -{ - printk(KERN_INFO "Booting paravirtualized kernel on %s\n", - paravirt_ops.name); - printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); -} - -static void xen_cpuid(unsigned int *eax, unsigned int *ebx, - unsigned int *ecx, unsigned int *edx) -{ - unsigned maskedx = ~0; - - /* - * Mask out inconvenient features, to try and disable as many - * unsupported kernel subsystems as possible. - */ - if (*eax == 1) - maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ - (1 << X86_FEATURE_ACPI) | /* disable ACPI */ - (1 << X86_FEATURE_ACC)); /* thermal monitoring */ - - asm(XEN_EMULATE_PREFIX "cpuid" - : "=a" (*eax), - "=b" (*ebx), - "=c" (*ecx), - "=d" (*edx) - : "0" (*eax), "2" (*ecx)); - *edx &= maskedx; -} - -static void xen_set_debugreg(int reg, unsigned long val) -{ - HYPERVISOR_set_debugreg(reg, val); -} - -static unsigned long xen_get_debugreg(int reg) -{ - return HYPERVISOR_get_debugreg(reg); -} - -static unsigned long xen_save_fl(void) -{ - struct vcpu_info *vcpu; - unsigned long flags; - - vcpu = x86_read_percpu(xen_vcpu); - - /* flag has opposite sense of mask */ - flags = !vcpu->evtchn_upcall_mask; - - /* convert to IF type flag - -0 -> 0x00000000 - -1 -> 0xffffffff - */ - return (-flags) & X86_EFLAGS_IF; -} - -static void xen_restore_fl(unsigned long flags) -{ - struct vcpu_info *vcpu; - - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); - - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - if (flags == 0) { - preempt_check_resched(); - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); - } -} - -static void xen_irq_disable(void) -{ - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; - preempt_enable_no_resched(); -} - -static void xen_irq_enable(void) -{ - struct vcpu_info *vcpu; - - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = 0; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); -} - -static void xen_safe_halt(void) -{ - /* Blocking includes an implicit local_irq_enable(). */ - if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) - BUG(); -} - -static void xen_halt(void) -{ - if (irqs_disabled()) - HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); - else - xen_safe_halt(); -} - -static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) -{ - BUG_ON(preemptible()); - - switch (mode) { - case PARAVIRT_LAZY_NONE: - BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE); - break; - - case PARAVIRT_LAZY_MMU: - case PARAVIRT_LAZY_CPU: - BUG_ON(x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE); - break; - - case PARAVIRT_LAZY_FLUSH: - /* flush if necessary, but don't change state */ - if (x86_read_percpu(xen_lazy_mode) != PARAVIRT_LAZY_NONE) - xen_mc_flush(); - return; - } - - xen_mc_flush(); - x86_write_percpu(xen_lazy_mode, mode); -} - -static unsigned long xen_store_tr(void) -{ - return 0; -} - -static void xen_set_ldt(const void *addr, unsigned entries) -{ - unsigned long linear_addr = (unsigned long)addr; - struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_SET_LDT; - if (linear_addr) { - /* ldt my be vmalloced, use arbitrary_virt_to_machine */ - xmaddr_t maddr; - maddr = arbitrary_virt_to_machine((unsigned long)addr); - linear_addr = (unsigned long)maddr.maddr; - } - op->arg1.linear_addr = linear_addr; - op->arg2.nr_ents = entries; - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - -static void xen_load_gdt(const struct Xgt_desc_struct *dtr) -{ - unsigned long *frames; - unsigned long va = dtr->address; - unsigned int size = dtr->size + 1; - unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; - int f; - struct multicall_space mcs; - - /* A GDT can be up to 64k in size, which corresponds to 8192 - 8-byte entries, or 16 4k pages.. */ - - BUG_ON(size > 65536); - BUG_ON(va & ~PAGE_MASK); - - mcs = xen_mc_entry(sizeof(*frames) * pages); - frames = mcs.args; - - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - frames[f] = virt_to_mfn(va); - make_lowmem_page_readonly((void *)va); - } - - MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - -static void load_TLS_descriptor(struct thread_struct *t, - unsigned int cpu, unsigned int i) -{ - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); - struct multicall_space mc = __xen_mc_entry(0); - - MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); -} - -static void xen_load_tls(struct thread_struct *t, unsigned int cpu) -{ - xen_mc_batch(); - - load_TLS_descriptor(t, cpu, 0); - load_TLS_descriptor(t, cpu, 1); - load_TLS_descriptor(t, cpu, 2); - - xen_mc_issue(PARAVIRT_LAZY_CPU); - - /* - * XXX sleazy hack: If we're being called in a lazy-cpu zone, - * it means we're in a context switch, and %gs has just been - * saved. This means we can zero it out to prevent faults on - * exit from the hypervisor if the next process has no %gs. - * Either way, it has been saved, and the new value will get - * loaded properly. This will go away as soon as Xen has been - * modified to not save/restore %gs for normal hypercalls. - */ - if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) - loadsegment(gs, 0); -} - -static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, - u32 low, u32 high) -{ - unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = virt_to_machine(lp); - u64 entry = (u64)high << 32 | low; - - preempt_disable(); - - xen_mc_flush(); - if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) - BUG(); - - preempt_enable(); -} - -static int cvt_gate_to_trap(int vector, u32 low, u32 high, - struct trap_info *info) -{ - u8 type, dpl; - - type = (high >> 8) & 0x1f; - dpl = (high >> 13) & 3; - - if (type != 0xf && type != 0xe) - return 0; - - info->vector = vector; - info->address = (high & 0xffff0000) | (low & 0x0000ffff); - info->cs = low >> 16; - info->flags = dpl; - /* interrupt gates clear IF */ - if (type == 0xe) - info->flags |= 4; - - return 1; -} - -/* Locations of each CPU's IDT */ -static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); - -/* Set an IDT entry. If the entry is part of the current IDT, then - also update Xen. */ -static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, - u32 low, u32 high) -{ - unsigned long p = (unsigned long)&dt[entrynum]; - unsigned long start, end; - - preempt_disable(); - - start = __get_cpu_var(idt_desc).address; - end = start + __get_cpu_var(idt_desc).size + 1; - - xen_mc_flush(); - - write_dt_entry(dt, entrynum, low, high); - - if (p >= start && (p + 8) <= end) { - struct trap_info info[2]; - - info[1].address = 0; - - if (cvt_gate_to_trap(entrynum, low, high, &info[0])) - if (HYPERVISOR_set_trap_table(info)) - BUG(); - } - - preempt_enable(); -} - -static void xen_convert_trap_info(const struct Xgt_desc_struct *desc, - struct trap_info *traps) -{ - unsigned in, out, count; - - count = (desc->size+1) / 8; - BUG_ON(count > 256); - - for (in = out = 0; in < count; in++) { - const u32 *entry = (u32 *)(desc->address + in * 8); - - if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) - out++; - } - traps[out].address = 0; -} - -void xen_copy_trap_info(struct trap_info *traps) -{ - const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc); - - xen_convert_trap_info(desc, traps); -} - -/* Load a new IDT into Xen. In principle this can be per-CPU, so we - hold a spinlock to protect the static traps[] array (static because - it avoids allocation, and saves stack space). */ -static void xen_load_idt(const struct Xgt_desc_struct *desc) -{ - static DEFINE_SPINLOCK(lock); - static struct trap_info traps[257]; - - spin_lock(&lock); - - __get_cpu_var(idt_desc) = *desc; - - xen_convert_trap_info(desc, traps); - - xen_mc_flush(); - if (HYPERVISOR_set_trap_table(traps)) - BUG(); - - spin_unlock(&lock); -} - -/* Write a GDT descriptor entry. Ignore LDT descriptors, since - they're handled differently. */ -static void xen_write_gdt_entry(struct desc_struct *dt, int entry, - u32 low, u32 high) -{ - preempt_disable(); - - switch ((high >> 8) & 0xff) { - case DESCTYPE_LDT: - case DESCTYPE_TSS: - /* ignore */ - break; - - default: { - xmaddr_t maddr = virt_to_machine(&dt[entry]); - u64 desc = (u64)high << 32 | low; - - xen_mc_flush(); - if (HYPERVISOR_update_descriptor(maddr.maddr, desc)) - BUG(); - } - - } - - preempt_enable(); -} - -static void xen_load_esp0(struct tss_struct *tss, - struct thread_struct *thread) -{ - struct multicall_space mcs = xen_mc_entry(0); - MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - -static void xen_set_iopl_mask(unsigned mask) -{ - struct physdev_set_iopl set_iopl; - - /* Force the change at ring 0. */ - set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; - HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); -} - -static void xen_io_delay(void) -{ -} - -#ifdef CONFIG_X86_LOCAL_APIC -static unsigned long xen_apic_read(unsigned long reg) -{ - return 0; -} - -static void xen_apic_write(unsigned long reg, unsigned long val) -{ - /* Warn to see if there's any stray references */ - WARN_ON(1); -} -#endif - -static void xen_flush_tlb(void) -{ - struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_TLB_FLUSH_LOCAL; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} - -static void xen_flush_tlb_single(unsigned long addr) -{ - struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_INVLPG_LOCAL; - op->arg1.linear_addr = addr & PAGE_MASK; - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} - -static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, - unsigned long va) -{ - struct { - struct mmuext_op op; - cpumask_t mask; - } *args; - cpumask_t cpumask = *cpus; - struct multicall_space mcs; - - /* - * A couple of (to be removed) sanity checks: - * - * - current CPU must not be in mask - * - mask must exist :) - */ - BUG_ON(cpus_empty(cpumask)); - BUG_ON(cpu_isset(smp_processor_id(), cpumask)); - BUG_ON(!mm); - - /* If a CPU which we ran on has gone down, OK. */ - cpus_and(cpumask, cpumask, cpu_online_map); - if (cpus_empty(cpumask)) - return; - - mcs = xen_mc_entry(sizeof(*args)); - args = mcs.args; - args->mask = cpumask; - args->op.arg2.vcpumask = &args->mask; - - if (va == TLB_FLUSH_ALL) { - args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; - } else { - args->op.cmd = MMUEXT_INVLPG_MULTI; - args->op.arg1.linear_addr = va; - } - - MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); -} - -static void xen_write_cr2(unsigned long cr2) -{ - x86_read_percpu(xen_vcpu)->arch.cr2 = cr2; -} - -static unsigned long xen_read_cr2(void) -{ - return x86_read_percpu(xen_vcpu)->arch.cr2; -} - -static unsigned long xen_read_cr2_direct(void) -{ - return x86_read_percpu(xen_vcpu_info.arch.cr2); -} - -static void xen_write_cr4(unsigned long cr4) -{ - /* Just ignore cr4 changes; Xen doesn't allow us to do - anything anyway. */ -} - -static unsigned long xen_read_cr3(void) -{ - return x86_read_percpu(xen_cr3); -} - -static void xen_write_cr3(unsigned long cr3) -{ - BUG_ON(preemptible()); - - if (cr3 == x86_read_percpu(xen_cr3)) { - /* just a simple tlb flush */ - xen_flush_tlb(); - return; - } - - x86_write_percpu(xen_cr3, cr3); - - - { - struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); - unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); - - op = mcs.args; - op->cmd = MMUEXT_NEW_BASEPTR; - op->arg1.mfn = mfn; - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_CPU); - } -} - -/* Early in boot, while setting up the initial pagetable, assume - everything is pinned. */ -static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) -{ - BUG_ON(mem_map); /* should only be used early */ - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); -} - -/* This needs to make sure the new pte page is pinned iff its being - attached to a pinned pagetable. */ -static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) -{ - struct page *page = pfn_to_page(pfn); - - if (PagePinned(virt_to_page(mm->pgd))) { - SetPagePinned(page); - - if (!PageHighMem(page)) - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); - else - /* make sure there are no stray mappings of - this page */ - kmap_flush_unused(); - } -} - -/* This should never happen until we're OK to use struct page */ -static void xen_release_pt(u32 pfn) -{ - struct page *page = pfn_to_page(pfn); - - if (PagePinned(page)) { - if (!PageHighMem(page)) - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); - } -} - -#ifdef CONFIG_HIGHPTE -static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) -{ - pgprot_t prot = PAGE_KERNEL; - - if (PagePinned(page)) - prot = PAGE_KERNEL_RO; - - if (0 && PageHighMem(page)) - printk("mapping highpte %lx type %d prot %s\n", - page_to_pfn(page), type, - (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); - - return kmap_atomic_prot(page, type, prot); -} -#endif - -static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) -{ - /* If there's an existing pte, then don't allow _PAGE_RW to be set */ - if (pte_val_ma(*ptep) & _PAGE_PRESENT) - pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & - pte_val_ma(pte)); - - return pte; -} - -/* Init-time set_pte while constructing initial pagetables, which - doesn't allow RO pagetable pages to be remapped RW */ -static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) -{ - pte = mask_rw_pte(ptep, pte); - - xen_set_pte(ptep, pte); -} - -static __init void xen_pagetable_setup_start(pgd_t *base) -{ - pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; - - /* special set_pte for pagetable initialization */ - paravirt_ops.set_pte = xen_set_pte_init; - - init_mm.pgd = base; - /* - * copy top-level of Xen-supplied pagetable into place. For - * !PAE we can use this as-is, but for PAE it is a stand-in - * while we copy the pmd pages. - */ - memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - - if (PTRS_PER_PMD > 1) { - int i; - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); - - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); - - make_lowmem_page_readonly(pmd); - - set_pgd(&base[i], __pgd(1 + __pa(pmd))); - } else - pgd_clear(&base[i]); - } - } - - /* make sure zero_page is mapped RO so we can use it in pagetables */ - make_lowmem_page_readonly(empty_zero_page); - make_lowmem_page_readonly(base); - /* - * Switch to new pagetable. This is done before - * pagetable_init has done anything so that the new pages - * added to the table can be prepared properly for Xen. - */ - xen_write_cr3(__pa(base)); -} - -static __init void xen_pagetable_setup_done(pgd_t *base) -{ - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - paravirt_ops.alloc_pt = xen_alloc_pt; - paravirt_ops.set_pte = xen_set_pte; - - if (!xen_feature(XENFEAT_auto_translated_physmap)) { - /* - * Create a mapping for the shared info page. - * Should be set_fixmap(), but shared_info is a machine - * address with no corresponding pseudo-phys address. - */ - set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP), - PFN_DOWN(xen_start_info->shared_info), - PAGE_KERNEL); - - HYPERVISOR_shared_info = - (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); - - } else - HYPERVISOR_shared_info = - (struct shared_info *)__va(xen_start_info->shared_info); - - /* Actually pin the pagetable down, but we can't set PG_pinned - yet because the page structures don't exist yet. */ - { - struct mmuext_op op; -#ifdef CONFIG_X86_PAE - op.cmd = MMUEXT_PIN_L3_TABLE; -#else - op.cmd = MMUEXT_PIN_L3_TABLE; -#endif - op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(base))); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); - } -} - -/* This is called once we have the cpu_possible_map */ -void __init xen_setup_vcpu_info_placement(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - xen_vcpu_setup(cpu); - - /* xen_vcpu_setup managed to place the vcpu_info within the - percpu area for all cpus, so make use of it */ - if (have_vcpu_info_placement) { - printk(KERN_INFO "Xen: using vcpu_info placement\n"); - - paravirt_ops.save_fl = xen_save_fl_direct; - paravirt_ops.restore_fl = xen_restore_fl_direct; - paravirt_ops.irq_disable = xen_irq_disable_direct; - paravirt_ops.irq_enable = xen_irq_enable_direct; - paravirt_ops.read_cr2 = xen_read_cr2_direct; - paravirt_ops.iret = xen_iret_direct; - } -} - -static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, - unsigned long addr, unsigned len) -{ - char *start, *end, *reloc; - unsigned ret; - - start = end = reloc = NULL; - -#define SITE(x) \ - case PARAVIRT_PATCH(x): \ - if (have_vcpu_info_placement) { \ - start = (char *)xen_##x##_direct; \ - end = xen_##x##_direct_end; \ - reloc = xen_##x##_direct_reloc; \ - } \ - goto patch_site - - switch (type) { - SITE(irq_enable); - SITE(irq_disable); - SITE(save_fl); - SITE(restore_fl); -#undef SITE - - patch_site: - if (start == NULL || (end-start) > len) - goto default_patch; - - ret = paravirt_patch_insns(insnbuf, len, start, end); - - /* Note: because reloc is assigned from something that - appears to be an array, gcc assumes it's non-null, - but doesn't know its relationship with start and - end. */ - if (reloc > start && reloc < end) { - int reloc_off = reloc - start; - long *relocp = (long *)(insnbuf + reloc_off); - long delta = start - (char *)addr; - - *relocp += delta; - } - break; - - default_patch: - default: - ret = paravirt_patch_default(type, clobbers, insnbuf, - addr, len); - break; - } - - return ret; -} - -static const struct paravirt_ops xen_paravirt_ops __initdata = { - .paravirt_enabled = 1, - .shared_kernel_pmd = 0, - - .name = "Xen", - .banner = xen_banner, - - .patch = xen_patch, - - .memory_setup = xen_memory_setup, - .arch_setup = xen_arch_setup, - .init_IRQ = xen_init_IRQ, - .post_allocator_init = xen_mark_init_mm_pinned, - - .time_init = xen_time_init, - .set_wallclock = xen_set_wallclock, - .get_wallclock = xen_get_wallclock, - .get_cpu_khz = xen_cpu_khz, - .sched_clock = xen_sched_clock, - - .cpuid = xen_cpuid, - - .set_debugreg = xen_set_debugreg, - .get_debugreg = xen_get_debugreg, - - .clts = native_clts, - - .read_cr0 = native_read_cr0, - .write_cr0 = native_write_cr0, - - .read_cr2 = xen_read_cr2, - .write_cr2 = xen_write_cr2, - - .read_cr3 = xen_read_cr3, - .write_cr3 = xen_write_cr3, - - .read_cr4 = native_read_cr4, - .read_cr4_safe = native_read_cr4_safe, - .write_cr4 = xen_write_cr4, - - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, - .safe_halt = xen_safe_halt, - .halt = xen_halt, - .wbinvd = native_wbinvd, - - .read_msr = native_read_msr_safe, - .write_msr = native_write_msr_safe, - .read_tsc = native_read_tsc, - .read_pmc = native_read_pmc, - - .iret = (void *)&hypercall_page[__HYPERVISOR_iret], - .irq_enable_sysexit = NULL, /* never called */ - - .load_tr_desc = paravirt_nop, - .set_ldt = xen_set_ldt, - .load_gdt = xen_load_gdt, - .load_idt = xen_load_idt, - .load_tls = xen_load_tls, - - .store_gdt = native_store_gdt, - .store_idt = native_store_idt, - .store_tr = xen_store_tr, - - .write_ldt_entry = xen_write_ldt_entry, - .write_gdt_entry = xen_write_gdt_entry, - .write_idt_entry = xen_write_idt_entry, - .load_esp0 = xen_load_esp0, - - .set_iopl_mask = xen_set_iopl_mask, - .io_delay = xen_io_delay, - -#ifdef CONFIG_X86_LOCAL_APIC - .apic_write = xen_apic_write, - .apic_write_atomic = xen_apic_write, - .apic_read = xen_apic_read, - .setup_boot_clock = paravirt_nop, - .setup_secondary_clock = paravirt_nop, - .startup_ipi_hook = paravirt_nop, -#endif - - .flush_tlb_user = xen_flush_tlb, - .flush_tlb_kernel = xen_flush_tlb, - .flush_tlb_single = xen_flush_tlb_single, - .flush_tlb_others = xen_flush_tlb_others, - - .pte_update = paravirt_nop, - .pte_update_defer = paravirt_nop, - - .pagetable_setup_start = xen_pagetable_setup_start, - .pagetable_setup_done = xen_pagetable_setup_done, - - .alloc_pt = xen_alloc_pt_init, - .release_pt = xen_release_pt, - .alloc_pd = paravirt_nop, - .alloc_pd_clone = paravirt_nop, - .release_pd = paravirt_nop, - -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = xen_kmap_atomic_pte, -#endif - - .set_pte = NULL, /* see xen_pagetable_setup_* */ - .set_pte_at = xen_set_pte_at, - .set_pmd = xen_set_pmd, - - .pte_val = xen_pte_val, - .pgd_val = xen_pgd_val, - - .make_pte = xen_make_pte, - .make_pgd = xen_make_pgd, - -#ifdef CONFIG_X86_PAE - .set_pte_atomic = xen_set_pte_atomic, - .set_pte_present = xen_set_pte_at, - .set_pud = xen_set_pud, - .pte_clear = xen_pte_clear, - .pmd_clear = xen_pmd_clear, - - .make_pmd = xen_make_pmd, - .pmd_val = xen_pmd_val, -#endif /* PAE */ - - .activate_mm = xen_activate_mm, - .dup_mmap = xen_dup_mmap, - .exit_mmap = xen_exit_mmap, - - .set_lazy_mode = xen_set_lazy_mode, -}; - -#ifdef CONFIG_SMP -static const struct smp_ops xen_smp_ops __initdata = { - .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, - .smp_prepare_cpus = xen_smp_prepare_cpus, - .cpu_up = xen_cpu_up, - .smp_cpus_done = xen_smp_cpus_done, - - .smp_send_stop = xen_smp_send_stop, - .smp_send_reschedule = xen_smp_send_reschedule, - .smp_call_function_mask = xen_smp_call_function_mask, -}; -#endif /* CONFIG_SMP */ - -static void xen_reboot(int reason) -{ -#ifdef CONFIG_SMP - smp_send_stop(); -#endif - - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, reason)) - BUG(); -} - -static void xen_restart(char *msg) -{ - xen_reboot(SHUTDOWN_reboot); -} - -static void xen_emergency_restart(void) -{ - xen_reboot(SHUTDOWN_reboot); -} - -static void xen_machine_halt(void) -{ - xen_reboot(SHUTDOWN_poweroff); -} - -static void xen_crash_shutdown(struct pt_regs *regs) -{ - xen_reboot(SHUTDOWN_crash); -} - -static const struct machine_ops __initdata xen_machine_ops = { - .restart = xen_restart, - .halt = xen_machine_halt, - .power_off = xen_machine_halt, - .shutdown = xen_machine_halt, - .crash_shutdown = xen_crash_shutdown, - .emergency_restart = xen_emergency_restart, -}; - - -/* First C function to be called on Xen boot */ -asmlinkage void __init xen_start_kernel(void) -{ - pgd_t *pgd; - - if (!xen_start_info) - return; - - BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); - - /* Install Xen paravirt ops */ - paravirt_ops = xen_paravirt_ops; - machine_ops = xen_machine_ops; - -#ifdef CONFIG_SMP - smp_ops = xen_smp_ops; -#endif - - xen_setup_features(); - - /* Get mfn list */ - if (!xen_feature(XENFEAT_auto_translated_physmap)) - phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; - - pgd = (pgd_t *)xen_start_info->pt_base; - - init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; - - init_mm.pgd = pgd; /* use the Xen pagetables to start */ - - /* keep using Xen gdt for now; no urgent need to change it */ - - x86_write_percpu(xen_cr3, __pa(pgd)); - -#ifdef CONFIG_SMP - /* Don't do the full vcpu_info placement stuff until we have a - possible map. */ - per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; -#else - /* May as well do it now, since there's no good time to call - it later on UP. */ - xen_setup_vcpu_info_placement(); -#endif - - paravirt_ops.kernel_rpl = 1; - if (xen_feature(XENFEAT_supervisor_mode_kernel)) - paravirt_ops.kernel_rpl = 0; - - /* set the limit of our address space */ - reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); - - /* set up basic CPUID stuff */ - cpu_detect(&new_cpu_data); - new_cpu_data.hard_math = 1; - new_cpu_data.x86_capability[0] = cpuid_edx(1); - - /* Poke various useful things into boot_params */ - LOADER_TYPE = (9 << 4) | 0; - INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; - INITRD_SIZE = xen_start_info->mod_len; - - /* Start the world */ - start_kernel(); -} diff --git a/arch/i386/xen/events.c b/arch/i386/xen/events.c deleted file mode 100644 index da1b173547a1..000000000000 --- a/arch/i386/xen/events.c +++ /dev/null @@ -1,591 +0,0 @@ -/* - * Xen event channels - * - * Xen models interrupts with abstract event channels. Because each - * domain gets 1024 event channels, but NR_IRQ is not that large, we - * must dynamically map irqs<->event channels. The event channels - * interface with the rest of the kernel by defining a xen interrupt - * chip. When an event is recieved, it is mapped to an irq and sent - * through the normal interrupt processing path. - * - * There are four kinds of events which can be mapped to an event - * channel: - * - * 1. Inter-domain notifications. This includes all the virtual - * device events, since they're driven by front-ends in another domain - * (typically dom0). - * 2. VIRQs, typically used for timers. These are per-cpu events. - * 3. IPIs. - * 4. Hardware interrupts. Not supported at present. - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ - -#include <linux/linkage.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/module.h> -#include <linux/string.h> - -#include <asm/ptrace.h> -#include <asm/irq.h> -#include <asm/sync_bitops.h> -#include <asm/xen/hypercall.h> -#include <asm/xen/hypervisor.h> - -#include <xen/events.h> -#include <xen/interface/xen.h> -#include <xen/interface/event_channel.h> - -#include "xen-ops.h" - -/* - * This lock protects updates to the following mapping and reference-count - * arrays. The lock does not need to be acquired to read the mapping tables. - */ -static DEFINE_SPINLOCK(irq_mapping_update_lock); - -/* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; - -/* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; - -/* Packed IRQ information: binding type, sub-type index, and event channel. */ -struct packed_irq -{ - unsigned short evtchn; - unsigned char index; - unsigned char type; -}; - -static struct packed_irq irq_info[NR_IRQS]; - -/* Binding types. */ -enum { - IRQT_UNBOUND, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* Convenient shorthand for packed representation of an unbound IRQ. */ -#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) - -static int evtchn_to_irq[NR_EVENT_CHANNELS] = { - [0 ... NR_EVENT_CHANNELS-1] = -1 -}; -static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; -static u8 cpu_evtchn[NR_EVENT_CHANNELS]; - -/* Reference counts for bindings to IRQs. */ -static int irq_bindcount[NR_IRQS]; - -/* Xen will never allocate port zero for any purpose. */ -#define VALID_EVTCHN(chn) ((chn) != 0) - -/* - * Force a proper event-channel callback from Xen after clearing the - * callback mask. We do this in a very simple manner, by making a call - * down into Xen. The pending flag will be checked by Xen on return. - */ -void force_evtchn_callback(void) -{ - (void)HYPERVISOR_xen_version(0, NULL); -} -EXPORT_SYMBOL_GPL(force_evtchn_callback); - -static struct irq_chip xen_dynamic_chip; - -/* Constructor for packed IRQ information. */ -static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) -{ - return (struct packed_irq) { evtchn, index, type }; -} - -/* - * Accessors for packed IRQ information. - */ -static inline unsigned int evtchn_from_irq(int irq) -{ - return irq_info[irq].evtchn; -} - -static inline unsigned int index_from_irq(int irq) -{ - return irq_info[irq].index; -} - -static inline unsigned int type_from_irq(int irq) -{ - return irq_info[irq].type; -} - -static inline unsigned long active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return (sh->evtchn_pending[idx] & - cpu_evtchn_mask[cpu][idx] & - ~sh->evtchn_mask[idx]); -} - -static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) -{ - int irq = evtchn_to_irq[chn]; - - BUG_ON(irq == -1); -#ifdef CONFIG_SMP - irq_desc[irq].affinity = cpumask_of_cpu(cpu); -#endif - - __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); - __set_bit(chn, cpu_evtchn_mask[cpu]); - - cpu_evtchn[chn] = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ -#ifdef CONFIG_SMP - int i; - /* By default all event channels notify CPU#0. */ - for (i = 0; i < NR_IRQS; i++) - irq_desc[i].affinity = cpumask_of_cpu(0); -#endif - - memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); - memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); -} - -static inline unsigned int cpu_from_evtchn(unsigned int evtchn) -{ - return cpu_evtchn[evtchn]; -} - -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, &s->evtchn_pending[0]); -} - -static inline void set_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_pending[0]); -} - - -/** - * notify_remote_via_irq - send event to remote end of event channel via irq - * @irq: irq of event channel to send event to - * - * Unlike notify_remote_via_evtchn(), this is safe to use across - * save/restore. Notifications on a broken connection are silently - * dropped. - */ -void notify_remote_via_irq(int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - notify_remote_via_evtchn(evtchn); -} -EXPORT_SYMBOL_GPL(notify_remote_via_irq); - -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_mask[0]); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - - BUG_ON(!irqs_disabled()); - - /* Slow path (hypercall) if this is a non-local port. */ - if (unlikely(cpu != cpu_from_evtchn(port))) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - - sync_clear_bit(port, &s->evtchn_mask[0]); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (sync_test_bit(port, &s->evtchn_pending[0]) && - !sync_test_and_set_bit(port / BITS_PER_LONG, - &vcpu_info->evtchn_pending_sel)) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - -static int find_unbound_irq(void) -{ - int irq; - - /* Only allocate from dynirq range */ - for (irq = 0; irq < NR_IRQS; irq++) - if (irq_bindcount[irq] == 0) - break; - - if (irq == NR_IRQS) - panic("No available IRQ to bind to: increase NR_IRQS!\n"); - - return irq; -} - -int bind_evtchn_to_irq(unsigned int evtchn) -{ - int irq; - - spin_lock(&irq_mapping_update_lock); - - irq = evtchn_to_irq[evtchn]; - - if (irq == -1) { - irq = find_unbound_irq(); - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "event"); - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); - } - - irq_bindcount[irq]++; - - spin_unlock(&irq_mapping_update_lock); - - return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); - -static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) -{ - struct evtchn_bind_ipi bind_ipi; - int evtchn, irq; - - spin_lock(&irq_mapping_update_lock); - - irq = per_cpu(ipi_to_irq, cpu)[ipi]; - if (irq == -1) { - irq = find_unbound_irq(); - if (irq < 0) - goto out; - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "ipi"); - - bind_ipi.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, - &bind_ipi) != 0) - BUG(); - evtchn = bind_ipi.port; - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); - - per_cpu(ipi_to_irq, cpu)[ipi] = irq; - - bind_evtchn_to_cpu(evtchn, cpu); - } - - irq_bindcount[irq]++; - - out: - spin_unlock(&irq_mapping_update_lock); - return irq; -} - - -static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) -{ - struct evtchn_bind_virq bind_virq; - int evtchn, irq; - - spin_lock(&irq_mapping_update_lock); - - irq = per_cpu(virq_to_irq, cpu)[virq]; - - if (irq == -1) { - bind_virq.virq = virq; - bind_virq.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, - &bind_virq) != 0) - BUG(); - evtchn = bind_virq.port; - - irq = find_unbound_irq(); - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "virq"); - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); - - per_cpu(virq_to_irq, cpu)[virq] = irq; - - bind_evtchn_to_cpu(evtchn, cpu); - } - - irq_bindcount[irq]++; - - spin_unlock(&irq_mapping_update_lock); - - return irq; -} - -static void unbind_from_irq(unsigned int irq) -{ - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - - spin_lock(&irq_mapping_update_lock); - - if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [index_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to VCPU0. */ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - irq_info[irq] = IRQ_UNBOUND; - - dynamic_irq_init(irq); - } - - spin_unlock(&irq_mapping_update_lock); -} - -int bind_evtchn_to_irqhandler(unsigned int evtchn, - irqreturn_t (*handler)(int, void *), - unsigned long irqflags, - const char *devname, void *dev_id) -{ - unsigned int irq; - int retval; - - irq = bind_evtchn_to_irq(evtchn); - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); - -int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, - irqreturn_t (*handler)(int, void *), - unsigned long irqflags, const char *devname, void *dev_id) -{ - unsigned int irq; - int retval; - - irq = bind_virq_to_irq(virq, cpu); - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); - -int bind_ipi_to_irqhandler(enum ipi_vector ipi, - unsigned int cpu, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, - void *dev_id) -{ - int irq, retval; - - irq = bind_ipi_to_irq(ipi, cpu); - if (irq < 0) - return irq; - - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} - -void unbind_from_irqhandler(unsigned int irq, void *dev_id) -{ - free_irq(irq, dev_id); - unbind_from_irq(irq); -} -EXPORT_SYMBOL_GPL(unbind_from_irqhandler); - -void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) -{ - int irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); -} - - -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. - */ -fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) -{ - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; - struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - unsigned long pending_words; - - vcpu_info->evtchn_upcall_pending = 0; - - /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ - pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - while (pending_words != 0) { - unsigned long pending_bits; - int word_idx = __ffs(pending_words); - pending_words &= ~(1UL << word_idx); - - while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { - int bit_idx = __ffs(pending_bits); - int port = (word_idx * BITS_PER_LONG) + bit_idx; - int irq = evtchn_to_irq[port]; - - if (irq != -1) { - regs->orig_eax = ~irq; - do_IRQ(regs); - } - } - } - - put_cpu(); -} - -/* Rebind an evtchn so that it gets delivered to a specific cpu */ -static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) -{ - struct evtchn_bind_vcpu bind_vcpu; - int evtchn = evtchn_from_irq(irq); - - if (!VALID_EVTCHN(evtchn)) - return; - - /* Send future instances of this interrupt to other vcpu. */ - bind_vcpu.port = evtchn; - bind_vcpu.vcpu = tcpu; - - /* - * If this fails, it usually just indicates that we're dealing with a - * virq or IPI channel, which don't actually need to be rebound. Ignore - * it, but don't do the xenlinux-level rebind in that case. - */ - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) - bind_evtchn_to_cpu(evtchn, tcpu); -} - - -static void set_affinity_irq(unsigned irq, cpumask_t dest) -{ - unsigned tcpu = first_cpu(dest); - rebind_irq_to_cpu(irq, tcpu); -} - -static void enable_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - unmask_evtchn(evtchn); -} - -static void disable_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - mask_evtchn(evtchn); -} - -static void ack_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - move_native_irq(irq); - - if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); -} - -static int retrigger_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - int ret = 0; - - if (VALID_EVTCHN(evtchn)) { - set_evtchn(evtchn); - ret = 1; - } - - return ret; -} - -static struct irq_chip xen_dynamic_chip __read_mostly = { - .name = "xen-dyn", - .mask = disable_dynirq, - .unmask = enable_dynirq, - .ack = ack_dynirq, - .set_affinity = set_affinity_irq, - .retrigger = retrigger_dynirq, -}; - -void __init xen_init_IRQ(void) -{ - int i; - - init_evtchn_cpu_bindings(); - - /* No event channels are 'live' right now. */ - for (i = 0; i < NR_EVENT_CHANNELS; i++) - mask_evtchn(i); - - /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ - for (i = 0; i < NR_IRQS; i++) - irq_bindcount[i] = 0; - - irq_ctx_init(smp_processor_id()); -} diff --git a/arch/i386/xen/features.c b/arch/i386/xen/features.c deleted file mode 100644 index 0707714e40d6..000000000000 --- a/arch/i386/xen/features.c +++ /dev/null @@ -1,29 +0,0 @@ -/****************************************************************************** - * features.c - * - * Xen feature flags. - * - * Copyright (c) 2006, Ian Campbell, XenSource Inc. - */ -#include <linux/types.h> -#include <linux/cache.h> -#include <linux/module.h> -#include <asm/xen/hypervisor.h> -#include <xen/features.h> - -u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; -EXPORT_SYMBOL_GPL(xen_features); - -void xen_setup_features(void) -{ - struct xen_feature_info fi; - int i, j; - - for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { - fi.submap_idx = i; - if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) - break; - for (j = 0; j < 32; j++) - xen_features[i * 32 + j] = !!(fi.submap & 1<<j); - } -} diff --git a/arch/i386/xen/manage.c b/arch/i386/xen/manage.c deleted file mode 100644 index aa7af9e6abc0..000000000000 --- a/arch/i386/xen/manage.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Handle extern requests for shutdown, reboot and sysrq - */ -#include <linux/kernel.h> -#include <linux/err.h> -#include <linux/reboot.h> -#include <linux/sysrq.h> - -#include <xen/xenbus.h> - -#define SHUTDOWN_INVALID -1 -#define SHUTDOWN_POWEROFF 0 -#define SHUTDOWN_SUSPEND 2 -/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only - * report a crash, not be instructed to crash! - * HALT is the same as POWEROFF, as far as we're concerned. The tools use - * the distinction when we return the reason code to them. - */ -#define SHUTDOWN_HALT 4 - -/* Ignore multiple shutdown requests. */ -static int shutting_down = SHUTDOWN_INVALID; - -static void shutdown_handler(struct xenbus_watch *watch, - const char **vec, unsigned int len) -{ - char *str; - struct xenbus_transaction xbt; - int err; - - if (shutting_down != SHUTDOWN_INVALID) - return; - - again: - err = xenbus_transaction_start(&xbt); - if (err) - return; - - str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); - /* Ignore read errors and empty reads. */ - if (XENBUS_IS_ERR_READ(str)) { - xenbus_transaction_end(xbt, 1); - return; - } - - xenbus_write(xbt, "control", "shutdown", ""); - - err = xenbus_transaction_end(xbt, 0); - if (err == -EAGAIN) { - kfree(str); - goto again; - } - - if (strcmp(str, "poweroff") == 0 || - strcmp(str, "halt") == 0) - orderly_poweroff(false); - else if (strcmp(str, "reboot") == 0) - ctrl_alt_del(); - else { - printk(KERN_INFO "Ignoring shutdown request: %s\n", str); - shutting_down = SHUTDOWN_INVALID; - } - - kfree(str); -} - -static void sysrq_handler(struct xenbus_watch *watch, const char **vec, - unsigned int len) -{ - char sysrq_key = '\0'; - struct xenbus_transaction xbt; - int err; - - again: - err = xenbus_transaction_start(&xbt); - if (err) - return; - if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { - printk(KERN_ERR "Unable to read sysrq code in " - "control/sysrq\n"); - xenbus_transaction_end(xbt, 1); - return; - } - - if (sysrq_key != '\0') - xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); - - err = xenbus_transaction_end(xbt, 0); - if (err == -EAGAIN) - goto again; - - if (sysrq_key != '\0') - handle_sysrq(sysrq_key, NULL); -} - -static struct xenbus_watch shutdown_watch = { - .node = "control/shutdown", - .callback = shutdown_handler -}; - -static struct xenbus_watch sysrq_watch = { - .node = "control/sysrq", - .callback = sysrq_handler -}; - -static int setup_shutdown_watcher(void) -{ - int err; - - err = register_xenbus_watch(&shutdown_watch); - if (err) { - printk(KERN_ERR "Failed to set shutdown watcher\n"); - return err; - } - - err = register_xenbus_watch(&sysrq_watch); - if (err) { - printk(KERN_ERR "Failed to set sysrq watcher\n"); - return err; - } - - return 0; -} - -static int shutdown_event(struct notifier_block *notifier, - unsigned long event, - void *data) -{ - setup_shutdown_watcher(); - return NOTIFY_DONE; -} - -static int __init setup_shutdown_event(void) -{ - static struct notifier_block xenstore_notifier = { - .notifier_call = shutdown_event - }; - register_xenstore_notifier(&xenstore_notifier); - - return 0; -} - -subsys_initcall(setup_shutdown_event); diff --git a/arch/i386/xen/mmu.c b/arch/i386/xen/mmu.c deleted file mode 100644 index 874db0cd1d2a..000000000000 --- a/arch/i386/xen/mmu.c +++ /dev/null @@ -1,567 +0,0 @@ -/* - * Xen mmu operations - * - * This file contains the various mmu fetch and update operations. - * The most important job they must perform is the mapping between the - * domain's pfn and the overall machine mfns. - * - * Xen allows guests to directly update the pagetable, in a controlled - * fashion. In other words, the guest modifies the same pagetable - * that the CPU actually uses, which eliminates the overhead of having - * a separate shadow pagetable. - * - * In order to allow this, it falls on the guest domain to map its - * notion of a "physical" pfn - which is just a domain-local linear - * address - into a real "machine address" which the CPU's MMU can - * use. - * - * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be - * inserted directly into the pagetable. When creating a new - * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, - * when reading the content back with __(pgd|pmd|pte)_val, it converts - * the mfn back into a pfn. - * - * The other constraint is that all pages which make up a pagetable - * must be mapped read-only in the guest. This prevents uncontrolled - * guest updates to the pagetable. Xen strictly enforces this, and - * will disallow any pagetable update which will end up mapping a - * pagetable page RW, and will disallow using any writable page as a - * pagetable. - * - * Naively, when loading %cr3 with the base of a new pagetable, Xen - * would need to validate the whole pagetable before going on. - * Naturally, this is quite slow. The solution is to "pin" a - * pagetable, which enforces all the constraints on the pagetable even - * when it is not actively in use. This menas that Xen can be assured - * that it is still valid when you do load it into %cr3, and doesn't - * need to revalidate it. - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ -#include <linux/sched.h> -#include <linux/highmem.h> -#include <linux/bug.h> -#include <linux/sched.h> - -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -#include <asm/paravirt.h> - -#include <asm/xen/hypercall.h> -#include <asm/xen/hypervisor.h> - -#include <xen/page.h> -#include <xen/interface/xen.h> - -#include "multicalls.h" -#include "mmu.h" - -xmaddr_t arbitrary_virt_to_machine(unsigned long address) -{ - pte_t *pte = lookup_address(address); - unsigned offset = address & PAGE_MASK; - - BUG_ON(pte == NULL); - - return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); -} - -void make_lowmem_page_readonly(void *vaddr) -{ - pte_t *pte, ptev; - unsigned long address = (unsigned long)vaddr; - - pte = lookup_address(address); - BUG_ON(pte == NULL); - - ptev = pte_wrprotect(*pte); - - if (HYPERVISOR_update_va_mapping(address, ptev, 0)) - BUG(); -} - -void make_lowmem_page_readwrite(void *vaddr) -{ - pte_t *pte, ptev; - unsigned long address = (unsigned long)vaddr; - - pte = lookup_address(address); - BUG_ON(pte == NULL); - - ptev = pte_mkwrite(*pte); - - if (HYPERVISOR_update_va_mapping(address, ptev, 0)) - BUG(); -} - - -void xen_set_pmd(pmd_t *ptr, pmd_t val) -{ - struct multicall_space mcs; - struct mmu_update *u; - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*u)); - u = mcs.args; - u->ptr = virt_to_machine(ptr).maddr; - u->val = pmd_val_ma(val); - MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -/* - * Associate a virtual page frame with a given physical page frame - * and protection flags for that frame. - */ -void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - BUG(); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - BUG(); - return; - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - BUG(); - return; - } - pte = pte_offset_kernel(pmd, vaddr); - /* <mfn,flags> stored as-is, to permit clearing entries */ - xen_set_pte(pte, mfn_pte(mfn, flags)); - - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); -} - -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - if (mm == current->mm || mm == &init_mm) { - if (xen_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - struct multicall_space mcs; - mcs = xen_mc_entry(0); - - MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); - xen_mc_issue(PARAVIRT_LAZY_MMU); - return; - } else - if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) - return; - } - xen_set_pte(ptep, pteval); -} - -#ifdef CONFIG_X86_PAE -void xen_set_pud(pud_t *ptr, pud_t val) -{ - struct multicall_space mcs; - struct mmu_update *u; - - preempt_disable(); - - mcs = xen_mc_entry(sizeof(*u)); - u = mcs.args; - u->ptr = virt_to_machine(ptr).maddr; - u->val = pud_val_ma(val); - MULTI_mmu_update(mcs.mc, u, 1, NULL, DOMID_SELF); - - xen_mc_issue(PARAVIRT_LAZY_MMU); - - preempt_enable(); -} - -void xen_set_pte(pte_t *ptep, pte_t pte) -{ - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; -} - -void xen_set_pte_atomic(pte_t *ptep, pte_t pte) -{ - set_64bit((u64 *)ptep, pte_val_ma(pte)); -} - -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) -{ - ptep->pte_low = 0; - smp_wmb(); /* make sure low gets written first */ - ptep->pte_high = 0; -} - -void xen_pmd_clear(pmd_t *pmdp) -{ - xen_set_pmd(pmdp, __pmd(0)); -} - -unsigned long long xen_pte_val(pte_t pte) -{ - unsigned long long ret = 0; - - if (pte.pte_low) { - ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - } - - return ret; -} - -unsigned long long xen_pmd_val(pmd_t pmd) -{ - unsigned long long ret = pmd.pmd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - return ret; -} - -unsigned long long xen_pgd_val(pgd_t pgd) -{ - unsigned long long ret = pgd.pgd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - return ret; -} - -pte_t xen_make_pte(unsigned long long pte) -{ - if (pte & 1) - pte = phys_to_machine(XPADDR(pte)).maddr; - - return (pte_t){ pte, pte >> 32 }; -} - -pmd_t xen_make_pmd(unsigned long long pmd) -{ - if (pmd & 1) - pmd = phys_to_machine(XPADDR(pmd)).maddr; - - return (pmd_t){ pmd }; -} - -pgd_t xen_make_pgd(unsigned long long pgd) -{ - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; -} -#else /* !PAE */ -void xen_set_pte(pte_t *ptep, pte_t pte) -{ - *ptep = pte; -} - -unsigned long xen_pte_val(pte_t pte) -{ - unsigned long ret = pte.pte_low; - - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr; - - return ret; -} - -unsigned long xen_pgd_val(pgd_t pgd) -{ - unsigned long ret = pgd.pgd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - return ret; -} - -pte_t xen_make_pte(unsigned long pte) -{ - if (pte & _PAGE_PRESENT) - pte = phys_to_machine(XPADDR(pte)).maddr; - - return (pte_t){ pte }; -} - -pgd_t xen_make_pgd(unsigned long pgd) -{ - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; -} -#endif /* CONFIG_X86_PAE */ - - - -/* - (Yet another) pagetable walker. This one is intended for pinning a - pagetable. This means that it walks a pagetable and calls the - callback function on each page it finds making up the page table, - at every level. It walks the entire pagetable, but it only bothers - pinning pte pages which are below pte_limit. In the normal case - this will be TASK_SIZE, but at boot we need to pin up to - FIXADDR_TOP. But the important bit is that we don't pin beyond - there, because then we start getting into Xen's ptes. -*/ -static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, unsigned), - unsigned long limit) -{ - pgd_t *pgd = pgd_base; - int flush = 0; - unsigned long addr = 0; - unsigned long pgd_next; - - BUG_ON(limit > FIXADDR_TOP); - - if (xen_feature(XENFEAT_auto_translated_physmap)) - return 0; - - for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { - pud_t *pud; - unsigned long pud_limit, pud_next; - - pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); - - if (!pgd_val(*pgd)) - continue; - - pud = pud_offset(pgd, 0); - - if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pud), 0); - - for (; addr != pud_limit; pud++, addr = pud_next) { - pmd_t *pmd; - unsigned long pmd_limit; - - pud_next = pud_addr_end(addr, pud_limit); - - if (pud_next < limit) - pmd_limit = pud_next; - else - pmd_limit = limit; - - if (pud_none(*pud)) - continue; - - pmd = pmd_offset(pud, 0); - - if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pmd), 0); - - for (; addr != pmd_limit; pmd++) { - addr += (PAGE_SIZE * PTRS_PER_PTE); - if ((pmd_limit-1) < (addr-1)) { - addr = pmd_limit; - break; - } - - if (pmd_none(*pmd)) - continue; - - flush |= (*func)(pmd_page(*pmd), 0); - } - } - } - - flush |= (*func)(virt_to_page(pgd_base), UVMF_TLB_FLUSH); - - return flush; -} - -static int pin_page(struct page *page, unsigned flags) -{ - unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); - int flush; - - if (pgfl) - flush = 0; /* already pinned */ - else if (PageHighMem(page)) - /* kmaps need flushing if we found an unpinned - highpage */ - flush = 1; - else { - void *pt = lowmem_page_address(page); - unsigned long pfn = page_to_pfn(page); - struct multicall_space mcs = __xen_mc_entry(0); - - flush = 0; - - MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, - pfn_pte(pfn, PAGE_KERNEL_RO), - flags); - } - - return flush; -} - -/* This is called just after a mm has been created, but it has not - been used yet. We need to make sure that its pagetable is all - read-only, and can be pinned. */ -void xen_pgd_pin(pgd_t *pgd) -{ - struct multicall_space mcs; - struct mmuext_op *op; - - xen_mc_batch(); - - if (pgd_walk(pgd, pin_page, TASK_SIZE)) { - /* re-enable interrupts for kmap_flush_unused */ - xen_mc_issue(0); - kmap_flush_unused(); - xen_mc_batch(); - } - - mcs = __xen_mc_entry(sizeof(*op)); - op = mcs.args; - -#ifdef CONFIG_X86_PAE - op->cmd = MMUEXT_PIN_L3_TABLE; -#else - op->cmd = MMUEXT_PIN_L2_TABLE; -#endif - op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - xen_mc_issue(0); -} - -/* The init_mm pagetable is really pinned as soon as its created, but - that's before we have page structures to store the bits. So do all - the book-keeping now. */ -static __init int mark_pinned(struct page *page, unsigned flags) -{ - SetPagePinned(page); - return 0; -} - -void __init xen_mark_init_mm_pinned(void) -{ - pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); -} - -static int unpin_page(struct page *page, unsigned flags) -{ - unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); - - if (pgfl && !PageHighMem(page)) { - void *pt = lowmem_page_address(page); - unsigned long pfn = page_to_pfn(page); - struct multicall_space mcs = __xen_mc_entry(0); - - MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, - pfn_pte(pfn, PAGE_KERNEL), - flags); - } - - return 0; /* never need to flush on unpin */ -} - -/* Release a pagetables pages back as normal RW */ -static void xen_pgd_unpin(pgd_t *pgd) -{ - struct mmuext_op *op; - struct multicall_space mcs; - - xen_mc_batch(); - - mcs = __xen_mc_entry(sizeof(*op)); - - op = mcs.args; - op->cmd = MMUEXT_UNPIN_TABLE; - op->arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); - - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - - pgd_walk(pgd, unpin_page, TASK_SIZE); - - xen_mc_issue(0); -} - -void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) -{ - spin_lock(&next->page_table_lock); - xen_pgd_pin(next->pgd); - spin_unlock(&next->page_table_lock); -} - -void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) -{ - spin_lock(&mm->page_table_lock); - xen_pgd_pin(mm->pgd); - spin_unlock(&mm->page_table_lock); -} - - -#ifdef CONFIG_SMP -/* Another cpu may still have their %cr3 pointing at the pagetable, so - we need to repoint it somewhere else before we can unpin it. */ -static void drop_other_mm_ref(void *info) -{ - struct mm_struct *mm = info; - - if (__get_cpu_var(cpu_tlbstate).active_mm == mm) - leave_mm(smp_processor_id()); -} - -static void drop_mm_ref(struct mm_struct *mm) -{ - if (current->active_mm == mm) { - if (current->mm == mm) - load_cr3(swapper_pg_dir); - else - leave_mm(smp_processor_id()); - } - - if (!cpus_empty(mm->cpu_vm_mask)) - xen_smp_call_function_mask(mm->cpu_vm_mask, drop_other_mm_ref, - mm, 1); -} -#else -static void drop_mm_ref(struct mm_struct *mm) -{ - if (current->active_mm == mm) - load_cr3(swapper_pg_dir); -} -#endif - -/* - * While a process runs, Xen pins its pagetables, which means that the - * hypervisor forces it to be read-only, and it controls all updates - * to it. This means that all pagetable updates have to go via the - * hypervisor, which is moderately expensive. - * - * Since we're pulling the pagetable down, we switch to use init_mm, - * unpin old process pagetable and mark it all read-write, which - * allows further operations on it to be simple memory accesses. - * - * The only subtle point is that another CPU may be still using the - * pagetable because of lazy tlb flushing. This means we need need to - * switch all CPUs off this pagetable before we can unpin it. - */ -void xen_exit_mmap(struct mm_struct *mm) -{ - get_cpu(); /* make sure we don't move around */ - drop_mm_ref(mm); - put_cpu(); - - spin_lock(&mm->page_table_lock); - - /* pgd may not be pinned in the error exit path of execve */ - if (PagePinned(virt_to_page(mm->pgd))) - xen_pgd_unpin(mm->pgd); - spin_unlock(&mm->page_table_lock); -} diff --git a/arch/i386/xen/mmu.h b/arch/i386/xen/mmu.h deleted file mode 100644 index c9ff27f3ac3a..000000000000 --- a/arch/i386/xen/mmu.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef _XEN_MMU_H - -#include <linux/linkage.h> -#include <asm/page.h> - -/* - * Page-directory addresses above 4GB do not fit into architectural %cr3. - * When accessing %cr3, or equivalent field in vcpu_guest_context, guests - * must use the following accessor macros to pack/unpack valid MFNs. - * - * Note that Xen is using the fact that the pagetable base is always - * page-aligned, and putting the 12 MSB of the address into the 12 LSB - * of cr3. - */ -#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) -#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) - - -void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -void xen_set_pte(pte_t *ptep, pte_t pteval); -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); -void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); - -void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); -void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); -void xen_exit_mmap(struct mm_struct *mm); - -void xen_pgd_pin(pgd_t *pgd); -//void xen_pgd_unpin(pgd_t *pgd); - -#ifdef CONFIG_X86_PAE -unsigned long long xen_pte_val(pte_t); -unsigned long long xen_pmd_val(pmd_t); -unsigned long long xen_pgd_val(pgd_t); - -pte_t xen_make_pte(unsigned long long); -pmd_t xen_make_pmd(unsigned long long); -pgd_t xen_make_pgd(unsigned long long); - -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); -void xen_set_pte_atomic(pte_t *ptep, pte_t pte); -void xen_set_pud(pud_t *ptr, pud_t val); -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void xen_pmd_clear(pmd_t *pmdp); - - -#else -unsigned long xen_pte_val(pte_t); -unsigned long xen_pmd_val(pmd_t); -unsigned long xen_pgd_val(pgd_t); - -pte_t xen_make_pte(unsigned long); -pmd_t xen_make_pmd(unsigned long); -pgd_t xen_make_pgd(unsigned long); -#endif - -#endif /* _XEN_MMU_H */ diff --git a/arch/i386/xen/multicalls.c b/arch/i386/xen/multicalls.c deleted file mode 100644 index c837e8e463db..000000000000 --- a/arch/i386/xen/multicalls.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Xen hypercall batching. - * - * Xen allows multiple hypercalls to be issued at once, using the - * multicall interface. This allows the cost of trapping into the - * hypervisor to be amortized over several calls. - * - * This file implements a simple interface for multicalls. There's a - * per-cpu buffer of outstanding multicalls. When you want to queue a - * multicall for issuing, you can allocate a multicall slot for the - * call and its arguments, along with storage for space which is - * pointed to by the arguments (for passing pointers to structures, - * etc). When the multicall is actually issued, all the space for the - * commands and allocated memory is freed for reuse. - * - * Multicalls are flushed whenever any of the buffers get full, or - * when explicitly requested. There's no way to get per-multicall - * return results back. It will BUG if any of the multicalls fail. - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ -#include <linux/percpu.h> -#include <linux/hardirq.h> - -#include <asm/xen/hypercall.h> - -#include "multicalls.h" - -#define MC_BATCH 32 -#define MC_ARGS (MC_BATCH * 16 / sizeof(u64)) - -struct mc_buffer { - struct multicall_entry entries[MC_BATCH]; - u64 args[MC_ARGS]; - unsigned mcidx, argidx; -}; - -static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); -DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); - -void xen_mc_flush(void) -{ - struct mc_buffer *b = &__get_cpu_var(mc_buffer); - int ret = 0; - unsigned long flags; - - BUG_ON(preemptible()); - - /* Disable interrupts in case someone comes in and queues - something in the middle */ - local_irq_save(flags); - - if (b->mcidx) { - int i; - - if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) - BUG(); - for (i = 0; i < b->mcidx; i++) - if (b->entries[i].result < 0) - ret++; - b->mcidx = 0; - b->argidx = 0; - } else - BUG_ON(b->argidx != 0); - - local_irq_restore(flags); - - BUG_ON(ret); -} - -struct multicall_space __xen_mc_entry(size_t args) -{ - struct mc_buffer *b = &__get_cpu_var(mc_buffer); - struct multicall_space ret; - unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); - - BUG_ON(preemptible()); - BUG_ON(argspace > MC_ARGS); - - if (b->mcidx == MC_BATCH || - (b->argidx + argspace) > MC_ARGS) - xen_mc_flush(); - - ret.mc = &b->entries[b->mcidx]; - b->mcidx++; - ret.args = &b->args[b->argidx]; - b->argidx += argspace; - - return ret; -} diff --git a/arch/i386/xen/multicalls.h b/arch/i386/xen/multicalls.h deleted file mode 100644 index e6f7530b156c..000000000000 --- a/arch/i386/xen/multicalls.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef _XEN_MULTICALLS_H -#define _XEN_MULTICALLS_H - -#include "xen-ops.h" - -/* Multicalls */ -struct multicall_space -{ - struct multicall_entry *mc; - void *args; -}; - -/* Allocate room for a multicall and its args */ -struct multicall_space __xen_mc_entry(size_t args); - -DECLARE_PER_CPU(unsigned long, xen_mc_irq_flags); - -/* Call to start a batch of multiple __xen_mc_entry()s. Must be - paired with xen_mc_issue() */ -static inline void xen_mc_batch(void) -{ - /* need to disable interrupts until this entry is complete */ - local_irq_save(__get_cpu_var(xen_mc_irq_flags)); -} - -static inline struct multicall_space xen_mc_entry(size_t args) -{ - xen_mc_batch(); - return __xen_mc_entry(args); -} - -/* Flush all pending multicalls */ -void xen_mc_flush(void); - -/* Issue a multicall if we're not in a lazy mode */ -static inline void xen_mc_issue(unsigned mode) -{ - if ((xen_get_lazy_mode() & mode) == 0) - xen_mc_flush(); - - /* restore flags saved in xen_mc_batch */ - local_irq_restore(x86_read_percpu(xen_mc_irq_flags)); -} - -#endif /* _XEN_MULTICALLS_H */ diff --git a/arch/i386/xen/setup.c b/arch/i386/xen/setup.c deleted file mode 100644 index f84e77226646..000000000000 --- a/arch/i386/xen/setup.c +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Machine specific setup for xen - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ - -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/pm.h> - -#include <asm/elf.h> -#include <asm/e820.h> -#include <asm/setup.h> -#include <asm/xen/hypervisor.h> -#include <asm/xen/hypercall.h> - -#include <xen/interface/physdev.h> -#include <xen/features.h> - -#include "xen-ops.h" -#include "vdso.h" - -/* These are code, but not functions. Defined in entry.S */ -extern const char xen_hypervisor_callback[]; -extern const char xen_failsafe_callback[]; - -unsigned long *phys_to_machine_mapping; -EXPORT_SYMBOL(phys_to_machine_mapping); - -/** - * machine_specific_memory_setup - Hook for machine specific memory setup. - **/ - -char * __init xen_memory_setup(void) -{ - unsigned long max_pfn = xen_start_info->nr_pages; - - e820.nr_map = 0; - add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); - - return "Xen"; -} - -static void xen_idle(void) -{ - local_irq_disable(); - - if (need_resched()) - local_irq_enable(); - else { - current_thread_info()->status &= ~TS_POLLING; - smp_mb__after_clear_bit(); - safe_halt(); - current_thread_info()->status |= TS_POLLING; - } -} - -/* - * Set the bit indicating "nosegneg" library variants should be used. - */ -static void fiddle_vdso(void) -{ - extern u32 VDSO_NOTE_MASK; /* See ../kernel/vsyscall-note.S. */ - extern char vsyscall_int80_start; - u32 *mask = (u32 *) ((unsigned long) &VDSO_NOTE_MASK - VDSO_PRELINK + - &vsyscall_int80_start); - *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; -} - -void __init xen_arch_setup(void) -{ - struct physdev_set_iopl set_iopl; - int rc; - - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); - - if (!xen_feature(XENFEAT_auto_translated_physmap)) - HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); - - HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, - __KERNEL_CS, (unsigned long)xen_failsafe_callback); - - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - printk(KERN_INFO "physdev_op failed %d\n", rc); - -#ifdef CONFIG_ACPI - if (!(xen_start_info->flags & SIF_INITDOMAIN)) { - printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); - disable_acpi(); - } -#endif - - memcpy(boot_command_line, xen_start_info->cmd_line, - MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? - COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); - - pm_idle = xen_idle; - -#ifdef CONFIG_SMP - /* fill cpus_possible with all available cpus */ - xen_fill_possible_map(); -#endif - - paravirt_disable_iospace(); - - fiddle_vdso(); -} diff --git a/arch/i386/xen/smp.c b/arch/i386/xen/smp.c deleted file mode 100644 index 557b8e24706a..000000000000 --- a/arch/i386/xen/smp.c +++ /dev/null @@ -1,404 +0,0 @@ -/* - * Xen SMP support - * - * This file implements the Xen versions of smp_ops. SMP under Xen is - * very straightforward. Bringing a CPU up is simply a matter of - * loading its initial context and setting it running. - * - * IPIs are handled through the Xen event mechanism. - * - * Because virtual CPUs can be scheduled onto any real CPU, there's no - * useful topology information for the kernel to make use of. As a - * result, all CPUs are treated as if they're single-core and - * single-threaded. - * - * This does not handle HOTPLUG_CPU yet. - */ -#include <linux/sched.h> -#include <linux/err.h> -#include <linux/smp.h> - -#include <asm/paravirt.h> -#include <asm/desc.h> -#include <asm/pgtable.h> -#include <asm/cpu.h> - -#include <xen/interface/xen.h> -#include <xen/interface/vcpu.h> - -#include <asm/xen/interface.h> -#include <asm/xen/hypercall.h> - -#include <xen/page.h> -#include <xen/events.h> - -#include "xen-ops.h" -#include "mmu.h" - -static cpumask_t cpu_initialized_map; -static DEFINE_PER_CPU(int, resched_irq); -static DEFINE_PER_CPU(int, callfunc_irq); - -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; - -static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); - -static struct call_data_struct *call_data; - -/* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. - */ -static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) -{ - return IRQ_HANDLED; -} - -static __cpuinit void cpu_bringup_and_idle(void) -{ - int cpu = smp_processor_id(); - - cpu_init(); - - preempt_disable(); - per_cpu(cpu_state, cpu) = CPU_ONLINE; - - xen_setup_cpu_clockevents(); - - /* We can take interrupts now: we're officially "up". */ - local_irq_enable(); - - wmb(); /* make sure everything is out */ - cpu_idle(); -} - -static int xen_smp_intr_init(unsigned int cpu) -{ - int rc; - const char *resched_name, *callfunc_name; - - per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; - - resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); - rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, - cpu, - xen_reschedule_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, - resched_name, - NULL); - if (rc < 0) - goto fail; - per_cpu(resched_irq, cpu) = rc; - - callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); - rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, - cpu, - xen_call_function_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, - callfunc_name, - NULL); - if (rc < 0) - goto fail; - per_cpu(callfunc_irq, cpu) = rc; - - return 0; - - fail: - if (per_cpu(resched_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); - if (per_cpu(callfunc_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); - return rc; -} - -void __init xen_fill_possible_map(void) -{ - int i, rc; - - for (i = 0; i < NR_CPUS; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) - cpu_set(i, cpu_possible_map); - } -} - -void __init xen_smp_prepare_boot_cpu(void) -{ - int cpu; - - BUG_ON(smp_processor_id() != 0); - native_smp_prepare_boot_cpu(); - - /* We've switched to the "real" per-cpu gdt, so make sure the - old memory can be recycled */ - make_lowmem_page_readwrite(&per_cpu__gdt_page); - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); - } - - xen_setup_vcpu_info_placement(); -} - -void __init xen_smp_prepare_cpus(unsigned int max_cpus) -{ - unsigned cpu; - - for (cpu = 0; cpu < NR_CPUS; cpu++) { - cpus_clear(cpu_sibling_map[cpu]); - cpus_clear(cpu_core_map[cpu]); - } - - smp_store_cpu_info(0); - set_cpu_sibling_map(0); - - if (xen_smp_intr_init(0)) - BUG(); - - cpu_initialized_map = cpumask_of_cpu(0); - - /* Restrict the possible_map according to max_cpus. */ - while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { - for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) - continue; - cpu_clear(cpu, cpu_possible_map); - } - - for_each_possible_cpu (cpu) { - struct task_struct *idle; - - if (cpu == 0) - continue; - - idle = fork_idle(cpu); - if (IS_ERR(idle)) - panic("failed fork for CPU %d", cpu); - - cpu_set(cpu, cpu_present_map); - } - - //init_xenbus_allowed_cpumask(); -} - -static __cpuinit int -cpu_initialize_context(unsigned int cpu, struct task_struct *idle) -{ - struct vcpu_guest_context *ctxt; - struct gdt_page *gdt = &per_cpu(gdt_page, cpu); - - if (cpu_test_and_set(cpu, cpu_initialized_map)) - return 0; - - ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); - if (ctxt == NULL) - return -ENOMEM; - - ctxt->flags = VGCF_IN_KERNEL; - ctxt->user_regs.ds = __USER_DS; - ctxt->user_regs.es = __USER_DS; - ctxt->user_regs.fs = __KERNEL_PERCPU; - ctxt->user_regs.gs = 0; - ctxt->user_regs.ss = __KERNEL_DS; - ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; - ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ - - memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); - - xen_copy_trap_info(ctxt->trap_ctxt); - - ctxt->ldt_ents = 0; - - BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); - make_lowmem_page_readonly(gdt->gdt); - - ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); - ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); - - ctxt->user_regs.cs = __KERNEL_CS; - ctxt->user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); - - ctxt->kernel_ss = __KERNEL_DS; - ctxt->kernel_sp = idle->thread.esp0; - - ctxt->event_callback_cs = __KERNEL_CS; - ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; - ctxt->failsafe_callback_cs = __KERNEL_CS; - ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; - - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); - ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); - - if (HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, ctxt)) - BUG(); - - kfree(ctxt); - return 0; -} - -int __cpuinit xen_cpu_up(unsigned int cpu) -{ - struct task_struct *idle = idle_task(cpu); - int rc; - -#if 0 - rc = cpu_up_check(cpu); - if (rc) - return rc; -#endif - - init_gdt(cpu); - per_cpu(current_task, cpu) = idle; - irq_ctx_init(cpu); - xen_setup_timer(cpu); - - /* make sure interrupts start blocked */ - per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; - - rc = cpu_initialize_context(cpu, idle); - if (rc) - return rc; - - if (num_online_cpus() == 1) - alternatives_smp_switch(1); - - rc = xen_smp_intr_init(cpu); - if (rc) - return rc; - - smp_store_cpu_info(cpu); - set_cpu_sibling_map(cpu); - /* This must be done before setting cpu_online_map */ - wmb(); - - cpu_set(cpu, cpu_online_map); - - rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); - BUG_ON(rc); - - return 0; -} - -void xen_smp_cpus_done(unsigned int max_cpus) -{ -} - -static void stop_self(void *v) -{ - int cpu = smp_processor_id(); - - /* make sure we're not pinning something down */ - load_cr3(swapper_pg_dir); - /* should set up a minimal gdt */ - - HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL); - BUG(); -} - -void xen_smp_send_stop(void) -{ - smp_call_function(stop_self, NULL, 0, 0); -} - -void xen_smp_send_reschedule(int cpu) -{ - xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); -} - - -static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) -{ - unsigned cpu; - - cpus_and(mask, mask, cpu_online_map); - - for_each_cpu_mask(cpu, mask) - xen_send_IPI_one(cpu, vector); -} - -static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) -{ - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ - irq_enter(); - (*func)(info); - irq_exit(); - - if (wait) { - mb(); /* commit everything before setting finished */ - atomic_inc(&call_data->finished); - } - - return IRQ_HANDLED; -} - -int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), - void *info, int wait) -{ - struct call_data_struct data; - int cpus; - - /* Holding any lock stops cpus from going down. */ - spin_lock(&call_lock); - - cpu_clear(smp_processor_id(), mask); - - cpus = cpus_weight(mask); - if (!cpus) { - spin_unlock(&call_lock); - return 0; - } - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - mb(); /* write everything before IPI */ - - /* Send a message to other CPUs and wait for them to respond */ - xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); - - /* Make sure other vcpus get a chance to run. - XXX too severe? Maybe we should check the other CPU's states? */ - HYPERVISOR_sched_op(SCHEDOP_yield, 0); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus || - (wait && atomic_read(&data.finished) != cpus)) - cpu_relax(); - - spin_unlock(&call_lock); - - return 0; -} diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c deleted file mode 100644 index dfd6db69ead5..000000000000 --- a/arch/i386/xen/time.c +++ /dev/null @@ -1,593 +0,0 @@ -/* - * Xen time implementation. - * - * This is implemented in terms of a clocksource driver which uses - * the hypervisor clock as a nanosecond timebase, and a clockevent - * driver which uses the hypervisor's timer mechanism. - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ -#include <linux/kernel.h> -#include <linux/interrupt.h> -#include <linux/clocksource.h> -#include <linux/clockchips.h> -#include <linux/kernel_stat.h> - -#include <asm/xen/hypervisor.h> -#include <asm/xen/hypercall.h> - -#include <xen/events.h> -#include <xen/interface/xen.h> -#include <xen/interface/vcpu.h> - -#include "xen-ops.h" - -#define XEN_SHIFT 22 - -/* Xen may fire a timer up to this many ns early */ -#define TIMER_SLOP 100000 -#define NS_PER_TICK (1000000000LL / HZ) - -static cycle_t xen_clocksource_read(void); - -/* These are perodically updated in shared_info, and then copied here. */ -struct shadow_time_info { - u64 tsc_timestamp; /* TSC at last update of time vals. */ - u64 system_timestamp; /* Time, in nanosecs, since boot. */ - u32 tsc_to_nsec_mul; - int tsc_shift; - u32 version; -}; - -static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); - -/* runstate info updated by Xen */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); - -/* snapshots of runstate info */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); - -/* unused ns of stolen and blocked time */ -static DEFINE_PER_CPU(u64, residual_stolen); -static DEFINE_PER_CPU(u64, residual_blocked); - -/* return an consistent snapshot of 64-bit time/counter value */ -static u64 get64(const u64 *p) -{ - u64 ret; - - if (BITS_PER_LONG < 64) { - u32 *p32 = (u32 *)p; - u32 h, l; - - /* - * Read high then low, and then make sure high is - * still the same; this will only loop if low wraps - * and carries into high. - * XXX some clean way to make this endian-proof? - */ - do { - h = p32[1]; - barrier(); - l = p32[0]; - barrier(); - } while (p32[1] != h); - - ret = (((u64)h) << 32) | l; - } else - ret = *p; - - return ret; -} - -/* - * Runstate accounting - */ -static void get_runstate_snapshot(struct vcpu_runstate_info *res) -{ - u64 state_time; - struct vcpu_runstate_info *state; - - BUG_ON(preemptible()); - - state = &__get_cpu_var(runstate); - - /* - * The runstate info is always updated by the hypervisor on - * the current CPU, so there's no need to use anything - * stronger than a compiler barrier when fetching it. - */ - do { - state_time = get64(&state->state_entry_time); - barrier(); - *res = *state; - barrier(); - } while (get64(&state->state_entry_time) != state_time); -} - -static void setup_runstate_info(int cpu) -{ - struct vcpu_register_runstate_memory_area area; - - area.addr.v = &per_cpu(runstate, cpu); - - if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, - cpu, &area)) - BUG(); -} - -static void do_stolen_accounting(void) -{ - struct vcpu_runstate_info state; - struct vcpu_runstate_info *snap; - s64 blocked, runnable, offline, stolen; - cputime_t ticks; - - get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - snap = &__get_cpu_var(runstate_snapshot); - - /* work out how much time the VCPU has not been runn*ing* */ - blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; - runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; - offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; - - *snap = state; - - /* Add the appropriate number of ticks of stolen time, - including any left-overs from last time. Passing NULL to - account_steal_time accounts the time as stolen. */ - stolen = runnable + offline + __get_cpu_var(residual_stolen); - - if (stolen < 0) - stolen = 0; - - ticks = 0; - while (stolen >= NS_PER_TICK) { - ticks++; - stolen -= NS_PER_TICK; - } - __get_cpu_var(residual_stolen) = stolen; - account_steal_time(NULL, ticks); - - /* Add the appropriate number of ticks of blocked time, - including any left-overs from last time. Passing idle to - account_steal_time accounts the time as idle/wait. */ - blocked += __get_cpu_var(residual_blocked); - - if (blocked < 0) - blocked = 0; - - ticks = 0; - while (blocked >= NS_PER_TICK) { - ticks++; - blocked -= NS_PER_TICK; - } - __get_cpu_var(residual_blocked) = blocked; - account_steal_time(idle_task(smp_processor_id()), ticks); -} - -/* - * Xen sched_clock implementation. Returns the number of unstolen - * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED - * states. - */ -unsigned long long xen_sched_clock(void) -{ - struct vcpu_runstate_info state; - cycle_t now; - u64 ret; - s64 offset; - - /* - * Ideally sched_clock should be called on a per-cpu basis - * anyway, so preempt should already be disabled, but that's - * not current practice at the moment. - */ - preempt_disable(); - - now = xen_clocksource_read(); - - get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - offset = now - state.state_entry_time; - if (offset < 0) - offset = 0; - - ret = state.time[RUNSTATE_blocked] + - state.time[RUNSTATE_running] + - offset; - - preempt_enable(); - - return ret; -} - - -/* Get the CPU speed from Xen */ -unsigned long xen_cpu_khz(void) -{ - u64 cpu_khz = 1000000ULL << 32; - const struct vcpu_time_info *info = - &HYPERVISOR_shared_info->vcpu_info[0].time; - - do_div(cpu_khz, info->tsc_to_system_mul); - if (info->tsc_shift < 0) - cpu_khz <<= -info->tsc_shift; - else - cpu_khz >>= info->tsc_shift; - - return cpu_khz; -} - -/* - * Reads a consistent set of time-base values from Xen, into a shadow data - * area. - */ -static unsigned get_time_values_from_xen(void) -{ - struct vcpu_time_info *src; - struct shadow_time_info *dst; - - /* src is shared memory with the hypervisor, so we need to - make sure we get a consistent snapshot, even in the face of - being preempted. */ - src = &__get_cpu_var(xen_vcpu)->time; - dst = &__get_cpu_var(shadow_time); - - do { - dst->version = src->version; - rmb(); /* fetch version before data */ - dst->tsc_timestamp = src->tsc_timestamp; - dst->system_timestamp = src->system_time; - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; - dst->tsc_shift = src->tsc_shift; - rmb(); /* test version after fetching data */ - } while ((src->version & 1) | (dst->version ^ src->version)); - - return dst->version; -} - -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. - */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif - - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif __x86_64__ - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif - - return product; -} - -static u64 get_nsec_offset(struct shadow_time_info *shadow) -{ - u64 now, delta; - now = native_read_tsc(); - delta = now - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); -} - -static cycle_t xen_clocksource_read(void) -{ - struct shadow_time_info *shadow = &get_cpu_var(shadow_time); - cycle_t ret; - unsigned version; - - do { - version = get_time_values_from_xen(); - barrier(); - ret = shadow->system_timestamp + get_nsec_offset(shadow); - barrier(); - } while (version != __get_cpu_var(xen_vcpu)->time.version); - - put_cpu_var(shadow_time); - - return ret; -} - -static void xen_read_wallclock(struct timespec *ts) -{ - const struct shared_info *s = HYPERVISOR_shared_info; - u32 version; - u64 delta; - struct timespec now; - - /* get wallclock at system boot */ - do { - version = s->wc_version; - rmb(); /* fetch version before time */ - now.tv_sec = s->wc_sec; - now.tv_nsec = s->wc_nsec; - rmb(); /* fetch time before checking version */ - } while ((s->wc_version & 1) | (version ^ s->wc_version)); - - delta = xen_clocksource_read(); /* time since system boot */ - delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; - - now.tv_nsec = do_div(delta, NSEC_PER_SEC); - now.tv_sec = delta; - - set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); -} - -unsigned long xen_get_wallclock(void) -{ - struct timespec ts; - - xen_read_wallclock(&ts); - - return ts.tv_sec; -} - -int xen_set_wallclock(unsigned long now) -{ - /* do nothing for domU */ - return -1; -} - -static struct clocksource xen_clocksource __read_mostly = { - .name = "xen", - .rating = 400, - .read = xen_clocksource_read, - .mask = ~0, - .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */ - .shift = XEN_SHIFT, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, -}; - -/* - Xen clockevent implementation - - Xen has two clockevent implementations: - - The old timer_op one works with all released versions of Xen prior - to version 3.0.4. This version of the hypervisor provides a - single-shot timer with nanosecond resolution. However, sharing the - same event channel is a 100Hz tick which is delivered while the - vcpu is running. We don't care about or use this tick, but it will - cause the core time code to think the timer fired too soon, and - will end up resetting it each time. It could be filtered, but - doing so has complications when the ktime clocksource is not yet - the xen clocksource (ie, at boot time). - - The new vcpu_op-based timer interface allows the tick timer period - to be changed or turned off. The tick timer is not useful as a - periodic timer because events are only delivered to running vcpus. - The one-shot timer can report when a timeout is in the past, so - set_next_event is capable of returning -ETIME when appropriate. - This interface is used when available. -*/ - - -/* - Get a hypervisor absolute time. In theory we could maintain an - offset between the kernel's time and the hypervisor's time, and - apply that to a kernel's absolute timeout. Unfortunately the - hypervisor and kernel times can drift even if the kernel is using - the Xen clocksource, because ntp can warp the kernel's clocksource. -*/ -static s64 get_abs_timeout(unsigned long delta) -{ - return xen_clocksource_read() + delta; -} - -static void xen_timerop_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - /* unsupported */ - WARN_ON(1); - break; - - case CLOCK_EVT_MODE_ONESHOT: - case CLOCK_EVT_MODE_RESUME: - break; - - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - HYPERVISOR_set_timer_op(0); /* cancel timeout */ - break; - } -} - -static int xen_timerop_set_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); - - if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0) - BUG(); - - /* We may have missed the deadline, but there's no real way of - knowing for sure. If the event was in the past, then we'll - get an immediate interrupt. */ - - return 0; -} - -static const struct clock_event_device xen_timerop_clockevent = { - .name = "xen", - .features = CLOCK_EVT_FEAT_ONESHOT, - - .max_delta_ns = 0xffffffff, - .min_delta_ns = TIMER_SLOP, - - .mult = 1, - .shift = 0, - .rating = 500, - - .set_mode = xen_timerop_set_mode, - .set_next_event = xen_timerop_set_next_event, -}; - - - -static void xen_vcpuop_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - int cpu = smp_processor_id(); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - WARN_ON(1); /* unsupported */ - break; - - case CLOCK_EVT_MODE_ONESHOT: - if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) - BUG(); - break; - - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || - HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) - BUG(); - break; - case CLOCK_EVT_MODE_RESUME: - break; - } -} - -static int xen_vcpuop_set_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - int cpu = smp_processor_id(); - struct vcpu_set_singleshot_timer single; - int ret; - - WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); - - single.timeout_abs_ns = get_abs_timeout(delta); - single.flags = VCPU_SSHOTTMR_future; - - ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single); - - BUG_ON(ret != 0 && ret != -ETIME); - - return ret; -} - -static const struct clock_event_device xen_vcpuop_clockevent = { - .name = "xen", - .features = CLOCK_EVT_FEAT_ONESHOT, - - .max_delta_ns = 0xffffffff, - .min_delta_ns = TIMER_SLOP, - - .mult = 1, - .shift = 0, - .rating = 500, - - .set_mode = xen_vcpuop_set_mode, - .set_next_event = xen_vcpuop_set_next_event, -}; - -static const struct clock_event_device *xen_clockevent = - &xen_timerop_clockevent; -static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events); - -static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) -{ - struct clock_event_device *evt = &__get_cpu_var(xen_clock_events); - irqreturn_t ret; - - ret = IRQ_NONE; - if (evt->event_handler) { - evt->event_handler(evt); - ret = IRQ_HANDLED; - } - - do_stolen_accounting(); - - return ret; -} - -void xen_setup_timer(int cpu) -{ - const char *name; - struct clock_event_device *evt; - int irq; - - printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu); - - name = kasprintf(GFP_KERNEL, "timer%d", cpu); - if (!name) - name = "<timer kasprintf failed>"; - - irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, - name, NULL); - - evt = &per_cpu(xen_clock_events, cpu); - memcpy(evt, xen_clockevent, sizeof(*evt)); - - evt->cpumask = cpumask_of_cpu(cpu); - evt->irq = irq; - - setup_runstate_info(cpu); -} - -void xen_setup_cpu_clockevents(void) -{ - BUG_ON(preemptible()); - - clockevents_register_device(&__get_cpu_var(xen_clock_events)); -} - -__init void xen_time_init(void) -{ - int cpu = smp_processor_id(); - - get_time_values_from_xen(); - - clocksource_register(&xen_clocksource); - - if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { - /* Successfully turned off 100Hz tick, so we have the - vcpuop-based timer interface */ - printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); - xen_clockevent = &xen_vcpuop_clockevent; - } - - /* Set initial system time with full resolution */ - xen_read_wallclock(&xtime); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - - tsc_disable = 0; - - xen_setup_timer(cpu); - xen_setup_cpu_clockevents(); -} diff --git a/arch/i386/xen/vdso.h b/arch/i386/xen/vdso.h deleted file mode 100644 index 861fedfe5230..000000000000 --- a/arch/i386/xen/vdso.h +++ /dev/null @@ -1,4 +0,0 @@ -/* Bit used for the pseudo-hwcap for non-negative segments. We use - bit 1 to avoid bugs in some versions of glibc when bit 0 is - used; the choice is otherwise arbitrary. */ -#define VDSO_NOTE_NONEGSEG_BIT 1 diff --git a/arch/i386/xen/xen-asm.S b/arch/i386/xen/xen-asm.S deleted file mode 100644 index 1a43b60c0c62..000000000000 --- a/arch/i386/xen/xen-asm.S +++ /dev/null @@ -1,291 +0,0 @@ -/* - Asm versions of Xen pv-ops, suitable for either direct use or inlining. - The inline versions are the same as the direct-use versions, with the - pre- and post-amble chopped off. - - This code is encoded for size rather than absolute efficiency, - with a view to being able to inline as much as possible. - - We only bother with direct forms (ie, vcpu in pda) of the operations - here; the indirect forms are better handled in C, since they're - generally too large to inline anyway. - */ - -#include <linux/linkage.h> - -#include <asm/asm-offsets.h> -#include <asm/thread_info.h> -#include <asm/percpu.h> -#include <asm/processor-flags.h> -#include <asm/segment.h> - -#include <xen/interface/xen.h> - -#define RELOC(x, v) .globl x##_reloc; x##_reloc=v -#define ENDPATCH(x) .globl x##_end; x##_end=. - -/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -#define XEN_EFLAGS_NMI 0x80000000 - -/* - Enable events. This clears the event mask and tests the pending - event status with one and operation. If there are pending - events, then enter the hypervisor to get them handled. - */ -ENTRY(xen_irq_enable_direct) - /* Clear mask and test pending */ - andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending - /* Preempt here doesn't matter because that will deal with - any pending interrupts. The pending check may end up being - run on the wrong CPU, but that doesn't hurt. */ - jz 1f -2: call check_events -1: -ENDPATCH(xen_irq_enable_direct) - ret - ENDPROC(xen_irq_enable_direct) - RELOC(xen_irq_enable_direct, 2b+1) - - -/* - Disabling events is simply a matter of making the event mask - non-zero. - */ -ENTRY(xen_irq_disable_direct) - movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask -ENDPATCH(xen_irq_disable_direct) - ret - ENDPROC(xen_irq_disable_direct) - RELOC(xen_irq_disable_direct, 0) - -/* - (xen_)save_fl is used to get the current interrupt enable status. - Callers expect the status to be in X86_EFLAGS_IF, and other bits - may be set in the return value. We take advantage of this by - making sure that X86_EFLAGS_IF has the right value (and other bits - in that byte are 0), but other bits in the return value are - undefined. We need to toggle the state of the bit, because - Xen and x86 use opposite senses (mask vs enable). - */ -ENTRY(xen_save_fl_direct) - testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask - setz %ah - addb %ah,%ah -ENDPATCH(xen_save_fl_direct) - ret - ENDPROC(xen_save_fl_direct) - RELOC(xen_save_fl_direct, 0) - - -/* - In principle the caller should be passing us a value return - from xen_save_fl_direct, but for robustness sake we test only - the X86_EFLAGS_IF flag rather than the whole byte. After - setting the interrupt mask state, it checks for unmasked - pending events and enters the hypervisor to get them delivered - if so. - */ -ENTRY(xen_restore_fl_direct) - testb $X86_EFLAGS_IF>>8, %ah - setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask - /* Preempt here doesn't matter because that will deal with - any pending interrupts. The pending check may end up being - run on the wrong CPU, but that doesn't hurt. */ - - /* check for unmasked and pending */ - cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending - jz 1f -2: call check_events -1: -ENDPATCH(xen_restore_fl_direct) - ret - ENDPROC(xen_restore_fl_direct) - RELOC(xen_restore_fl_direct, 2b+1) - -/* - This is run where a normal iret would be run, with the same stack setup: - 8: eflags - 4: cs - esp-> 0: eip - - This attempts to make sure that any pending events are dealt - with on return to usermode, but there is a small window in - which an event can happen just before entering usermode. If - the nested interrupt ends up setting one of the TIF_WORK_MASK - pending work flags, they will not be tested again before - returning to usermode. This means that a process can end up - with pending work, which will be unprocessed until the process - enters and leaves the kernel again, which could be an - unbounded amount of time. This means that a pending signal or - reschedule event could be indefinitely delayed. - - The fix is to notice a nested interrupt in the critical - window, and if one occurs, then fold the nested interrupt into - the current interrupt stack frame, and re-process it - iteratively rather than recursively. This means that it will - exit via the normal path, and all pending work will be dealt - with appropriately. - - Because the nested interrupt handler needs to deal with the - current stack state in whatever form its in, we keep things - simple by only using a single register which is pushed/popped - on the stack. - - Non-direct iret could be done in the same way, but it would - require an annoying amount of code duplication. We'll assume - that direct mode will be the common case once the hypervisor - support becomes commonplace. - */ -ENTRY(xen_iret_direct) - /* test eflags for special cases */ - testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) - jnz hyper_iret - - push %eax - ESP_OFFSET=4 # bytes pushed onto stack - - /* Store vcpu_info pointer for easy access. Do it this - way to avoid having to reload %fs */ -#ifdef CONFIG_SMP - GET_THREAD_INFO(%eax) - movl TI_cpu(%eax),%eax - movl __per_cpu_offset(,%eax,4),%eax - lea per_cpu__xen_vcpu_info(%eax),%eax -#else - movl $per_cpu__xen_vcpu_info, %eax -#endif - - /* check IF state we're restoring */ - testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) - - /* Maybe enable events. Once this happens we could get a - recursive event, so the critical region starts immediately - afterwards. However, if that happens we don't end up - resuming the code, so we don't have to be worried about - being preempted to another CPU. */ - setz XEN_vcpu_info_mask(%eax) -xen_iret_start_crit: - - /* check for unmasked and pending */ - cmpw $0x0001, XEN_vcpu_info_pending(%eax) - - /* If there's something pending, mask events again so we - can jump back into xen_hypervisor_callback */ - sete XEN_vcpu_info_mask(%eax) - - popl %eax - - /* From this point on the registers are restored and the stack - updated, so we don't need to worry about it if we're preempted */ -iret_restore_end: - - /* Jump to hypervisor_callback after fixing up the stack. - Events are masked, so jumping out of the critical - region is OK. */ - je xen_hypervisor_callback - - iret -xen_iret_end_crit: - -hyper_iret: - /* put this out of line since its very rarely used */ - jmp hypercall_page + __HYPERVISOR_iret * 32 - - .globl xen_iret_start_crit, xen_iret_end_crit - -/* - This is called by xen_hypervisor_callback in entry.S when it sees - that the EIP at the time of interrupt was between xen_iret_start_crit - and xen_iret_end_crit. We're passed the EIP in %eax so we can do - a more refined determination of what to do. - - The stack format at this point is: - ---------------- - ss : (ss/esp may be present if we came from usermode) - esp : - eflags } outer exception info - cs } - eip } - ---------------- <- edi (copy dest) - eax : outer eax if it hasn't been restored - ---------------- - eflags } nested exception info - cs } (no ss/esp because we're nested - eip } from the same ring) - orig_eax }<- esi (copy src) - - - - - - - - - - fs } - es } - ds } SAVE_ALL state - eax } - : : - ebx } - ---------------- - return addr <- esp - ---------------- - - In order to deliver the nested exception properly, we need to shift - everything from the return addr up to the error code so it - sits just under the outer exception info. This means that when we - handle the exception, we do it in the context of the outer exception - rather than starting a new one. - - The only caveat is that if the outer eax hasn't been - restored yet (ie, it's still on stack), we need to insert - its value into the SAVE_ALL state before going on, since - it's usermode state which we eventually need to restore. - */ -ENTRY(xen_iret_crit_fixup) - /* offsets +4 for return address */ - - /* - Paranoia: Make sure we're really coming from userspace. - One could imagine a case where userspace jumps into the - critical range address, but just before the CPU delivers a GP, - it decides to deliver an interrupt instead. Unlikely? - Definitely. Easy to avoid? Yes. The Intel documents - explicitly say that the reported EIP for a bad jump is the - jump instruction itself, not the destination, but some virtual - environments get this wrong. - */ - movl PT_CS+4(%esp), %ecx - andl $SEGMENT_RPL_MASK, %ecx - cmpl $USER_RPL, %ecx - je 2f - - lea PT_ORIG_EAX+4(%esp), %esi - lea PT_EFLAGS+4(%esp), %edi - - /* If eip is before iret_restore_end then stack - hasn't been restored yet. */ - cmp $iret_restore_end, %eax - jae 1f - - movl 0+4(%edi),%eax /* copy EAX */ - movl %eax, PT_EAX+4(%esp) - - lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ - - /* set up the copy */ -1: std - mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ - rep movsl - cld - - lea 4(%edi),%esp /* point esp to new frame */ -2: ret - - -/* - Force an event check by making a hypercall, - but preserve regs before making the call. - */ -check_events: - push %eax - push %ecx - push %edx - call force_evtchn_callback - pop %edx - pop %ecx - pop %eax - ret diff --git a/arch/i386/xen/xen-head.S b/arch/i386/xen/xen-head.S deleted file mode 100644 index f8d6937db2ec..000000000000 --- a/arch/i386/xen/xen-head.S +++ /dev/null @@ -1,38 +0,0 @@ -/* Xen-specific pieces of head.S, intended to be included in the right - place in head.S */ - -#ifdef CONFIG_XEN - -#include <linux/elfnote.h> -#include <asm/boot.h> -#include <xen/interface/elfnote.h> - -.pushsection .init.text -ENTRY(startup_xen) - movl %esi,xen_start_info - cld - movl $(init_thread_union+THREAD_SIZE),%esp - jmp xen_start_kernel -.popsection - -.pushsection .bss.page_aligned - .align PAGE_SIZE_asm -ENTRY(hypercall_page) - .skip 0x1000 -.popsection - - ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") - ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") - ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") - ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) - ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) - ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") -#ifdef CONFIG_X86_PAE - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") -#else - ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "no") -#endif - ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") - -#endif /*CONFIG_XEN */ diff --git a/arch/i386/xen/xen-ops.h b/arch/i386/xen/xen-ops.h deleted file mode 100644 index b9aaea45f07f..000000000000 --- a/arch/i386/xen/xen-ops.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef XEN_OPS_H -#define XEN_OPS_H - -#include <linux/init.h> - -/* These are code, but not functions. Defined in entry.S */ -extern const char xen_hypervisor_callback[]; -extern const char xen_failsafe_callback[]; - -void xen_copy_trap_info(struct trap_info *traps); - -DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); -DECLARE_PER_CPU(unsigned long, xen_cr3); - -extern struct start_info *xen_start_info; -extern struct shared_info *HYPERVISOR_shared_info; - -char * __init xen_memory_setup(void); -void __init xen_arch_setup(void); -void __init xen_init_IRQ(void); - -void xen_setup_timer(int cpu); -void xen_setup_cpu_clockevents(void); -unsigned long xen_cpu_khz(void); -void __init xen_time_init(void); -unsigned long xen_get_wallclock(void); -int xen_set_wallclock(unsigned long time); -unsigned long long xen_sched_clock(void); - -void xen_mark_init_mm_pinned(void); - -DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); - -static inline unsigned xen_get_lazy_mode(void) -{ - return x86_read_percpu(xen_lazy_mode); -} - -void __init xen_fill_possible_map(void); - -void __init xen_setup_vcpu_info_placement(void); -void xen_smp_prepare_boot_cpu(void); -void xen_smp_prepare_cpus(unsigned int max_cpus); -int xen_cpu_up(unsigned int cpu); -void xen_smp_cpus_done(unsigned int max_cpus); - -void xen_smp_send_stop(void); -void xen_smp_send_reschedule(int cpu); -int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait); -int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait); - -int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), - void *info, int wait); - - -/* Declare an asm function, along with symbols needed to make it - inlineable */ -#define DECL_ASM(ret, name, ...) \ - ret name(__VA_ARGS__); \ - extern char name##_end[]; \ - extern char name##_reloc[] \ - -DECL_ASM(void, xen_irq_enable_direct, void); -DECL_ASM(void, xen_irq_disable_direct, void); -DECL_ASM(unsigned long, xen_save_fl_direct, void); -DECL_ASM(void, xen_restore_fl_direct, unsigned long); - -void xen_iret_direct(void); -#endif /* XEN_OPS_H */ |