summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-12-13 15:31:08 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-13 15:31:08 -0800
commit66cdd0ceaf65a18996f561b770eedde1d123b019 (patch)
tree4892eaa422d366fce5d1e866ff1fe0988af95569 /arch
parent896ea17d3da5f44b2625c9cda9874d7dfe447393 (diff)
parent58b7825bc324da55415034a9f6ca5d716b8fd898 (diff)
downloadlwn-66cdd0ceaf65a18996f561b770eedde1d123b019.tar.gz
lwn-66cdd0ceaf65a18996f561b770eedde1d123b019.zip
Merge tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Marcelo Tosatti: "Considerable KVM/PPC work, x86 kvmclock vsyscall support, IA32_TSC_ADJUST MSR emulation, amongst others." Fix up trivial conflict in kernel/sched/core.c due to cross-cpu migration notifier added next to rq migration call-back. * tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (156 commits) KVM: emulator: fix real mode segment checks in address linearization VMX: remove unneeded enable_unrestricted_guest check KVM: VMX: fix DPL during entry to protected mode x86/kexec: crash_vmclear_local_vmcss needs __rcu kvm: Fix irqfd resampler list walk KVM: VMX: provide the vmclear function and a bitmap to support VMCLEAR in kdump x86/kexec: VMCLEAR VMCSs loaded on all cpus if necessary KVM: MMU: optimize for set_spte KVM: PPC: booke: Get/set guest EPCR register using ONE_REG interface KVM: PPC: bookehv: Add EPCR support in mtspr/mfspr emulation KVM: PPC: bookehv: Add guest computation mode for irq delivery KVM: PPC: Make EPCR a valid field for booke64 and bookehv KVM: PPC: booke: Extend MAS2 EPN mask for 64-bit KVM: PPC: e500: Mask MAS2 EPN high 32-bits in 32/64 tlbwe emulation KVM: PPC: Mask ea's high 32-bits in 32/64 instr emulation KVM: PPC: e500: Add emulation helper for getting instruction ea KVM: PPC: bookehv64: Add support for interrupt handling KVM: PPC: bookehv: Remove GET_VCPU macro from exception handler KVM: PPC: booke: Fix get_tb() compile error on 64-bit KVM: PPC: e500: Silence bogus GCC warning in tlb code ...
Diffstat (limited to 'arch')
-rw-r--r--arch/ia64/kvm/kvm-ia64.c7
-rw-r--r--arch/powerpc/include/asm/Kbuild1
-rw-r--r--arch/powerpc/include/asm/epapr_hcalls.h83
-rw-r--r--arch/powerpc/include/asm/fsl_hcalls.h36
-rw-r--r--arch/powerpc/include/asm/kvm_asm.h1
-rw-r--r--arch/powerpc/include/asm/kvm_book3s.h12
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h33
-rw-r--r--arch/powerpc/include/asm/kvm_booke_hv_asm.h29
-rw-r--r--arch/powerpc/include/asm/kvm_host.h68
-rw-r--r--arch/powerpc/include/asm/kvm_para.h15
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h87
-rw-r--r--arch/powerpc/include/asm/mmu-book3e.h2
-rw-r--r--arch/powerpc/include/asm/mmu-hash64.h10
-rw-r--r--arch/powerpc/include/asm/reg.h1
-rw-r--r--arch/powerpc/include/asm/reg_booke.h7
-rw-r--r--arch/powerpc/include/asm/smp.h8
-rw-r--r--arch/powerpc/include/uapi/asm/Kbuild1
-rw-r--r--arch/powerpc/include/uapi/asm/epapr_hcalls.h98
-rw-r--r--arch/powerpc/include/uapi/asm/kvm.h86
-rw-r--r--arch/powerpc/include/uapi/asm/kvm_para.h7
-rw-r--r--arch/powerpc/kernel/asm-offsets.c4
-rw-r--r--arch/powerpc/kernel/epapr_hcalls.S28
-rw-r--r--arch/powerpc/kernel/epapr_paravirt.c11
-rw-r--r--arch/powerpc/kernel/kvm.c2
-rw-r--r--arch/powerpc/kernel/ppc_ksyms.c5
-rw-r--r--arch/powerpc/kernel/smp.c46
-rw-r--r--arch/powerpc/kvm/44x.c1
-rw-r--r--arch/powerpc/kvm/44x_emulate.c112
-rw-r--r--arch/powerpc/kvm/Kconfig4
-rw-r--r--arch/powerpc/kvm/Makefile5
-rw-r--r--arch/powerpc/kvm/book3s.c125
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu_host.c3
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_host.c3
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c474
-rw-r--r--arch/powerpc/kvm/book3s_emulate.c16
-rw-r--r--arch/powerpc/kvm/book3s_exports.c3
-rw-r--r--arch/powerpc/kvm/book3s_hv.c655
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c4
-rw-r--r--arch/powerpc/kvm/book3s_hv_ras.c144
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c143
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S142
-rw-r--r--arch/powerpc/kvm/book3s_mmu_hpte.c5
-rw-r--r--arch/powerpc/kvm/book3s_pr.c294
-rw-r--r--arch/powerpc/kvm/book3s_rmhandlers.S18
-rw-r--r--arch/powerpc/kvm/booke.c346
-rw-r--r--arch/powerpc/kvm/booke.h1
-rw-r--r--arch/powerpc/kvm/booke_emulate.c36
-rw-r--r--arch/powerpc/kvm/bookehv_interrupts.S145
-rw-r--r--arch/powerpc/kvm/e500.h11
-rw-r--r--arch/powerpc/kvm/e500_emulate.c14
-rw-r--r--arch/powerpc/kvm/e500_tlb.c132
-rw-r--r--arch/powerpc/kvm/emulate.c221
-rw-r--r--arch/powerpc/kvm/powerpc.c187
-rw-r--r--arch/powerpc/kvm/trace.h200
-rw-r--r--arch/powerpc/platforms/Kconfig1
-rw-r--r--arch/powerpc/sysdev/fsl_msi.c9
-rw-r--r--arch/powerpc/sysdev/fsl_soc.c2
-rw-r--r--arch/s390/kvm/interrupt.c19
-rw-r--r--arch/s390/kvm/kvm-s390.c7
-rw-r--r--arch/x86/include/asm/clocksource.h1
-rw-r--r--arch/x86/include/asm/cpufeature.h1
-rw-r--r--arch/x86/include/asm/fixmap.h5
-rw-r--r--arch/x86/include/asm/kexec.h3
-rw-r--r--arch/x86/include/asm/kvm_guest.h6
-rw-r--r--arch/x86/include/asm/kvm_host.h24
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/pvclock.h47
-rw-r--r--arch/x86/include/asm/vmx.h3
-rw-r--r--arch/x86/include/asm/vsyscall.h20
-rw-r--r--arch/x86/kernel/crash.c32
-rw-r--r--arch/x86/kernel/kvm.c20
-rw-r--r--arch/x86/kernel/kvmclock.c88
-rw-r--r--arch/x86/kernel/pvclock.c143
-rw-r--r--arch/x86/kvm/cpuid.c3
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c5
-rw-r--r--arch/x86/kvm/lapic.c2
-rw-r--r--arch/x86/kvm/mmu.c65
-rw-r--r--arch/x86/kvm/paging_tmpl.h115
-rw-r--r--arch/x86/kvm/svm.c48
-rw-r--r--arch/x86/kvm/trace.h63
-rw-r--r--arch/x86/kvm/vmx.c203
-rw-r--r--arch/x86/kvm/x86.c548
-rw-r--r--arch/x86/kvm/x86.h2
-rw-r--r--arch/x86/vdso/vclock_gettime.c81
-rw-r--r--arch/x86/vdso/vgetcpu.c11
86 files changed, 4525 insertions, 1193 deletions
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 0a88cb5d316d..bd1c51555038 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1330,6 +1330,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
return 0;
}
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
return -EINVAL;
@@ -1362,11 +1367,9 @@ static void kvm_release_vm_pages(struct kvm *kvm)
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
int j;
- unsigned long base_gfn;
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
- base_gfn = memslot->base_gfn;
for (j = 0; j < memslot->npages; j++) {
if (memslot->rmap[j])
put_page((struct page *)memslot->rmap[j]);
diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
index 2d62b484b3fc..650757c300db 100644
--- a/arch/powerpc/include/asm/Kbuild
+++ b/arch/powerpc/include/asm/Kbuild
@@ -1,5 +1,4 @@
-
generic-y += clkdev.h
generic-y += rwsem.h
generic-y += trace_clock.h
diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h
index bf2c06c33871..d3d634274d2c 100644
--- a/arch/powerpc/include/asm/epapr_hcalls.h
+++ b/arch/powerpc/include/asm/epapr_hcalls.h
@@ -50,64 +50,13 @@
#ifndef _EPAPR_HCALLS_H
#define _EPAPR_HCALLS_H
+#include <uapi/asm/epapr_hcalls.h>
+
+#ifndef __ASSEMBLY__
#include <linux/types.h>
#include <linux/errno.h>
#include <asm/byteorder.h>
-#define EV_BYTE_CHANNEL_SEND 1
-#define EV_BYTE_CHANNEL_RECEIVE 2
-#define EV_BYTE_CHANNEL_POLL 3
-#define EV_INT_SET_CONFIG 4
-#define EV_INT_GET_CONFIG 5
-#define EV_INT_SET_MASK 6
-#define EV_INT_GET_MASK 7
-#define EV_INT_IACK 9
-#define EV_INT_EOI 10
-#define EV_INT_SEND_IPI 11
-#define EV_INT_SET_TASK_PRIORITY 12
-#define EV_INT_GET_TASK_PRIORITY 13
-#define EV_DOORBELL_SEND 14
-#define EV_MSGSND 15
-#define EV_IDLE 16
-
-/* vendor ID: epapr */
-#define EV_LOCAL_VENDOR_ID 0 /* for private use */
-#define EV_EPAPR_VENDOR_ID 1
-#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */
-#define EV_IBM_VENDOR_ID 3 /* IBM */
-#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */
-#define EV_ENEA_VENDOR_ID 5 /* Enea */
-#define EV_WR_VENDOR_ID 6 /* Wind River Systems */
-#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */
-#define EV_KVM_VENDOR_ID 42 /* KVM */
-
-/* The max number of bytes that a byte channel can send or receive per call */
-#define EV_BYTE_CHANNEL_MAX_BYTES 16
-
-
-#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num))
-#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num)
-
-/* epapr error codes */
-#define EV_EPERM 1 /* Operation not permitted */
-#define EV_ENOENT 2 /* Entry Not Found */
-#define EV_EIO 3 /* I/O error occured */
-#define EV_EAGAIN 4 /* The operation had insufficient
- * resources to complete and should be
- * retried
- */
-#define EV_ENOMEM 5 /* There was insufficient memory to
- * complete the operation */
-#define EV_EFAULT 6 /* Bad guest address */
-#define EV_ENODEV 7 /* No such device */
-#define EV_EINVAL 8 /* An argument supplied to the hcall
- was out of range or invalid */
-#define EV_INTERNAL 9 /* An internal error occured */
-#define EV_CONFIG 10 /* A configuration error was detected */
-#define EV_INVALID_STATE 11 /* The object is in an invalid state */
-#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */
-#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */
-
/*
* Hypercall register clobber list
*
@@ -193,7 +142,7 @@ static inline unsigned int ev_int_set_config(unsigned int interrupt,
r5 = priority;
r6 = destination;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6)
: : EV_HCALL_CLOBBERS4
);
@@ -222,7 +171,7 @@ static inline unsigned int ev_int_get_config(unsigned int interrupt,
r11 = EV_HCALL_TOKEN(EV_INT_GET_CONFIG);
r3 = interrupt;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5), "=r" (r6)
: : EV_HCALL_CLOBBERS4
);
@@ -252,7 +201,7 @@ static inline unsigned int ev_int_set_mask(unsigned int interrupt,
r3 = interrupt;
r4 = mask;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "+r" (r4)
: : EV_HCALL_CLOBBERS2
);
@@ -277,7 +226,7 @@ static inline unsigned int ev_int_get_mask(unsigned int interrupt,
r11 = EV_HCALL_TOKEN(EV_INT_GET_MASK);
r3 = interrupt;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "=r" (r4)
: : EV_HCALL_CLOBBERS2
);
@@ -305,7 +254,7 @@ static inline unsigned int ev_int_eoi(unsigned int interrupt)
r11 = EV_HCALL_TOKEN(EV_INT_EOI);
r3 = interrupt;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3)
: : EV_HCALL_CLOBBERS1
);
@@ -344,7 +293,7 @@ static inline unsigned int ev_byte_channel_send(unsigned int handle,
r7 = be32_to_cpu(p[2]);
r8 = be32_to_cpu(p[3]);
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3),
"+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8)
: : EV_HCALL_CLOBBERS6
@@ -383,7 +332,7 @@ static inline unsigned int ev_byte_channel_receive(unsigned int handle,
r3 = handle;
r4 = *count;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "+r" (r4),
"=r" (r5), "=r" (r6), "=r" (r7), "=r" (r8)
: : EV_HCALL_CLOBBERS6
@@ -421,7 +370,7 @@ static inline unsigned int ev_byte_channel_poll(unsigned int handle,
r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_POLL);
r3 = handle;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5)
: : EV_HCALL_CLOBBERS3
);
@@ -454,7 +403,7 @@ static inline unsigned int ev_int_iack(unsigned int handle,
r11 = EV_HCALL_TOKEN(EV_INT_IACK);
r3 = handle;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "=r" (r4)
: : EV_HCALL_CLOBBERS2
);
@@ -478,7 +427,7 @@ static inline unsigned int ev_doorbell_send(unsigned int handle)
r11 = EV_HCALL_TOKEN(EV_DOORBELL_SEND);
r3 = handle;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3)
: : EV_HCALL_CLOBBERS1
);
@@ -498,12 +447,12 @@ static inline unsigned int ev_idle(void)
r11 = EV_HCALL_TOKEN(EV_IDLE);
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "=r" (r3)
: : EV_HCALL_CLOBBERS1
);
return r3;
}
-
-#endif
+#endif /* !__ASSEMBLY__ */
+#endif /* _EPAPR_HCALLS_H */
diff --git a/arch/powerpc/include/asm/fsl_hcalls.h b/arch/powerpc/include/asm/fsl_hcalls.h
index 922d9b5fe3d5..3abb58394da4 100644
--- a/arch/powerpc/include/asm/fsl_hcalls.h
+++ b/arch/powerpc/include/asm/fsl_hcalls.h
@@ -96,7 +96,7 @@ static inline unsigned int fh_send_nmi(unsigned int vcpu_mask)
r11 = FH_HCALL_TOKEN(FH_SEND_NMI);
r3 = vcpu_mask;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3)
: : EV_HCALL_CLOBBERS1
);
@@ -151,7 +151,7 @@ static inline unsigned int fh_partition_get_dtprop(int handle,
r9 = (uint32_t)propvalue_addr;
r10 = *propvalue_len;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11),
"+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7),
"+r" (r8), "+r" (r9), "+r" (r10)
@@ -205,7 +205,7 @@ static inline unsigned int fh_partition_set_dtprop(int handle,
r9 = (uint32_t)propvalue_addr;
r10 = propvalue_len;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11),
"+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7),
"+r" (r8), "+r" (r9), "+r" (r10)
@@ -229,7 +229,7 @@ static inline unsigned int fh_partition_restart(unsigned int partition)
r11 = FH_HCALL_TOKEN(FH_PARTITION_RESTART);
r3 = partition;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3)
: : EV_HCALL_CLOBBERS1
);
@@ -262,7 +262,7 @@ static inline unsigned int fh_partition_get_status(unsigned int partition,
r11 = FH_HCALL_TOKEN(FH_PARTITION_GET_STATUS);
r3 = partition;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "=r" (r4)
: : EV_HCALL_CLOBBERS2
);
@@ -295,7 +295,7 @@ static inline unsigned int fh_partition_start(unsigned int partition,
r4 = entry_point;
r5 = load;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5)
: : EV_HCALL_CLOBBERS3
);
@@ -317,7 +317,7 @@ static inline unsigned int fh_partition_stop(unsigned int partition)
r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP);
r3 = partition;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3)
: : EV_HCALL_CLOBBERS1
);
@@ -376,7 +376,7 @@ static inline unsigned int fh_partition_memcpy(unsigned int source,
#endif
r7 = count;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11),
"+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7)
: : EV_HCALL_CLOBBERS5
@@ -399,7 +399,7 @@ static inline unsigned int fh_dma_enable(unsigned int liodn)
r11 = FH_HCALL_TOKEN(FH_DMA_ENABLE);
r3 = liodn;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3)
: : EV_HCALL_CLOBBERS1
);
@@ -421,7 +421,7 @@ static inline unsigned int fh_dma_disable(unsigned int liodn)
r11 = FH_HCALL_TOKEN(FH_DMA_DISABLE);
r3 = liodn;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3)
: : EV_HCALL_CLOBBERS1
);
@@ -447,7 +447,7 @@ static inline unsigned int fh_vmpic_get_msir(unsigned int interrupt,
r11 = FH_HCALL_TOKEN(FH_VMPIC_GET_MSIR);
r3 = interrupt;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "=r" (r4)
: : EV_HCALL_CLOBBERS2
);
@@ -469,7 +469,7 @@ static inline unsigned int fh_system_reset(void)
r11 = FH_HCALL_TOKEN(FH_SYSTEM_RESET);
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "=r" (r3)
: : EV_HCALL_CLOBBERS1
);
@@ -506,7 +506,7 @@ static inline unsigned int fh_err_get_info(int queue, uint32_t *bufsize,
r6 = addr_lo;
r7 = peek;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6),
"+r" (r7)
: : EV_HCALL_CLOBBERS5
@@ -542,7 +542,7 @@ static inline unsigned int fh_get_core_state(unsigned int handle,
r3 = handle;
r4 = vcpu;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "+r" (r4)
: : EV_HCALL_CLOBBERS2
);
@@ -572,7 +572,7 @@ static inline unsigned int fh_enter_nap(unsigned int handle, unsigned int vcpu)
r3 = handle;
r4 = vcpu;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "+r" (r4)
: : EV_HCALL_CLOBBERS2
);
@@ -597,7 +597,7 @@ static inline unsigned int fh_exit_nap(unsigned int handle, unsigned int vcpu)
r3 = handle;
r4 = vcpu;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3), "+r" (r4)
: : EV_HCALL_CLOBBERS2
);
@@ -618,7 +618,7 @@ static inline unsigned int fh_claim_device(unsigned int handle)
r11 = FH_HCALL_TOKEN(FH_CLAIM_DEVICE);
r3 = handle;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3)
: : EV_HCALL_CLOBBERS1
);
@@ -645,7 +645,7 @@ static inline unsigned int fh_partition_stop_dma(unsigned int handle)
r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP_DMA);
r3 = handle;
- __asm__ __volatile__ ("sc 1"
+ asm volatile("bl epapr_hypercall_start"
: "+r" (r11), "+r" (r3)
: : EV_HCALL_CLOBBERS1
);
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 76fdcfef0889..aabcdba8f6b0 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -118,6 +118,7 @@
#define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */
#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
+#define RESUME_FLAG_ARCH1 (1<<2)
#define RESUME_GUEST 0
#define RESUME_GUEST_NV RESUME_FLAG_NV
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 7aefdb3e1ce4..5a56e1c5f851 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -81,6 +81,8 @@ struct kvmppc_vcpu_book3s {
u64 sdr1;
u64 hior;
u64 msr_mask;
+ u64 purr_offset;
+ u64 spurr_offset;
#ifdef CONFIG_PPC_BOOK3S_32
u32 vsid_pool[VSID_POOL_SIZE];
u32 vsid_next;
@@ -157,10 +159,14 @@ extern void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long addr,
extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr);
extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
long pte_index, unsigned long pteh, unsigned long ptel);
-extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
- long pte_index, unsigned long pteh, unsigned long ptel);
+extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
+ long pte_index, unsigned long pteh, unsigned long ptel,
+ pgd_t *pgdir, bool realmode, unsigned long *idx_ret);
+extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+ unsigned long pte_index, unsigned long avpn,
+ unsigned long *hpret);
extern long kvmppc_hv_get_dirty_log(struct kvm *kvm,
- struct kvm_memory_slot *memslot);
+ struct kvm_memory_slot *memslot, unsigned long *map);
extern void kvmppc_entry_trampoline(void);
extern void kvmppc_hv_entry_trampoline(void);
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 0dd1d86d3e31..38bec1dc9928 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -50,6 +50,15 @@ extern int kvm_hpt_order; /* order of preallocated HPTs */
#define HPTE_V_HVLOCK 0x40UL
#define HPTE_V_ABSENT 0x20UL
+/*
+ * We use this bit in the guest_rpte field of the revmap entry
+ * to indicate a modified HPTE.
+ */
+#define HPTE_GR_MODIFIED (1ul << 62)
+
+/* These bits are reserved in the guest view of the HPTE */
+#define HPTE_GR_RESERVED HPTE_GR_MODIFIED
+
static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
{
unsigned long tmp, old;
@@ -60,7 +69,7 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
" ori %0,%0,%4\n"
" stdcx. %0,0,%2\n"
" beq+ 2f\n"
- " li %1,%3\n"
+ " mr %1,%3\n"
"2: isync"
: "=&r" (tmp), "=&r" (old)
: "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)
@@ -237,4 +246,26 @@ static inline bool slot_is_aligned(struct kvm_memory_slot *memslot,
return !(memslot->base_gfn & mask) && !(memslot->npages & mask);
}
+/*
+ * This works for 4k, 64k and 16M pages on POWER7,
+ * and 4k and 16M pages on PPC970.
+ */
+static inline unsigned long slb_pgsize_encoding(unsigned long psize)
+{
+ unsigned long senc = 0;
+
+ if (psize > 0x1000) {
+ senc = SLB_VSID_L;
+ if (psize == 0x10000)
+ senc |= SLB_VSID_LP_01;
+ }
+ return senc;
+}
+
+static inline int is_vrma_hpte(unsigned long hpte_v)
+{
+ return (hpte_v & ~0xffffffUL) ==
+ (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)));
+}
+
#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
index 30a600fa1b6a..3a79f5325712 100644
--- a/arch/powerpc/include/asm/kvm_booke_hv_asm.h
+++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
@@ -17,6 +17,7 @@
* there are no exceptions for which we fall through directly to
* the normal host handler.
*
+ * 32-bit host
* Expected inputs (normal exceptions):
* SCRATCH0 = saved r10
* r10 = thread struct
@@ -33,14 +34,38 @@
* *(r8 + GPR9) = saved r9
* *(r8 + GPR10) = saved r10 (r10 not yet clobbered)
* *(r8 + GPR11) = saved r11
+ *
+ * 64-bit host
+ * Expected inputs (GEN/GDBELL/DBG/MC exception types):
+ * r10 = saved CR
+ * r13 = PACA_POINTER
+ * *(r13 + PACA_EX##type + EX_R10) = saved r10
+ * *(r13 + PACA_EX##type + EX_R11) = saved r11
+ * SPRN_SPRG_##type##_SCRATCH = saved r13
+ *
+ * Expected inputs (CRIT exception type):
+ * r10 = saved CR
+ * r13 = PACA_POINTER
+ * *(r13 + PACA_EX##type + EX_R10) = saved r10
+ * *(r13 + PACA_EX##type + EX_R11) = saved r11
+ * *(r13 + PACA_EX##type + EX_R13) = saved r13
+ *
+ * Expected inputs (TLB exception type):
+ * r10 = saved CR
+ * r13 = PACA_POINTER
+ * *(r13 + PACA_EX##type + EX_TLB_R10) = saved r10
+ * *(r13 + PACA_EX##type + EX_TLB_R11) = saved r11
+ * SPRN_SPRG_GEN_SCRATCH = saved r13
+ *
+ * Only the bolted version of TLB miss exception handlers is supported now.
*/
.macro DO_KVM intno srr1
#ifdef CONFIG_KVM_BOOKE_HV
BEGIN_FTR_SECTION
mtocrf 0x80, r11 /* check MSR[GS] without clobbering reg */
- bf 3, kvmppc_resume_\intno\()_\srr1
+ bf 3, 1975f
b kvmppc_handler_\intno\()_\srr1
-kvmppc_resume_\intno\()_\srr1:
+1975:
END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
#endif
.endm
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 28e8f5e5c63e..ca9bf459db6a 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -46,7 +46,7 @@
#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
#endif
-#ifdef CONFIG_KVM_BOOK3S_64_HV
+#if !defined(CONFIG_KVM_440)
#include <linux/mmu_notifier.h>
#define KVM_ARCH_WANT_MMU_NOTIFIER
@@ -204,7 +204,7 @@ struct revmap_entry {
};
/*
- * We use the top bit of each memslot->rmap entry as a lock bit,
+ * We use the top bit of each memslot->arch.rmap entry as a lock bit,
* and bit 32 as a present flag. The bottom 32 bits are the
* index in the guest HPT of a HPTE that points to the page.
*/
@@ -215,14 +215,17 @@ struct revmap_entry {
#define KVMPPC_RMAP_PRESENT 0x100000000ul
#define KVMPPC_RMAP_INDEX 0xfffffffful
-/* Low-order bits in kvm->arch.slot_phys[][] */
+/* Low-order bits in memslot->arch.slot_phys[] */
#define KVMPPC_PAGE_ORDER_MASK 0x1f
#define KVMPPC_PAGE_NO_CACHE HPTE_R_I /* 0x20 */
#define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */
#define KVMPPC_GOT_PAGE 0x80
struct kvm_arch_memory_slot {
+#ifdef CONFIG_KVM_BOOK3S_64_HV
unsigned long *rmap;
+ unsigned long *slot_phys;
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
};
struct kvm_arch {
@@ -243,12 +246,12 @@ struct kvm_arch {
int using_mmu_notifiers;
u32 hpt_order;
atomic_t vcpus_running;
+ u32 online_vcores;
unsigned long hpt_npte;
unsigned long hpt_mask;
+ atomic_t hpte_mod_interest;
spinlock_t slot_phys_lock;
- unsigned long *slot_phys[KVM_MEM_SLOTS_NUM];
- int slot_npages[KVM_MEM_SLOTS_NUM];
- unsigned short last_vcpu[NR_CPUS];
+ cpumask_t need_tlb_flush;
struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
struct kvmppc_linear_info *hpt_li;
#endif /* CONFIG_KVM_BOOK3S_64_HV */
@@ -273,6 +276,7 @@ struct kvmppc_vcore {
int nap_count;
int napping_threads;
u16 pcpu;
+ u16 last_cpu;
u8 vcore_state;
u8 in_guest;
struct list_head runnable_threads;
@@ -288,9 +292,10 @@ struct kvmppc_vcore {
/* Values for vcore_state */
#define VCORE_INACTIVE 0
-#define VCORE_RUNNING 1
-#define VCORE_EXITING 2
-#define VCORE_SLEEPING 3
+#define VCORE_SLEEPING 1
+#define VCORE_STARTING 2
+#define VCORE_RUNNING 3
+#define VCORE_EXITING 4
/*
* Struct used to manage memory for a virtual processor area
@@ -346,6 +351,27 @@ struct kvmppc_slb {
bool class : 1;
};
+# ifdef CONFIG_PPC_FSL_BOOK3E
+#define KVMPPC_BOOKE_IAC_NUM 2
+#define KVMPPC_BOOKE_DAC_NUM 2
+# else
+#define KVMPPC_BOOKE_IAC_NUM 4
+#define KVMPPC_BOOKE_DAC_NUM 2
+# endif
+#define KVMPPC_BOOKE_MAX_IAC 4
+#define KVMPPC_BOOKE_MAX_DAC 2
+
+struct kvmppc_booke_debug_reg {
+ u32 dbcr0;
+ u32 dbcr1;
+ u32 dbcr2;
+#ifdef CONFIG_KVM_E500MC
+ u32 dbcr4;
+#endif
+ u64 iac[KVMPPC_BOOKE_MAX_IAC];
+ u64 dac[KVMPPC_BOOKE_MAX_DAC];
+};
+
struct kvm_vcpu_arch {
ulong host_stack;
u32 host_pid;
@@ -380,13 +406,18 @@ struct kvm_vcpu_arch {
u32 host_mas4;
u32 host_mas6;
u32 shadow_epcr;
- u32 epcr;
u32 shadow_msrp;
u32 eplc;
u32 epsc;
u32 oldpir;
#endif
+#if defined(CONFIG_BOOKE)
+#if defined(CONFIG_KVM_BOOKE_HV) || defined(CONFIG_64BIT)
+ u32 epcr;
+#endif
+#endif
+
#ifdef CONFIG_PPC_BOOK3S
/* For Gekko paired singles */
u32 qpr[32];
@@ -440,8 +471,6 @@ struct kvm_vcpu_arch {
u32 ccr0;
u32 ccr1;
- u32 dbcr0;
- u32 dbcr1;
u32 dbsr;
u64 mmcr[3];
@@ -471,9 +500,12 @@ struct kvm_vcpu_arch {
ulong fault_esr;
ulong queued_dear;
ulong queued_esr;
+ spinlock_t wdt_lock;
+ struct timer_list wdt_timer;
u32 tlbcfg[4];
u32 mmucfg;
u32 epr;
+ struct kvmppc_booke_debug_reg dbg_reg;
#endif
gpa_t paddr_accessed;
gva_t vaddr_accessed;
@@ -486,6 +518,7 @@ struct kvm_vcpu_arch {
u8 osi_needed;
u8 osi_enabled;
u8 papr_enabled;
+ u8 watchdog_enabled;
u8 sane;
u8 cpu_type;
u8 hcall_needed;
@@ -497,7 +530,6 @@ struct kvm_vcpu_arch {
u64 dec_jiffies;
u64 dec_expires;
unsigned long pending_exceptions;
- u16 last_cpu;
u8 ceded;
u8 prodded;
u32 last_inst;
@@ -534,13 +566,17 @@ struct kvm_vcpu_arch {
unsigned long dtl_index;
u64 stolen_logged;
struct kvmppc_vpa slb_shadow;
+
+ spinlock_t tbacct_lock;
+ u64 busy_stolen;
+ u64 busy_preempt;
#endif
};
/* Values for vcpu->arch.state */
-#define KVMPPC_VCPU_STOPPED 0
-#define KVMPPC_VCPU_BUSY_IN_HOST 1
-#define KVMPPC_VCPU_RUNNABLE 2
+#define KVMPPC_VCPU_NOTREADY 0
+#define KVMPPC_VCPU_RUNNABLE 1
+#define KVMPPC_VCPU_BUSY_IN_HOST 2
/* Values for vcpu->arch.io_gpr */
#define KVM_MMIO_REG_MASK 0x001f
diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index 9365860fb7f6..2b119654b4c1 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -21,7 +21,6 @@
#include <uapi/asm/kvm_para.h>
-
#ifdef CONFIG_KVM_GUEST
#include <linux/of.h>
@@ -55,7 +54,7 @@ static unsigned long kvm_hypercall(unsigned long *in,
unsigned long *out,
unsigned long nr)
{
- return HC_EV_UNIMPLEMENTED;
+ return EV_UNIMPLEMENTED;
}
#endif
@@ -66,7 +65,7 @@ static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2)
unsigned long out[8];
unsigned long r;
- r = kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+ r = kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
*r2 = out[0];
return r;
@@ -77,7 +76,7 @@ static inline long kvm_hypercall0(unsigned int nr)
unsigned long in[8];
unsigned long out[8];
- return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+ return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
}
static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
@@ -86,7 +85,7 @@ static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
unsigned long out[8];
in[0] = p1;
- return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+ return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
}
static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
@@ -97,7 +96,7 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
in[0] = p1;
in[1] = p2;
- return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+ return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
}
static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
@@ -109,7 +108,7 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
in[0] = p1;
in[1] = p2;
in[2] = p3;
- return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+ return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
}
static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
@@ -123,7 +122,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
in[1] = p2;
in[2] = p3;
in[3] = p4;
- return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+ return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
}
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index e006f0bdea95..572aa7530619 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -28,6 +28,7 @@
#include <linux/types.h>
#include <linux/kvm_types.h>
#include <linux/kvm_host.h>
+#include <linux/bug.h>
#ifdef CONFIG_PPC_BOOK3S
#include <asm/kvm_book3s.h>
#else
@@ -68,6 +69,8 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
extern void kvmppc_decrementer_func(unsigned long data);
extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
+extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu);
/* Core-specific hooks */
@@ -104,6 +107,7 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq);
extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq);
+extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu);
extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int op, int *advance);
@@ -111,6 +115,7 @@ extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn,
ulong val);
extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn,
ulong *val);
+extern int kvmppc_core_check_requests(struct kvm_vcpu *vcpu);
extern int kvmppc_booke_init(void);
extern void kvmppc_booke_exit(void);
@@ -139,16 +144,28 @@ extern struct kvmppc_linear_info *kvm_alloc_hpt(void);
extern void kvm_release_hpt(struct kvmppc_linear_info *li);
extern int kvmppc_core_init_vm(struct kvm *kvm);
extern void kvmppc_core_destroy_vm(struct kvm *kvm);
+extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont);
+extern int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
+ unsigned long npages);
extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
struct kvm_userspace_memory_region *mem);
extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem);
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old);
extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm,
struct kvm_ppc_smmu_info *info);
+extern void kvmppc_core_flush_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *memslot);
extern int kvmppc_bookehv_init(void);
extern void kvmppc_bookehv_exit(void);
+extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
+
+extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
+
/*
* Cuts out inst bits with ordering according to spec.
* That means the leftmost bit is zero. All given bits are included.
@@ -182,6 +199,41 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value)
return r;
}
+union kvmppc_one_reg {
+ u32 wval;
+ u64 dval;
+ vector128 vval;
+ u64 vsxval[2];
+ struct {
+ u64 addr;
+ u64 length;
+ } vpaval;
+};
+
+#define one_reg_size(id) \
+ (1ul << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
+
+#define get_reg_val(id, reg) ({ \
+ union kvmppc_one_reg __u; \
+ switch (one_reg_size(id)) { \
+ case 4: __u.wval = (reg); break; \
+ case 8: __u.dval = (reg); break; \
+ default: BUG(); \
+ } \
+ __u; \
+})
+
+
+#define set_reg_val(id, val) ({ \
+ u64 __v; \
+ switch (one_reg_size(id)) { \
+ case 4: __v = (val).wval; break; \
+ case 8: __v = (val).dval; break; \
+ default: BUG(); \
+ } \
+ __v; \
+})
+
void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
@@ -190,6 +242,8 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *);
void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid);
@@ -230,5 +284,36 @@ static inline void kvmppc_mmu_flush_icache(pfn_t pfn)
}
}
+/* Please call after prepare_to_enter. This function puts the lazy ee state
+ back to normal mode, without actually enabling interrupts. */
+static inline void kvmppc_lazy_ee_enable(void)
+{
+#ifdef CONFIG_PPC64
+ /* Only need to enable IRQs by hard enabling them after this */
+ local_paca->irq_happened = 0;
+ local_paca->soft_enabled = 1;
+#endif
+}
+
+static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb)
+{
+ ulong ea;
+ ulong msr_64bit = 0;
+
+ ea = kvmppc_get_gpr(vcpu, rb);
+ if (ra)
+ ea += kvmppc_get_gpr(vcpu, ra);
+
+#if defined(CONFIG_PPC_BOOK3E_64)
+ msr_64bit = MSR_CM;
+#elif defined(CONFIG_PPC_BOOK3S_64)
+ msr_64bit = MSR_SF;
+#endif
+
+ if (!(vcpu->arch.shared->msr & msr_64bit))
+ ea = (uint32_t)ea;
+
+ return ea;
+}
#endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index eeabcdbc30f7..99d43e0c1e4a 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -59,7 +59,7 @@
#define MAS1_TSIZE_SHIFT 7
#define MAS1_TSIZE(x) (((x) << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK)
-#define MAS2_EPN 0xFFFFF000
+#define MAS2_EPN (~0xFFFUL)
#define MAS2_X0 0x00000040
#define MAS2_X1 0x00000020
#define MAS2_W 0x00000010
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
index 9673f73eb8db..2fdb47a19efd 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -121,6 +121,16 @@ extern char initial_stab[];
#define PP_RXRX 3 /* Supervisor read, User read */
#define PP_RXXX (HPTE_R_PP0 | 2) /* Supervisor read, user none */
+/* Fields for tlbiel instruction in architecture 2.06 */
+#define TLBIEL_INVAL_SEL_MASK 0xc00 /* invalidation selector */
+#define TLBIEL_INVAL_PAGE 0x000 /* invalidate a single page */
+#define TLBIEL_INVAL_SET_LPID 0x800 /* invalidate a set for current LPID */
+#define TLBIEL_INVAL_SET 0xc00 /* invalidate a set for all LPIDs */
+#define TLBIEL_INVAL_SET_MASK 0xfff000 /* set number to inval. */
+#define TLBIEL_INVAL_SET_SHIFT 12
+
+#define POWER7_TLB_SETS 128 /* # sets in POWER7 TLB */
+
#ifndef __ASSEMBLY__
struct hash_pte {
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index d24c14163966..97d37278ea2d 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -518,6 +518,7 @@
#define SRR1_WS_DEEPER 0x00020000 /* Some resources not maintained */
#define SRR1_WS_DEEP 0x00010000 /* All resources maintained */
#define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */
+#define SRR1_PROGILL 0x00080000 /* Illegal instruction */
#define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */
#define SRR1_PROGTRAP 0x00020000 /* Trap */
#define SRR1_PROGADDR 0x00010000 /* SRR0 contains subsequent addr */
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2d916c4982c5..e07e6af5e1ff 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -539,6 +539,13 @@
#define TCR_FIE 0x00800000 /* FIT Interrupt Enable */
#define TCR_ARE 0x00400000 /* Auto Reload Enable */
+#ifdef CONFIG_E500
+#define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \
+ (((tcr) & 0x1E0000) >> 15))
+#else
+#define TCR_GET_WP(tcr) (((tcr) & 0xC0000000) >> 30)
+#endif
+
/* Bit definitions for the TSR. */
#define TSR_ENW 0x80000000 /* Enable Next Watchdog */
#define TSR_WIS 0x40000000 /* WDT Interrupt Status */
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index e807e9d8e3f7..5a4e437c238d 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -67,6 +67,14 @@ void generic_mach_cpu_die(void);
void generic_set_cpu_dead(unsigned int cpu);
void generic_set_cpu_up(unsigned int cpu);
int generic_check_cpu_restart(unsigned int cpu);
+
+extern void inhibit_secondary_onlining(void);
+extern void uninhibit_secondary_onlining(void);
+
+#else /* HOTPLUG_CPU */
+static inline void inhibit_secondary_onlining(void) {}
+static inline void uninhibit_secondary_onlining(void) {}
+
#endif
#ifdef CONFIG_PPC64
diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild
index a33c3c03bb2e..f7bca6370745 100644
--- a/arch/powerpc/include/uapi/asm/Kbuild
+++ b/arch/powerpc/include/uapi/asm/Kbuild
@@ -7,6 +7,7 @@ header-y += bootx.h
header-y += byteorder.h
header-y += cputable.h
header-y += elf.h
+header-y += epapr_hcalls.h
header-y += errno.h
header-y += fcntl.h
header-y += ioctl.h
diff --git a/arch/powerpc/include/uapi/asm/epapr_hcalls.h b/arch/powerpc/include/uapi/asm/epapr_hcalls.h
new file mode 100644
index 000000000000..7f9c74b46704
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/epapr_hcalls.h
@@ -0,0 +1,98 @@
+/*
+ * ePAPR hcall interface
+ *
+ * Copyright 2008-2011 Freescale Semiconductor, Inc.
+ *
+ * Author: Timur Tabi <timur@freescale.com>
+ *
+ * This file is provided under a dual BSD/GPL license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _UAPI_ASM_POWERPC_EPAPR_HCALLS_H
+#define _UAPI_ASM_POWERPC_EPAPR_HCALLS_H
+
+#define EV_BYTE_CHANNEL_SEND 1
+#define EV_BYTE_CHANNEL_RECEIVE 2
+#define EV_BYTE_CHANNEL_POLL 3
+#define EV_INT_SET_CONFIG 4
+#define EV_INT_GET_CONFIG 5
+#define EV_INT_SET_MASK 6
+#define EV_INT_GET_MASK 7
+#define EV_INT_IACK 9
+#define EV_INT_EOI 10
+#define EV_INT_SEND_IPI 11
+#define EV_INT_SET_TASK_PRIORITY 12
+#define EV_INT_GET_TASK_PRIORITY 13
+#define EV_DOORBELL_SEND 14
+#define EV_MSGSND 15
+#define EV_IDLE 16
+
+/* vendor ID: epapr */
+#define EV_LOCAL_VENDOR_ID 0 /* for private use */
+#define EV_EPAPR_VENDOR_ID 1
+#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */
+#define EV_IBM_VENDOR_ID 3 /* IBM */
+#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */
+#define EV_ENEA_VENDOR_ID 5 /* Enea */
+#define EV_WR_VENDOR_ID 6 /* Wind River Systems */
+#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */
+#define EV_KVM_VENDOR_ID 42 /* KVM */
+
+/* The max number of bytes that a byte channel can send or receive per call */
+#define EV_BYTE_CHANNEL_MAX_BYTES 16
+
+
+#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num))
+#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num)
+
+/* epapr return codes */
+#define EV_SUCCESS 0
+#define EV_EPERM 1 /* Operation not permitted */
+#define EV_ENOENT 2 /* Entry Not Found */
+#define EV_EIO 3 /* I/O error occured */
+#define EV_EAGAIN 4 /* The operation had insufficient
+ * resources to complete and should be
+ * retried
+ */
+#define EV_ENOMEM 5 /* There was insufficient memory to
+ * complete the operation */
+#define EV_EFAULT 6 /* Bad guest address */
+#define EV_ENODEV 7 /* No such device */
+#define EV_EINVAL 8 /* An argument supplied to the hcall
+ was out of range or invalid */
+#define EV_INTERNAL 9 /* An internal error occured */
+#define EV_CONFIG 10 /* A configuration error was detected */
+#define EV_INVALID_STATE 11 /* The object is in an invalid state */
+#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */
+#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */
+
+#endif /* _UAPI_ASM_POWERPC_EPAPR_HCALLS_H */
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 1bea4d8ea6f4..2fba8a66fb10 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -221,6 +221,12 @@ struct kvm_sregs {
__u32 dbsr; /* KVM_SREGS_E_UPDATE_DBSR */
__u32 dbcr[3];
+ /*
+ * iac/dac registers are 64bit wide, while this API
+ * interface provides only lower 32 bits on 64 bit
+ * processors. ONE_REG interface is added for 64bit
+ * iac/dac registers.
+ */
__u32 iac[4];
__u32 dac[2];
__u32 dvc[2];
@@ -325,6 +331,86 @@ struct kvm_book3e_206_tlb_params {
__u32 reserved[8];
};
+/* For KVM_PPC_GET_HTAB_FD */
+struct kvm_get_htab_fd {
+ __u64 flags;
+ __u64 start_index;
+ __u64 reserved[2];
+};
+
+/* Values for kvm_get_htab_fd.flags */
+#define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1)
+#define KVM_GET_HTAB_WRITE ((__u64)0x2)
+
+/*
+ * Data read on the file descriptor is formatted as a series of
+ * records, each consisting of a header followed by a series of
+ * `n_valid' HPTEs (16 bytes each), which are all valid. Following
+ * those valid HPTEs there are `n_invalid' invalid HPTEs, which
+ * are not represented explicitly in the stream. The same format
+ * is used for writing.
+ */
+struct kvm_get_htab_header {
+ __u32 index;
+ __u16 n_valid;
+ __u16 n_invalid;
+};
+
#define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1)
+#define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2)
+#define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3)
+#define KVM_REG_PPC_IAC3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x4)
+#define KVM_REG_PPC_IAC4 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x5)
+#define KVM_REG_PPC_DAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x6)
+#define KVM_REG_PPC_DAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x7)
+#define KVM_REG_PPC_DABR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8)
+#define KVM_REG_PPC_DSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9)
+#define KVM_REG_PPC_PURR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa)
+#define KVM_REG_PPC_SPURR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb)
+#define KVM_REG_PPC_DAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc)
+#define KVM_REG_PPC_DSISR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xd)
+#define KVM_REG_PPC_AMR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xe)
+#define KVM_REG_PPC_UAMOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xf)
+
+#define KVM_REG_PPC_MMCR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
+#define KVM_REG_PPC_MMCR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
+#define KVM_REG_PPC_MMCRA (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
+
+#define KVM_REG_PPC_PMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18)
+#define KVM_REG_PPC_PMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19)
+#define KVM_REG_PPC_PMC3 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1a)
+#define KVM_REG_PPC_PMC4 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1b)
+#define KVM_REG_PPC_PMC5 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1c)
+#define KVM_REG_PPC_PMC6 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1d)
+#define KVM_REG_PPC_PMC7 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1e)
+#define KVM_REG_PPC_PMC8 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1f)
+
+/* 32 floating-point registers */
+#define KVM_REG_PPC_FPR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x20)
+#define KVM_REG_PPC_FPR(n) (KVM_REG_PPC_FPR0 + (n))
+#define KVM_REG_PPC_FPR31 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3f)
+
+/* 32 VMX/Altivec vector registers */
+#define KVM_REG_PPC_VR0 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x40)
+#define KVM_REG_PPC_VR(n) (KVM_REG_PPC_VR0 + (n))
+#define KVM_REG_PPC_VR31 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x5f)
+
+/* 32 double-width FP registers for VSX */
+/* High-order halves overlap with FP regs */
+#define KVM_REG_PPC_VSR0 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x60)
+#define KVM_REG_PPC_VSR(n) (KVM_REG_PPC_VSR0 + (n))
+#define KVM_REG_PPC_VSR31 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x7f)
+
+/* FP and vector status/control registers */
+#define KVM_REG_PPC_FPSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x80)
+#define KVM_REG_PPC_VSCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x81)
+
+/* Virtual processor areas */
+/* For SLB & DTL, address in high (first) half, length in low half */
+#define KVM_REG_PPC_VPA_ADDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x82)
+#define KVM_REG_PPC_VPA_SLB (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x83)
+#define KVM_REG_PPC_VPA_DTL (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84)
+
+#define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85)
#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/include/uapi/asm/kvm_para.h b/arch/powerpc/include/uapi/asm/kvm_para.h
index 5e04383a1db5..ed0e0254b47f 100644
--- a/arch/powerpc/include/uapi/asm/kvm_para.h
+++ b/arch/powerpc/include/uapi/asm/kvm_para.h
@@ -75,9 +75,10 @@ struct kvm_vcpu_arch_shared {
};
#define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */
-#define HC_VENDOR_KVM (42 << 16)
-#define HC_EV_SUCCESS 0
-#define HC_EV_UNIMPLEMENTED 12
+
+#define KVM_HCALL_TOKEN(num) _EV_HCALL_TOKEN(EV_KVM_VENDOR_ID, num)
+
+#include <uapi/asm/epapr_hcalls.h>
#define KVM_FEATURE_MAGIC_PAGE 1
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 7523539cfe9f..4e23ba2f3ca7 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -441,8 +441,7 @@ int main(void)
DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
- DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter));
- DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu));
+ DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
@@ -470,7 +469,6 @@ int main(void)
DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb));
DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max));
DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
- DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu));
DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
diff --git a/arch/powerpc/kernel/epapr_hcalls.S b/arch/powerpc/kernel/epapr_hcalls.S
index 697b390ebfd8..62c0dc237826 100644
--- a/arch/powerpc/kernel/epapr_hcalls.S
+++ b/arch/powerpc/kernel/epapr_hcalls.S
@@ -8,13 +8,41 @@
*/
#include <linux/threads.h>
+#include <asm/epapr_hcalls.h>
#include <asm/reg.h>
#include <asm/page.h>
#include <asm/cputable.h>
#include <asm/thread_info.h>
#include <asm/ppc_asm.h>
+#include <asm/asm-compat.h>
#include <asm/asm-offsets.h>
+/* epapr_ev_idle() was derived from e500_idle() */
+_GLOBAL(epapr_ev_idle)
+ CURRENT_THREAD_INFO(r3, r1)
+ PPC_LL r4, TI_LOCAL_FLAGS(r3) /* set napping bit */
+ ori r4, r4,_TLF_NAPPING /* so when we take an exception */
+ PPC_STL r4, TI_LOCAL_FLAGS(r3) /* it will return to our caller */
+
+ wrteei 1
+
+idle_loop:
+ LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE))
+
+.global epapr_ev_idle_start
+epapr_ev_idle_start:
+ li r3, -1
+ nop
+ nop
+ nop
+
+ /*
+ * Guard against spurious wakeups from a hypervisor --
+ * only interrupt will cause us to return to LR due to
+ * _TLF_NAPPING.
+ */
+ b idle_loop
+
/* Hypercall entry point. Will be patched with device tree instructions. */
.global epapr_hypercall_start
epapr_hypercall_start:
diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c
index 028aeae370b6..f3eab8594d9f 100644
--- a/arch/powerpc/kernel/epapr_paravirt.c
+++ b/arch/powerpc/kernel/epapr_paravirt.c
@@ -21,6 +21,10 @@
#include <asm/epapr_hcalls.h>
#include <asm/cacheflush.h>
#include <asm/code-patching.h>
+#include <asm/machdep.h>
+
+extern void epapr_ev_idle(void);
+extern u32 epapr_ev_idle_start[];
bool epapr_paravirt_enabled;
@@ -41,8 +45,13 @@ static int __init epapr_paravirt_init(void)
if (len % 4 || len > (4 * 4))
return -ENODEV;
- for (i = 0; i < (len / 4); i++)
+ for (i = 0; i < (len / 4); i++) {
patch_instruction(epapr_hypercall_start + i, insts[i]);
+ patch_instruction(epapr_ev_idle_start + i, insts[i]);
+ }
+
+ if (of_get_property(hyper_node, "has-idle", NULL))
+ ppc_md.power_save = epapr_ev_idle;
epapr_paravirt_enabled = true;
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 867db1de8949..a61b133c4f99 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -419,7 +419,7 @@ static void kvm_map_magic_page(void *data)
in[0] = KVM_MAGIC_PAGE;
in[1] = KVM_MAGIC_PAGE;
- kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE);
+ kvm_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
*features = out[0];
}
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 19e4288d8486..78b8766fd79e 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -43,6 +43,7 @@
#include <asm/dcr.h>
#include <asm/ftrace.h>
#include <asm/switch_to.h>
+#include <asm/epapr_hcalls.h>
#ifdef CONFIG_PPC32
extern void transfer_to_handler(void);
@@ -191,3 +192,7 @@ EXPORT_SYMBOL(__arch_hweight64);
#ifdef CONFIG_PPC_BOOK3S_64
EXPORT_SYMBOL_GPL(mmu_psize_defs);
#endif
+
+#ifdef CONFIG_EPAPR_PARAVIRT
+EXPORT_SYMBOL(epapr_hypercall_start);
+#endif
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 2b952b5386fd..e5b133ebd8a5 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -427,6 +427,45 @@ int generic_check_cpu_restart(unsigned int cpu)
{
return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE;
}
+
+static atomic_t secondary_inhibit_count;
+
+/*
+ * Don't allow secondary CPU threads to come online
+ */
+void inhibit_secondary_onlining(void)
+{
+ /*
+ * This makes secondary_inhibit_count stable during cpu
+ * online/offline operations.
+ */
+ get_online_cpus();
+
+ atomic_inc(&secondary_inhibit_count);
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(inhibit_secondary_onlining);
+
+/*
+ * Allow secondary CPU threads to come online again
+ */
+void uninhibit_secondary_onlining(void)
+{
+ get_online_cpus();
+ atomic_dec(&secondary_inhibit_count);
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(uninhibit_secondary_onlining);
+
+static int secondaries_inhibited(void)
+{
+ return atomic_read(&secondary_inhibit_count);
+}
+
+#else /* HOTPLUG_CPU */
+
+#define secondaries_inhibited() 0
+
#endif
static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle)
@@ -445,6 +484,13 @@ int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle)
{
int rc, c;
+ /*
+ * Don't allow secondary threads to come online if inhibited
+ */
+ if (threads_per_core > 1 && secondaries_inhibited() &&
+ cpu % threads_per_core != 0)
+ return -EBUSY;
+
if (smp_ops == NULL ||
(smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu)))
return -EINVAL;
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 50e7dbc7356c..3d7fd21c65f9 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -83,6 +83,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
vcpu_44x->shadow_refs[i].gtlb_index = -1;
vcpu->arch.cpu_type = KVM_CPU_440;
+ vcpu->arch.pvr = mfspr(SPRN_PVR);
return 0;
}
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index c8c61578fdfc..35ec0a8547da 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -27,12 +27,70 @@
#include "booke.h"
#include "44x_tlb.h"
+#define XOP_MFDCRX 259
#define XOP_MFDCR 323
+#define XOP_MTDCRX 387
#define XOP_MTDCR 451
#define XOP_TLBSX 914
#define XOP_ICCCI 966
#define XOP_TLBWE 978
+static int emulate_mtdcr(struct kvm_vcpu *vcpu, int rs, int dcrn)
+{
+ /* emulate some access in kernel */
+ switch (dcrn) {
+ case DCRN_CPR0_CONFIG_ADDR:
+ vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs);
+ return EMULATE_DONE;
+ default:
+ vcpu->run->dcr.dcrn = dcrn;
+ vcpu->run->dcr.data = kvmppc_get_gpr(vcpu, rs);
+ vcpu->run->dcr.is_write = 1;
+ vcpu->arch.dcr_is_write = 1;
+ vcpu->arch.dcr_needed = 1;
+ kvmppc_account_exit(vcpu, DCR_EXITS);
+ return EMULATE_DO_DCR;
+ }
+}
+
+static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn)
+{
+ /* The guest may access CPR0 registers to determine the timebase
+ * frequency, and it must know the real host frequency because it
+ * can directly access the timebase registers.
+ *
+ * It would be possible to emulate those accesses in userspace,
+ * but userspace can really only figure out the end frequency.
+ * We could decompose that into the factors that compute it, but
+ * that's tricky math, and it's easier to just report the real
+ * CPR0 values.
+ */
+ switch (dcrn) {
+ case DCRN_CPR0_CONFIG_ADDR:
+ kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr);
+ break;
+ case DCRN_CPR0_CONFIG_DATA:
+ local_irq_disable();
+ mtdcr(DCRN_CPR0_CONFIG_ADDR,
+ vcpu->arch.cpr0_cfgaddr);
+ kvmppc_set_gpr(vcpu, rt,
+ mfdcr(DCRN_CPR0_CONFIG_DATA));
+ local_irq_enable();
+ break;
+ default:
+ vcpu->run->dcr.dcrn = dcrn;
+ vcpu->run->dcr.data = 0;
+ vcpu->run->dcr.is_write = 0;
+ vcpu->arch.dcr_is_write = 0;
+ vcpu->arch.io_gpr = rt;
+ vcpu->arch.dcr_needed = 1;
+ kvmppc_account_exit(vcpu, DCR_EXITS);
+ return EMULATE_DO_DCR;
+ }
+
+ return EMULATE_DONE;
+}
+
int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int inst, int *advance)
{
@@ -50,55 +108,21 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
switch (get_xop(inst)) {
case XOP_MFDCR:
- /* The guest may access CPR0 registers to determine the timebase
- * frequency, and it must know the real host frequency because it
- * can directly access the timebase registers.
- *
- * It would be possible to emulate those accesses in userspace,
- * but userspace can really only figure out the end frequency.
- * We could decompose that into the factors that compute it, but
- * that's tricky math, and it's easier to just report the real
- * CPR0 values.
- */
- switch (dcrn) {
- case DCRN_CPR0_CONFIG_ADDR:
- kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr);
- break;
- case DCRN_CPR0_CONFIG_DATA:
- local_irq_disable();
- mtdcr(DCRN_CPR0_CONFIG_ADDR,
- vcpu->arch.cpr0_cfgaddr);
- kvmppc_set_gpr(vcpu, rt,
- mfdcr(DCRN_CPR0_CONFIG_DATA));
- local_irq_enable();
- break;
- default:
- run->dcr.dcrn = dcrn;
- run->dcr.data = 0;
- run->dcr.is_write = 0;
- vcpu->arch.io_gpr = rt;
- vcpu->arch.dcr_needed = 1;
- kvmppc_account_exit(vcpu, DCR_EXITS);
- emulated = EMULATE_DO_DCR;
- }
+ emulated = emulate_mfdcr(vcpu, rt, dcrn);
+ break;
+ case XOP_MFDCRX:
+ emulated = emulate_mfdcr(vcpu, rt,
+ kvmppc_get_gpr(vcpu, ra));
break;
case XOP_MTDCR:
- /* emulate some access in kernel */
- switch (dcrn) {
- case DCRN_CPR0_CONFIG_ADDR:
- vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs);
- break;
- default:
- run->dcr.dcrn = dcrn;
- run->dcr.data = kvmppc_get_gpr(vcpu, rs);
- run->dcr.is_write = 1;
- vcpu->arch.dcr_needed = 1;
- kvmppc_account_exit(vcpu, DCR_EXITS);
- emulated = EMULATE_DO_DCR;
- }
+ emulated = emulate_mtdcr(vcpu, rs, dcrn);
+ break;
+ case XOP_MTDCRX:
+ emulated = emulate_mtdcr(vcpu, rs,
+ kvmppc_get_gpr(vcpu, ra));
break;
case XOP_TLBWE:
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index f4dacb9c57fa..4730c953f435 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,6 +20,7 @@ config KVM
bool
select PREEMPT_NOTIFIERS
select ANON_INODES
+ select HAVE_KVM_EVENTFD
config KVM_BOOK3S_HANDLER
bool
@@ -36,6 +37,7 @@ config KVM_BOOK3S_64_HANDLER
config KVM_BOOK3S_PR
bool
select KVM_MMIO
+ select MMU_NOTIFIER
config KVM_BOOK3S_32
tristate "KVM support for PowerPC book3s_32 processors"
@@ -123,6 +125,7 @@ config KVM_E500V2
depends on EXPERIMENTAL && E500 && !PPC_E500MC
select KVM
select KVM_MMIO
+ select MMU_NOTIFIER
---help---
Support running unmodified E500 guest kernels in virtual machines on
E500v2 host processors.
@@ -138,6 +141,7 @@ config KVM_E500MC
select KVM
select KVM_MMIO
select KVM_BOOKE_HV
+ select MMU_NOTIFIER
---help---
Support running unmodified E500MC/E5500 (32-bit) guest kernels in
virtual machines on E500MC/E5500 host processors.
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index c2a08636e6d4..1e473d46322c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -6,7 +6,8 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
-common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
+common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \
+ eventfd.o)
CFLAGS_44x_tlb.o := -I.
CFLAGS_e500_tlb.o := -I.
@@ -72,10 +73,12 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HV) := \
book3s_hv_rmhandlers.o \
book3s_hv_rm_mmu.o \
book3s_64_vio_hv.o \
+ book3s_hv_ras.o \
book3s_hv_builtin.o
kvm-book3s_64-module-objs := \
../../../virt/kvm/kvm_main.o \
+ ../../../virt/kvm/eventfd.o \
powerpc.o \
emulate.o \
book3s.o \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 3f2a8360c857..a4b645285240 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
return 0;
}
+int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+}
+
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
int i;
@@ -476,6 +485,122 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
return -ENOTSUPP;
}
+int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+ int r;
+ union kvmppc_one_reg val;
+ int size;
+ long int i;
+
+ size = one_reg_size(reg->id);
+ if (size > sizeof(val))
+ return -EINVAL;
+
+ r = kvmppc_get_one_reg(vcpu, reg->id, &val);
+
+ if (r == -EINVAL) {
+ r = 0;
+ switch (reg->id) {
+ case KVM_REG_PPC_DAR:
+ val = get_reg_val(reg->id, vcpu->arch.shared->dar);
+ break;
+ case KVM_REG_PPC_DSISR:
+ val = get_reg_val(reg->id, vcpu->arch.shared->dsisr);
+ break;
+ case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
+ i = reg->id - KVM_REG_PPC_FPR0;
+ val = get_reg_val(reg->id, vcpu->arch.fpr[i]);
+ break;
+ case KVM_REG_PPC_FPSCR:
+ val = get_reg_val(reg->id, vcpu->arch.fpscr);
+ break;
+#ifdef CONFIG_ALTIVEC
+ case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
+ if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+ r = -ENXIO;
+ break;
+ }
+ val.vval = vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0];
+ break;
+ case KVM_REG_PPC_VSCR:
+ if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+ r = -ENXIO;
+ break;
+ }
+ val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]);
+ break;
+#endif /* CONFIG_ALTIVEC */
+ default:
+ r = -EINVAL;
+ break;
+ }
+ }
+ if (r)
+ return r;
+
+ if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size))
+ r = -EFAULT;
+
+ return r;
+}
+
+int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+{
+ int r;
+ union kvmppc_one_reg val;
+ int size;
+ long int i;
+
+ size = one_reg_size(reg->id);
+ if (size > sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size))
+ return -EFAULT;
+
+ r = kvmppc_set_one_reg(vcpu, reg->id, &val);
+
+ if (r == -EINVAL) {
+ r = 0;
+ switch (reg->id) {
+ case KVM_REG_PPC_DAR:
+ vcpu->arch.shared->dar = set_reg_val(reg->id, val);
+ break;
+ case KVM_REG_PPC_DSISR:
+ vcpu->arch.shared->dsisr = set_reg_val(reg->id, val);
+ break;
+ case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
+ i = reg->id - KVM_REG_PPC_FPR0;
+ vcpu->arch.fpr[i] = set_reg_val(reg->id, val);
+ break;
+ case KVM_REG_PPC_FPSCR:
+ vcpu->arch.fpscr = set_reg_val(reg->id, val);
+ break;
+#ifdef CONFIG_ALTIVEC
+ case KVM_REG_PPC_VR0 ... KVM_REG_PPC_VR31:
+ if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+ r = -ENXIO;
+ break;
+ }
+ vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0] = val.vval;
+ break;
+ case KVM_REG_PPC_VSCR:
+ if (!cpu_has_feature(CPU_FTR_ALTIVEC)) {
+ r = -ENXIO;
+ break;
+ }
+ vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val);
+ break;
+#endif /* CONFIG_ALTIVEC */
+ default:
+ r = -EINVAL;
+ break;
+ }
+ }
+
+ return r;
+}
+
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
struct kvm_translation *tr)
{
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index b0f625a33345..00e619bf608e 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -155,7 +155,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
/* Get host physical address for gpa */
hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
- if (is_error_pfn(hpaddr)) {
+ if (is_error_noslot_pfn(hpaddr)) {
printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n",
orig_pte->eaddr);
r = -EINVAL;
@@ -254,6 +254,7 @@ next_pteg:
kvmppc_mmu_hpte_cache_map(vcpu, pte);
+ kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
out:
return r;
}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 4d72f9ebc554..ead58e317294 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -93,7 +93,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
/* Get host physical address for gpa */
hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT);
- if (is_error_pfn(hpaddr)) {
+ if (is_error_noslot_pfn(hpaddr)) {
printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr);
r = -EINVAL;
goto out;
@@ -171,6 +171,7 @@ map_again:
kvmppc_mmu_hpte_cache_map(vcpu, pte);
}
+ kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT);
out:
return r;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index d95d11322a15..8cc18abd6dde 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -24,6 +24,9 @@
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
+#include <linux/srcu.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
@@ -40,6 +43,11 @@
/* Power architecture requires HPT is at least 256kB */
#define PPC_MIN_HPT_ORDER 18
+static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
+ long pte_index, unsigned long pteh,
+ unsigned long ptel, unsigned long *pte_idx_ret);
+static void kvmppc_rmap_reset(struct kvm *kvm);
+
long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
{
unsigned long hpt;
@@ -137,10 +145,11 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
/* Set the entire HPT to 0, i.e. invalid HPTEs */
memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
/*
- * Set the whole last_vcpu array to an invalid vcpu number.
- * This ensures that each vcpu will flush its TLB on next entry.
+ * Reset all the reverse-mapping chains for all memslots
*/
- memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu));
+ kvmppc_rmap_reset(kvm);
+ /* Ensure that each vcpu will flush its TLB on next entry. */
+ cpumask_setall(&kvm->arch.need_tlb_flush);
*htab_orderp = order;
err = 0;
} else {
@@ -184,6 +193,7 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
unsigned long addr, hash;
unsigned long psize;
unsigned long hp0, hp1;
+ unsigned long idx_ret;
long ret;
struct kvm *kvm = vcpu->kvm;
@@ -215,7 +225,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
hash = (hash << 3) + 7;
hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
hp_r = hp1 | addr;
- ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r);
+ ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
+ &idx_ret);
if (ret != H_SUCCESS) {
pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
addr, ret);
@@ -260,7 +271,7 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
/*
* This is called to get a reference to a guest page if there isn't
- * one already in the kvm->arch.slot_phys[][] arrays.
+ * one already in the memslot->arch.slot_phys[] array.
*/
static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
struct kvm_memory_slot *memslot,
@@ -275,7 +286,7 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
struct vm_area_struct *vma;
unsigned long pfn, i, npages;
- physp = kvm->arch.slot_phys[memslot->id];
+ physp = memslot->arch.slot_phys;
if (!physp)
return -EINVAL;
if (physp[gfn - memslot->base_gfn])
@@ -353,15 +364,10 @@ static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn,
return err;
}
-/*
- * We come here on a H_ENTER call from the guest when we are not
- * using mmu notifiers and we don't have the requested page pinned
- * already.
- */
-long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
- long pte_index, unsigned long pteh, unsigned long ptel)
+long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
+ long pte_index, unsigned long pteh,
+ unsigned long ptel, unsigned long *pte_idx_ret)
{
- struct kvm *kvm = vcpu->kvm;
unsigned long psize, gpa, gfn;
struct kvm_memory_slot *memslot;
long ret;
@@ -389,8 +395,8 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
do_insert:
/* Protect linux PTE lookup from page table destruction */
rcu_read_lock_sched(); /* this disables preemption too */
- vcpu->arch.pgdir = current->mm->pgd;
- ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel);
+ ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
+ current->mm->pgd, false, pte_idx_ret);
rcu_read_unlock_sched();
if (ret == H_TOO_HARD) {
/* this can't happen */
@@ -401,6 +407,19 @@ long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
}
+/*
+ * We come here on a H_ENTER call from the guest when we are not
+ * using mmu notifiers and we don't have the requested page pinned
+ * already.
+ */
+long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+ long pte_index, unsigned long pteh,
+ unsigned long ptel)
+{
+ return kvmppc_virtmode_do_h_enter(vcpu->kvm, flags, pte_index,
+ pteh, ptel, &vcpu->arch.gpr[4]);
+}
+
static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
gva_t eaddr)
{
@@ -570,7 +589,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
struct kvm *kvm = vcpu->kvm;
unsigned long *hptep, hpte[3], r;
unsigned long mmu_seq, psize, pte_size;
- unsigned long gfn, hva, pfn;
+ unsigned long gpa, gfn, hva, pfn;
struct kvm_memory_slot *memslot;
unsigned long *rmap;
struct revmap_entry *rev;
@@ -608,15 +627,14 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* Translate the logical address and get the page */
psize = hpte_page_size(hpte[0], r);
- gfn = hpte_rpn(r, psize);
+ gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1));
+ gfn = gpa >> PAGE_SHIFT;
memslot = gfn_to_memslot(kvm, gfn);
/* No memslot means it's an emulated MMIO region */
- if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
- unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1));
+ if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
dsisr & DSISR_ISSTORE);
- }
if (!kvm->arch.using_mmu_notifiers)
return -EFAULT; /* should never get here */
@@ -710,7 +728,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* Check if we might have been invalidated; let the guest retry if so */
ret = RESUME_GUEST;
- if (mmu_notifier_retry(vcpu, mmu_seq)) {
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
unlock_rmap(rmap);
goto out_unlock;
}
@@ -756,6 +774,25 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
goto out_put;
}
+static void kvmppc_rmap_reset(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int srcu_idx;
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ slots = kvm->memslots;
+ kvm_for_each_memslot(memslot, slots) {
+ /*
+ * This assumes it is acceptable to lose reference and
+ * change bits across a reset.
+ */
+ memset(memslot->arch.rmap, 0,
+ memslot->npages * sizeof(*memslot->arch.rmap));
+ }
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long start,
unsigned long end,
@@ -850,7 +887,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
psize = hpte_page_size(hptep[0], ptel);
if ((hptep[0] & HPTE_V_VALID) &&
hpte_rpn(ptel, psize) == gfn) {
- hptep[0] |= HPTE_V_ABSENT;
+ if (kvm->arch.using_mmu_notifiers)
+ hptep[0] |= HPTE_V_ABSENT;
kvmppc_invalidate_hpte(kvm, hptep, i);
/* Harvest R and C */
rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C);
@@ -877,6 +915,28 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
return 0;
}
+void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+ unsigned long *rmapp;
+ unsigned long gfn;
+ unsigned long n;
+
+ rmapp = memslot->arch.rmap;
+ gfn = memslot->base_gfn;
+ for (n = memslot->npages; n; --n) {
+ /*
+ * Testing the present bit without locking is OK because
+ * the memslot has been marked invalid already, and hence
+ * no new HPTEs referencing this page can be created,
+ * thus the present bit can't go from 0 to 1.
+ */
+ if (*rmapp & KVMPPC_RMAP_PRESENT)
+ kvm_unmap_rmapp(kvm, rmapp, gfn);
+ ++rmapp;
+ ++gfn;
+ }
+}
+
static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
unsigned long gfn)
{
@@ -1030,16 +1090,16 @@ static int kvm_test_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
return ret;
}
-long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
+long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ unsigned long *map)
{
unsigned long i;
- unsigned long *rmapp, *map;
+ unsigned long *rmapp;
preempt_disable();
rmapp = memslot->arch.rmap;
- map = memslot->dirty_bitmap;
for (i = 0; i < memslot->npages; ++i) {
- if (kvm_test_clear_dirty(kvm, rmapp))
+ if (kvm_test_clear_dirty(kvm, rmapp) && map)
__set_bit_le(i, map);
++rmapp;
}
@@ -1057,20 +1117,22 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
unsigned long hva, psize, offset;
unsigned long pa;
unsigned long *physp;
+ int srcu_idx;
+ srcu_idx = srcu_read_lock(&kvm->srcu);
memslot = gfn_to_memslot(kvm, gfn);
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
- return NULL;
+ goto err;
if (!kvm->arch.using_mmu_notifiers) {
- physp = kvm->arch.slot_phys[memslot->id];
+ physp = memslot->arch.slot_phys;
if (!physp)
- return NULL;
+ goto err;
physp += gfn - memslot->base_gfn;
pa = *physp;
if (!pa) {
if (kvmppc_get_guest_page(kvm, gfn, memslot,
PAGE_SIZE) < 0)
- return NULL;
+ goto err;
pa = *physp;
}
page = pfn_to_page(pa >> PAGE_SHIFT);
@@ -1079,9 +1141,11 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
hva = gfn_to_hva_memslot(memslot, gfn);
npages = get_user_pages_fast(hva, 1, 1, pages);
if (npages < 1)
- return NULL;
+ goto err;
page = pages[0];
}
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+
psize = PAGE_SIZE;
if (PageHuge(page)) {
page = compound_head(page);
@@ -1091,6 +1155,10 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
if (nb_ret)
*nb_ret = psize - offset;
return page_address(page) + offset;
+
+ err:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ return NULL;
}
void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
@@ -1100,6 +1168,348 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va)
put_page(page);
}
+/*
+ * Functions for reading and writing the hash table via reads and
+ * writes on a file descriptor.
+ *
+ * Reads return the guest view of the hash table, which has to be
+ * pieced together from the real hash table and the guest_rpte
+ * values in the revmap array.
+ *
+ * On writes, each HPTE written is considered in turn, and if it
+ * is valid, it is written to the HPT as if an H_ENTER with the
+ * exact flag set was done. When the invalid count is non-zero
+ * in the header written to the stream, the kernel will make
+ * sure that that many HPTEs are invalid, and invalidate them
+ * if not.
+ */
+
+struct kvm_htab_ctx {
+ unsigned long index;
+ unsigned long flags;
+ struct kvm *kvm;
+ int first_pass;
+};
+
+#define HPTE_SIZE (2 * sizeof(unsigned long))
+
+static long record_hpte(unsigned long flags, unsigned long *hptp,
+ unsigned long *hpte, struct revmap_entry *revp,
+ int want_valid, int first_pass)
+{
+ unsigned long v, r;
+ int ok = 1;
+ int valid, dirty;
+
+ /* Unmodified entries are uninteresting except on the first pass */
+ dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+ if (!first_pass && !dirty)
+ return 0;
+
+ valid = 0;
+ if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+ valid = 1;
+ if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
+ !(hptp[0] & HPTE_V_BOLTED))
+ valid = 0;
+ }
+ if (valid != want_valid)
+ return 0;
+
+ v = r = 0;
+ if (valid || dirty) {
+ /* lock the HPTE so it's stable and read it */
+ preempt_disable();
+ while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+ cpu_relax();
+ v = hptp[0];
+ if (v & HPTE_V_ABSENT) {
+ v &= ~HPTE_V_ABSENT;
+ v |= HPTE_V_VALID;
+ }
+ /* re-evaluate valid and dirty from synchronized HPTE value */
+ valid = !!(v & HPTE_V_VALID);
+ if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
+ valid = 0;
+ r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C));
+ dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
+ /* only clear modified if this is the right sort of entry */
+ if (valid == want_valid && dirty) {
+ r &= ~HPTE_GR_MODIFIED;
+ revp->guest_rpte = r;
+ }
+ asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
+ hptp[0] &= ~HPTE_V_HVLOCK;
+ preempt_enable();
+ if (!(valid == want_valid && (first_pass || dirty)))
+ ok = 0;
+ }
+ hpte[0] = v;
+ hpte[1] = r;
+ return ok;
+}
+
+static ssize_t kvm_htab_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct kvm_htab_ctx *ctx = file->private_data;
+ struct kvm *kvm = ctx->kvm;
+ struct kvm_get_htab_header hdr;
+ unsigned long *hptp;
+ struct revmap_entry *revp;
+ unsigned long i, nb, nw;
+ unsigned long __user *lbuf;
+ struct kvm_get_htab_header __user *hptr;
+ unsigned long flags;
+ int first_pass;
+ unsigned long hpte[2];
+
+ if (!access_ok(VERIFY_WRITE, buf, count))
+ return -EFAULT;
+
+ first_pass = ctx->first_pass;
+ flags = ctx->flags;
+
+ i = ctx->index;
+ hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+ revp = kvm->arch.revmap + i;
+ lbuf = (unsigned long __user *)buf;
+
+ nb = 0;
+ while (nb + sizeof(hdr) + HPTE_SIZE < count) {
+ /* Initialize header */
+ hptr = (struct kvm_get_htab_header __user *)buf;
+ hdr.n_valid = 0;
+ hdr.n_invalid = 0;
+ nw = nb;
+ nb += sizeof(hdr);
+ lbuf = (unsigned long __user *)(buf + sizeof(hdr));
+
+ /* Skip uninteresting entries, i.e. clean on not-first pass */
+ if (!first_pass) {
+ while (i < kvm->arch.hpt_npte &&
+ !(revp->guest_rpte & HPTE_GR_MODIFIED)) {
+ ++i;
+ hptp += 2;
+ ++revp;
+ }
+ }
+ hdr.index = i;
+
+ /* Grab a series of valid entries */
+ while (i < kvm->arch.hpt_npte &&
+ hdr.n_valid < 0xffff &&
+ nb + HPTE_SIZE < count &&
+ record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
+ /* valid entry, write it out */
+ ++hdr.n_valid;
+ if (__put_user(hpte[0], lbuf) ||
+ __put_user(hpte[1], lbuf + 1))
+ return -EFAULT;
+ nb += HPTE_SIZE;
+ lbuf += 2;
+ ++i;
+ hptp += 2;
+ ++revp;
+ }
+ /* Now skip invalid entries while we can */
+ while (i < kvm->arch.hpt_npte &&
+ hdr.n_invalid < 0xffff &&
+ record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
+ /* found an invalid entry */
+ ++hdr.n_invalid;
+ ++i;
+ hptp += 2;
+ ++revp;
+ }
+
+ if (hdr.n_valid || hdr.n_invalid) {
+ /* write back the header */
+ if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
+ return -EFAULT;
+ nw = nb;
+ buf = (char __user *)lbuf;
+ } else {
+ nb = nw;
+ }
+
+ /* Check if we've wrapped around the hash table */
+ if (i >= kvm->arch.hpt_npte) {
+ i = 0;
+ ctx->first_pass = 0;
+ break;
+ }
+ }
+
+ ctx->index = i;
+
+ return nb;
+}
+
+static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct kvm_htab_ctx *ctx = file->private_data;
+ struct kvm *kvm = ctx->kvm;
+ struct kvm_get_htab_header hdr;
+ unsigned long i, j;
+ unsigned long v, r;
+ unsigned long __user *lbuf;
+ unsigned long *hptp;
+ unsigned long tmp[2];
+ ssize_t nb;
+ long int err, ret;
+ int rma_setup;
+
+ if (!access_ok(VERIFY_READ, buf, count))
+ return -EFAULT;
+
+ /* lock out vcpus from running while we're doing this */
+ mutex_lock(&kvm->lock);
+ rma_setup = kvm->arch.rma_setup_done;
+ if (rma_setup) {
+ kvm->arch.rma_setup_done = 0; /* temporarily */
+ /* order rma_setup_done vs. vcpus_running */
+ smp_mb();
+ if (atomic_read(&kvm->arch.vcpus_running)) {
+ kvm->arch.rma_setup_done = 1;
+ mutex_unlock(&kvm->lock);
+ return -EBUSY;
+ }
+ }
+
+ err = 0;
+ for (nb = 0; nb + sizeof(hdr) <= count; ) {
+ err = -EFAULT;
+ if (__copy_from_user(&hdr, buf, sizeof(hdr)))
+ break;
+
+ err = 0;
+ if (nb + hdr.n_valid * HPTE_SIZE > count)
+ break;
+
+ nb += sizeof(hdr);
+ buf += sizeof(hdr);
+
+ err = -EINVAL;
+ i = hdr.index;
+ if (i >= kvm->arch.hpt_npte ||
+ i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
+ break;
+
+ hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+ lbuf = (unsigned long __user *)buf;
+ for (j = 0; j < hdr.n_valid; ++j) {
+ err = -EFAULT;
+ if (__get_user(v, lbuf) || __get_user(r, lbuf + 1))
+ goto out;
+ err = -EINVAL;
+ if (!(v & HPTE_V_VALID))
+ goto out;
+ lbuf += 2;
+ nb += HPTE_SIZE;
+
+ if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
+ kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
+ err = -EIO;
+ ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
+ tmp);
+ if (ret != H_SUCCESS) {
+ pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
+ "r=%lx\n", ret, i, v, r);
+ goto out;
+ }
+ if (!rma_setup && is_vrma_hpte(v)) {
+ unsigned long psize = hpte_page_size(v, r);
+ unsigned long senc = slb_pgsize_encoding(psize);
+ unsigned long lpcr;
+
+ kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
+ (VRMA_VSID << SLB_VSID_SHIFT_1T);
+ lpcr = kvm->arch.lpcr & ~LPCR_VRMASD;
+ lpcr |= senc << (LPCR_VRMASD_SH - 4);
+ kvm->arch.lpcr = lpcr;
+ rma_setup = 1;
+ }
+ ++i;
+ hptp += 2;
+ }
+
+ for (j = 0; j < hdr.n_invalid; ++j) {
+ if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT))
+ kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
+ ++i;
+ hptp += 2;
+ }
+ err = 0;
+ }
+
+ out:
+ /* Order HPTE updates vs. rma_setup_done */
+ smp_wmb();
+ kvm->arch.rma_setup_done = rma_setup;
+ mutex_unlock(&kvm->lock);
+
+ if (err)
+ return err;
+ return nb;
+}
+
+static int kvm_htab_release(struct inode *inode, struct file *filp)
+{
+ struct kvm_htab_ctx *ctx = filp->private_data;
+
+ filp->private_data = NULL;
+ if (!(ctx->flags & KVM_GET_HTAB_WRITE))
+ atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
+ kvm_put_kvm(ctx->kvm);
+ kfree(ctx);
+ return 0;
+}
+
+static struct file_operations kvm_htab_fops = {
+ .read = kvm_htab_read,
+ .write = kvm_htab_write,
+ .llseek = default_llseek,
+ .release = kvm_htab_release,
+};
+
+int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
+{
+ int ret;
+ struct kvm_htab_ctx *ctx;
+ int rwflag;
+
+ /* reject flags we don't recognize */
+ if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
+ return -EINVAL;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ kvm_get_kvm(kvm);
+ ctx->kvm = kvm;
+ ctx->index = ghf->start_index;
+ ctx->flags = ghf->flags;
+ ctx->first_pass = 1;
+
+ rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
+ ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag);
+ if (ret < 0) {
+ kvm_put_kvm(kvm);
+ return ret;
+ }
+
+ if (rwflag == O_RDONLY) {
+ mutex_lock(&kvm->slots_lock);
+ atomic_inc(&kvm->arch.hpte_mod_interest);
+ /* make sure kvmppc_do_h_enter etc. see the increment */
+ synchronize_srcu_expedited(&kvm->srcu);
+ mutex_unlock(&kvm->slots_lock);
+ }
+
+ return ret;
+}
+
void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
{
struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index b9a989dc76cc..d31a716f7f2b 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -22,6 +22,7 @@
#include <asm/kvm_book3s.h>
#include <asm/reg.h>
#include <asm/switch_to.h>
+#include <asm/time.h>
#define OP_19_XOP_RFID 18
#define OP_19_XOP_RFI 50
@@ -395,6 +396,12 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
(mfmsr() & MSR_HV))
vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32;
break;
+ case SPRN_PURR:
+ to_book3s(vcpu)->purr_offset = spr_val - get_tb();
+ break;
+ case SPRN_SPURR:
+ to_book3s(vcpu)->spurr_offset = spr_val - get_tb();
+ break;
case SPRN_GQR0:
case SPRN_GQR1:
case SPRN_GQR2:
@@ -412,6 +419,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
case SPRN_CTRLF:
case SPRN_CTRLT:
case SPRN_L2CR:
+ case SPRN_DSCR:
case SPRN_MMCR0_GEKKO:
case SPRN_MMCR1_GEKKO:
case SPRN_PMC1_GEKKO:
@@ -483,9 +491,15 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
*spr_val = to_book3s(vcpu)->hid[5];
break;
case SPRN_CFAR:
- case SPRN_PURR:
+ case SPRN_DSCR:
*spr_val = 0;
break;
+ case SPRN_PURR:
+ *spr_val = get_tb() + to_book3s(vcpu)->purr_offset;
+ break;
+ case SPRN_SPURR:
+ *spr_val = get_tb() + to_book3s(vcpu)->purr_offset;
+ break;
case SPRN_GQR0:
case SPRN_GQR1:
case SPRN_GQR2:
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index a150817d6d4c..7057a02f0906 100644
--- a/arch/powerpc/kvm/book3s_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -28,8 +28,5 @@ EXPORT_SYMBOL_GPL(kvmppc_load_up_fpu);
#ifdef CONFIG_ALTIVEC
EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec);
#endif
-#ifdef CONFIG_VSX
-EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx);
-#endif
#endif
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 721d4603a235..71d0c90b62bf 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -30,6 +30,7 @@
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
+#include <linux/srcu.h>
#include <asm/reg.h>
#include <asm/cputable.h>
@@ -46,6 +47,7 @@
#include <asm/page.h>
#include <asm/hvcall.h>
#include <asm/switch_to.h>
+#include <asm/smp.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
@@ -55,25 +57,77 @@
/* #define EXIT_DEBUG_SIMPLE */
/* #define EXIT_DEBUG_INT */
+/* Used to indicate that a guest page fault needs to be handled */
+#define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1)
+
+/* Used as a "null" value for timebase values */
+#define TB_NIL (~(u64)0)
+
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+/*
+ * We use the vcpu_load/put functions to measure stolen time.
+ * Stolen time is counted as time when either the vcpu is able to
+ * run as part of a virtual core, but the task running the vcore
+ * is preempted or sleeping, or when the vcpu needs something done
+ * in the kernel by the task running the vcpu, but that task is
+ * preempted or sleeping. Those two things have to be counted
+ * separately, since one of the vcpu tasks will take on the job
+ * of running the core, and the other vcpu tasks in the vcore will
+ * sleep waiting for it to do that, but that sleep shouldn't count
+ * as stolen time.
+ *
+ * Hence we accumulate stolen time when the vcpu can run as part of
+ * a vcore using vc->stolen_tb, and the stolen time when the vcpu
+ * needs its task to do other things in the kernel (for example,
+ * service a page fault) in busy_stolen. We don't accumulate
+ * stolen time for a vcore when it is inactive, or for a vcpu
+ * when it is in state RUNNING or NOTREADY. NOTREADY is a bit of
+ * a misnomer; it means that the vcpu task is not executing in
+ * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
+ * the kernel. We don't have any way of dividing up that time
+ * between time that the vcpu is genuinely stopped, time that
+ * the task is actively working on behalf of the vcpu, and time
+ * that the task is preempted, so we don't count any of it as
+ * stolen.
+ *
+ * Updates to busy_stolen are protected by arch.tbacct_lock;
+ * updates to vc->stolen_tb are protected by the arch.tbacct_lock
+ * of the vcpu that has taken responsibility for running the vcore
+ * (i.e. vc->runner). The stolen times are measured in units of
+ * timebase ticks. (Note that the != TB_NIL checks below are
+ * purely defensive; they should never fail.)
+ */
+
void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
- local_paca->kvm_hstate.kvm_vcpu = vcpu;
- local_paca->kvm_hstate.kvm_vcore = vc;
- if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
+ spin_lock(&vcpu->arch.tbacct_lock);
+ if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE &&
+ vc->preempt_tb != TB_NIL) {
vc->stolen_tb += mftb() - vc->preempt_tb;
+ vc->preempt_tb = TB_NIL;
+ }
+ if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
+ vcpu->arch.busy_preempt != TB_NIL) {
+ vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
+ vcpu->arch.busy_preempt = TB_NIL;
+ }
+ spin_unlock(&vcpu->arch.tbacct_lock);
}
void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ spin_lock(&vcpu->arch.tbacct_lock);
if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE)
vc->preempt_tb = mftb();
+ if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
+ vcpu->arch.busy_preempt = mftb();
+ spin_unlock(&vcpu->arch.tbacct_lock);
}
void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr)
@@ -142,6 +196,22 @@ static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
vpa->yield_count = 1;
}
+static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
+ unsigned long addr, unsigned long len)
+{
+ /* check address is cacheline aligned */
+ if (addr & (L1_CACHE_BYTES - 1))
+ return -EINVAL;
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ if (v->next_gpa != addr || v->len != len) {
+ v->next_gpa = addr;
+ v->len = addr ? len : 0;
+ v->update_pending = 1;
+ }
+ spin_unlock(&vcpu->arch.vpa_update_lock);
+ return 0;
+}
+
/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
struct reg_vpa {
u32 dummy;
@@ -317,10 +387,16 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
{
+ if (!(vcpu->arch.vpa.update_pending ||
+ vcpu->arch.slb_shadow.update_pending ||
+ vcpu->arch.dtl.update_pending))
+ return;
+
spin_lock(&vcpu->arch.vpa_update_lock);
if (vcpu->arch.vpa.update_pending) {
kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
- init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
+ if (vcpu->arch.vpa.pinned_addr)
+ init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
}
if (vcpu->arch.dtl.update_pending) {
kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
@@ -332,24 +408,61 @@ static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->arch.vpa_update_lock);
}
+/*
+ * Return the accumulated stolen time for the vcore up until `now'.
+ * The caller should hold the vcore lock.
+ */
+static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
+{
+ u64 p;
+
+ /*
+ * If we are the task running the vcore, then since we hold
+ * the vcore lock, we can't be preempted, so stolen_tb/preempt_tb
+ * can't be updated, so we don't need the tbacct_lock.
+ * If the vcore is inactive, it can't become active (since we
+ * hold the vcore lock), so the vcpu load/put functions won't
+ * update stolen_tb/preempt_tb, and we don't need tbacct_lock.
+ */
+ if (vc->vcore_state != VCORE_INACTIVE &&
+ vc->runner->arch.run_task != current) {
+ spin_lock(&vc->runner->arch.tbacct_lock);
+ p = vc->stolen_tb;
+ if (vc->preempt_tb != TB_NIL)
+ p += now - vc->preempt_tb;
+ spin_unlock(&vc->runner->arch.tbacct_lock);
+ } else {
+ p = vc->stolen_tb;
+ }
+ return p;
+}
+
static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
struct kvmppc_vcore *vc)
{
struct dtl_entry *dt;
struct lppaca *vpa;
- unsigned long old_stolen;
+ unsigned long stolen;
+ unsigned long core_stolen;
+ u64 now;
dt = vcpu->arch.dtl_ptr;
vpa = vcpu->arch.vpa.pinned_addr;
- old_stolen = vcpu->arch.stolen_logged;
- vcpu->arch.stolen_logged = vc->stolen_tb;
+ now = mftb();
+ core_stolen = vcore_stolen_time(vc, now);
+ stolen = core_stolen - vcpu->arch.stolen_logged;
+ vcpu->arch.stolen_logged = core_stolen;
+ spin_lock(&vcpu->arch.tbacct_lock);
+ stolen += vcpu->arch.busy_stolen;
+ vcpu->arch.busy_stolen = 0;
+ spin_unlock(&vcpu->arch.tbacct_lock);
if (!dt || !vpa)
return;
memset(dt, 0, sizeof(struct dtl_entry));
dt->dispatch_reason = 7;
dt->processor_id = vc->pcpu + vcpu->arch.ptid;
- dt->timebase = mftb();
- dt->enqueue_to_dispatch_time = vc->stolen_tb - old_stolen;
+ dt->timebase = now;
+ dt->enqueue_to_dispatch_time = stolen;
dt->srr0 = kvmppc_get_pc(vcpu);
dt->srr1 = vcpu->arch.shregs.msr;
++dt;
@@ -366,13 +479,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
unsigned long req = kvmppc_get_gpr(vcpu, 3);
unsigned long target, ret = H_SUCCESS;
struct kvm_vcpu *tvcpu;
+ int idx;
switch (req) {
case H_ENTER:
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
kvmppc_get_gpr(vcpu, 6),
kvmppc_get_gpr(vcpu, 7));
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
case H_CEDE:
break;
@@ -429,6 +545,17 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
case BOOK3S_INTERRUPT_PERFMON:
r = RESUME_GUEST;
break;
+ case BOOK3S_INTERRUPT_MACHINE_CHECK:
+ /*
+ * Deliver a machine check interrupt to the guest.
+ * We have to do this, even if the host has handled the
+ * machine check, because machine checks use SRR0/1 and
+ * the interrupt might have trashed guest state in them.
+ */
+ kvmppc_book3s_queue_irqprio(vcpu,
+ BOOK3S_INTERRUPT_MACHINE_CHECK);
+ r = RESUME_GUEST;
+ break;
case BOOK3S_INTERRUPT_PROGRAM:
{
ulong flags;
@@ -470,12 +597,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
* have been handled already.
*/
case BOOK3S_INTERRUPT_H_DATA_STORAGE:
- r = kvmppc_book3s_hv_page_fault(run, vcpu,
- vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+ r = RESUME_PAGE_FAULT;
break;
case BOOK3S_INTERRUPT_H_INST_STORAGE:
- r = kvmppc_book3s_hv_page_fault(run, vcpu,
- kvmppc_get_pc(vcpu), 0);
+ vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
+ vcpu->arch.fault_dsisr = 0;
+ r = RESUME_PAGE_FAULT;
break;
/*
* This occurs if the guest executes an illegal instruction.
@@ -535,36 +662,174 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
return 0;
}
-int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
{
- int r = -EINVAL;
+ int r = 0;
+ long int i;
- switch (reg->id) {
+ switch (id) {
case KVM_REG_PPC_HIOR:
- r = put_user(0, (u64 __user *)reg->addr);
+ *val = get_reg_val(id, 0);
+ break;
+ case KVM_REG_PPC_DABR:
+ *val = get_reg_val(id, vcpu->arch.dabr);
+ break;
+ case KVM_REG_PPC_DSCR:
+ *val = get_reg_val(id, vcpu->arch.dscr);
+ break;
+ case KVM_REG_PPC_PURR:
+ *val = get_reg_val(id, vcpu->arch.purr);
+ break;
+ case KVM_REG_PPC_SPURR:
+ *val = get_reg_val(id, vcpu->arch.spurr);
+ break;
+ case KVM_REG_PPC_AMR:
+ *val = get_reg_val(id, vcpu->arch.amr);
+ break;
+ case KVM_REG_PPC_UAMOR:
+ *val = get_reg_val(id, vcpu->arch.uamor);
+ break;
+ case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA:
+ i = id - KVM_REG_PPC_MMCR0;
+ *val = get_reg_val(id, vcpu->arch.mmcr[i]);
+ break;
+ case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
+ i = id - KVM_REG_PPC_PMC1;
+ *val = get_reg_val(id, vcpu->arch.pmc[i]);
+ break;
+#ifdef CONFIG_VSX
+ case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
+ if (cpu_has_feature(CPU_FTR_VSX)) {
+ /* VSX => FP reg i is stored in arch.vsr[2*i] */
+ long int i = id - KVM_REG_PPC_FPR0;
+ *val = get_reg_val(id, vcpu->arch.vsr[2 * i]);
+ } else {
+ /* let generic code handle it */
+ r = -EINVAL;
+ }
+ break;
+ case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
+ if (cpu_has_feature(CPU_FTR_VSX)) {
+ long int i = id - KVM_REG_PPC_VSR0;
+ val->vsxval[0] = vcpu->arch.vsr[2 * i];
+ val->vsxval[1] = vcpu->arch.vsr[2 * i + 1];
+ } else {
+ r = -ENXIO;
+ }
+ break;
+#endif /* CONFIG_VSX */
+ case KVM_REG_PPC_VPA_ADDR:
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ *val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
+ spin_unlock(&vcpu->arch.vpa_update_lock);
+ break;
+ case KVM_REG_PPC_VPA_SLB:
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
+ val->vpaval.length = vcpu->arch.slb_shadow.len;
+ spin_unlock(&vcpu->arch.vpa_update_lock);
+ break;
+ case KVM_REG_PPC_VPA_DTL:
+ spin_lock(&vcpu->arch.vpa_update_lock);
+ val->vpaval.addr = vcpu->arch.dtl.next_gpa;
+ val->vpaval.length = vcpu->arch.dtl.len;
+ spin_unlock(&vcpu->arch.vpa_update_lock);
break;
default:
+ r = -EINVAL;
break;
}
return r;
}
-int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
{
- int r = -EINVAL;
+ int r = 0;
+ long int i;
+ unsigned long addr, len;
- switch (reg->id) {
+ switch (id) {
case KVM_REG_PPC_HIOR:
- {
- u64 hior;
/* Only allow this to be set to zero */
- r = get_user(hior, (u64 __user *)reg->addr);
- if (!r && (hior != 0))
+ if (set_reg_val(id, *val))
r = -EINVAL;
break;
- }
+ case KVM_REG_PPC_DABR:
+ vcpu->arch.dabr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_DSCR:
+ vcpu->arch.dscr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_PURR:
+ vcpu->arch.purr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_SPURR:
+ vcpu->arch.spurr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_AMR:
+ vcpu->arch.amr = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_UAMOR:
+ vcpu->arch.uamor = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA:
+ i = id - KVM_REG_PPC_MMCR0;
+ vcpu->arch.mmcr[i] = set_reg_val(id, *val);
+ break;
+ case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
+ i = id - KVM_REG_PPC_PMC1;
+ vcpu->arch.pmc[i] = set_reg_val(id, *val);
+ break;
+#ifdef CONFIG_VSX
+ case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31:
+ if (cpu_has_feature(CPU_FTR_VSX)) {
+ /* VSX => FP reg i is stored in arch.vsr[2*i] */
+ long int i = id - KVM_REG_PPC_FPR0;
+ vcpu->arch.vsr[2 * i] = set_reg_val(id, *val);
+ } else {
+ /* let generic code handle it */
+ r = -EINVAL;
+ }
+ break;
+ case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31:
+ if (cpu_has_feature(CPU_FTR_VSX)) {
+ long int i = id - KVM_REG_PPC_VSR0;
+ vcpu->arch.vsr[2 * i] = val->vsxval[0];
+ vcpu->arch.vsr[2 * i + 1] = val->vsxval[1];
+ } else {
+ r = -ENXIO;
+ }
+ break;
+#endif /* CONFIG_VSX */
+ case KVM_REG_PPC_VPA_ADDR:
+ addr = set_reg_val(id, *val);
+ r = -EINVAL;
+ if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
+ vcpu->arch.dtl.next_gpa))
+ break;
+ r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
+ break;
+ case KVM_REG_PPC_VPA_SLB:
+ addr = val->vpaval.addr;
+ len = val->vpaval.length;
+ r = -EINVAL;
+ if (addr && !vcpu->arch.vpa.next_gpa)
+ break;
+ r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
+ break;
+ case KVM_REG_PPC_VPA_DTL:
+ addr = val->vpaval.addr;
+ len = val->vpaval.length;
+ r = -EINVAL;
+ if (addr && (len < sizeof(struct dtl_entry) ||
+ !vcpu->arch.vpa.next_gpa))
+ break;
+ len -= len % sizeof(struct dtl_entry);
+ r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
+ break;
default:
+ r = -EINVAL;
break;
}
@@ -599,20 +864,18 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
goto free_vcpu;
vcpu->arch.shared = &vcpu->arch.shregs;
- vcpu->arch.last_cpu = -1;
vcpu->arch.mmcr[0] = MMCR0_FC;
vcpu->arch.ctrl = CTRL_RUNLATCH;
/* default to host PVR, since we can't spoof it */
vcpu->arch.pvr = mfspr(SPRN_PVR);
kvmppc_set_pvr(vcpu, vcpu->arch.pvr);
spin_lock_init(&vcpu->arch.vpa_update_lock);
+ spin_lock_init(&vcpu->arch.tbacct_lock);
+ vcpu->arch.busy_preempt = TB_NIL;
kvmppc_mmu_book3s_hv_init(vcpu);
- /*
- * We consider the vcpu stopped until we see the first run ioctl for it.
- */
- vcpu->arch.state = KVMPPC_VCPU_STOPPED;
+ vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
init_waitqueue_head(&vcpu->arch.cpu_run);
@@ -624,9 +887,10 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
INIT_LIST_HEAD(&vcore->runnable_threads);
spin_lock_init(&vcore->lock);
init_waitqueue_head(&vcore->wq);
- vcore->preempt_tb = mftb();
+ vcore->preempt_tb = TB_NIL;
}
kvm->arch.vcores[core] = vcore;
+ kvm->arch.online_vcores++;
}
mutex_unlock(&kvm->lock);
@@ -637,7 +901,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
++vcore->num_threads;
spin_unlock(&vcore->lock);
vcpu->arch.vcore = vcore;
- vcpu->arch.stolen_logged = vcore->stolen_tb;
vcpu->arch.cpu_type = KVM_CPU_3S_64;
kvmppc_sanity_check(vcpu);
@@ -697,17 +960,18 @@ extern void xics_wake_cpu(int cpu);
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *v;
+ u64 now;
if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
return;
+ spin_lock(&vcpu->arch.tbacct_lock);
+ now = mftb();
+ vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
+ vcpu->arch.stolen_logged;
+ vcpu->arch.busy_preempt = now;
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
+ spin_unlock(&vcpu->arch.tbacct_lock);
--vc->n_runnable;
- ++vc->n_busy;
- /* decrement the physical thread id of each following vcpu */
- v = vcpu;
- list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list)
- --v->arch.ptid;
list_del(&vcpu->arch.run_list);
}
@@ -720,6 +984,7 @@ static int kvmppc_grab_hwthread(int cpu)
/* Ensure the thread won't go into the kernel if it wakes */
tpaca->kvm_hstate.hwthread_req = 1;
+ tpaca->kvm_hstate.kvm_vcpu = NULL;
/*
* If the thread is already executing in the kernel (e.g. handling
@@ -769,7 +1034,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
smp_wmb();
#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
if (vcpu->arch.ptid) {
- kvmppc_grab_hwthread(cpu);
xics_wake_cpu(cpu);
++vc->n_woken;
}
@@ -795,7 +1059,8 @@ static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
/*
* Check that we are on thread 0 and that any other threads in
- * this core are off-line.
+ * this core are off-line. Then grab the threads so they can't
+ * enter the kernel.
*/
static int on_primary_thread(void)
{
@@ -807,6 +1072,17 @@ static int on_primary_thread(void)
while (++thr < threads_per_core)
if (cpu_online(cpu + thr))
return 0;
+
+ /* Grab all hw threads so they can't go into the kernel */
+ for (thr = 1; thr < threads_per_core; ++thr) {
+ if (kvmppc_grab_hwthread(cpu + thr)) {
+ /* Couldn't grab one; let the others go */
+ do {
+ kvmppc_release_hwthread(cpu + thr);
+ } while (--thr > 0);
+ return 0;
+ }
+ }
return 1;
}
@@ -814,21 +1090,24 @@ static int on_primary_thread(void)
* Run a set of guest threads on a physical core.
* Called with vc->lock held.
*/
-static int kvmppc_run_core(struct kvmppc_vcore *vc)
+static void kvmppc_run_core(struct kvmppc_vcore *vc)
{
struct kvm_vcpu *vcpu, *vcpu0, *vnext;
long ret;
u64 now;
int ptid, i, need_vpa_update;
+ int srcu_idx;
+ struct kvm_vcpu *vcpus_to_update[threads_per_core];
/* don't start if any threads have a signal pending */
need_vpa_update = 0;
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
if (signal_pending(vcpu->arch.run_task))
- return 0;
- need_vpa_update |= vcpu->arch.vpa.update_pending |
- vcpu->arch.slb_shadow.update_pending |
- vcpu->arch.dtl.update_pending;
+ return;
+ if (vcpu->arch.vpa.update_pending ||
+ vcpu->arch.slb_shadow.update_pending ||
+ vcpu->arch.dtl.update_pending)
+ vcpus_to_update[need_vpa_update++] = vcpu;
}
/*
@@ -838,7 +1117,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
vc->n_woken = 0;
vc->nap_count = 0;
vc->entry_exit_count = 0;
- vc->vcore_state = VCORE_RUNNING;
+ vc->vcore_state = VCORE_STARTING;
vc->in_guest = 0;
vc->napping_threads = 0;
@@ -848,24 +1127,12 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
*/
if (need_vpa_update) {
spin_unlock(&vc->lock);
- list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
- kvmppc_update_vpas(vcpu);
+ for (i = 0; i < need_vpa_update; ++i)
+ kvmppc_update_vpas(vcpus_to_update[i]);
spin_lock(&vc->lock);
}
/*
- * Make sure we are running on thread 0, and that
- * secondary threads are offline.
- * XXX we should also block attempts to bring any
- * secondary threads online.
- */
- if (threads_per_core > 1 && !on_primary_thread()) {
- list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
- vcpu->arch.ret = -EBUSY;
- goto out;
- }
-
- /*
* Assign physical thread IDs, first to non-ceded vcpus
* and then to ceded ones.
*/
@@ -879,28 +1146,36 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
}
}
if (!vcpu0)
- return 0; /* nothing to run */
+ goto out; /* nothing to run; should never happen */
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
if (vcpu->arch.ceded)
vcpu->arch.ptid = ptid++;
- vc->stolen_tb += mftb() - vc->preempt_tb;
+ /*
+ * Make sure we are running on thread 0, and that
+ * secondary threads are offline.
+ */
+ if (threads_per_core > 1 && !on_primary_thread()) {
+ list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+ vcpu->arch.ret = -EBUSY;
+ goto out;
+ }
+
vc->pcpu = smp_processor_id();
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
kvmppc_start_thread(vcpu);
kvmppc_create_dtl_entry(vcpu, vc);
}
- /* Grab any remaining hw threads so they can't go into the kernel */
- for (i = ptid; i < threads_per_core; ++i)
- kvmppc_grab_hwthread(vc->pcpu + i);
+ vc->vcore_state = VCORE_RUNNING;
preempt_disable();
spin_unlock(&vc->lock);
kvm_guest_enter();
+
+ srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu);
+
__kvmppc_vcore_entry(NULL, vcpu0);
- for (i = 0; i < threads_per_core; ++i)
- kvmppc_release_hwthread(vc->pcpu + i);
spin_lock(&vc->lock);
/* disable sending of IPIs on virtual external irqs */
@@ -909,10 +1184,14 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
/* wait for secondary threads to finish writing their state to memory */
if (vc->nap_count < vc->n_woken)
kvmppc_wait_for_nap(vc);
+ for (i = 0; i < threads_per_core; ++i)
+ kvmppc_release_hwthread(vc->pcpu + i);
/* prevent other vcpu threads from doing kvmppc_start_thread() now */
vc->vcore_state = VCORE_EXITING;
spin_unlock(&vc->lock);
+ srcu_read_unlock(&vcpu0->kvm->srcu, srcu_idx);
+
/* make sure updates to secondary vcpu structs are visible now */
smp_mb();
kvm_guest_exit();
@@ -920,6 +1199,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
preempt_enable();
kvm_resched(vcpu);
+ spin_lock(&vc->lock);
now = get_tb();
list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
/* cancel pending dec exception if dec is positive */
@@ -943,10 +1223,8 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
}
}
- spin_lock(&vc->lock);
out:
vc->vcore_state = VCORE_INACTIVE;
- vc->preempt_tb = mftb();
list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
arch.run_list) {
if (vcpu->arch.ret != RESUME_GUEST) {
@@ -954,8 +1232,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc)
wake_up(&vcpu->arch.cpu_run);
}
}
-
- return 1;
}
/*
@@ -979,20 +1255,11 @@ static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
{
DEFINE_WAIT(wait);
- struct kvm_vcpu *v;
- int all_idle = 1;
prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
vc->vcore_state = VCORE_SLEEPING;
spin_unlock(&vc->lock);
- list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
- if (!v->arch.ceded || v->arch.pending_exceptions) {
- all_idle = 0;
- break;
- }
- }
- if (all_idle)
- schedule();
+ schedule();
finish_wait(&vc->wq, &wait);
spin_lock(&vc->lock);
vc->vcore_state = VCORE_INACTIVE;
@@ -1001,13 +1268,13 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
int n_ceded;
- int prev_state;
struct kvmppc_vcore *vc;
struct kvm_vcpu *v, *vn;
kvm_run->exit_reason = 0;
vcpu->arch.ret = RESUME_GUEST;
vcpu->arch.trap = 0;
+ kvmppc_update_vpas(vcpu);
/*
* Synchronize with other threads in this virtual core
@@ -1017,8 +1284,9 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
vcpu->arch.ceded = 0;
vcpu->arch.run_task = current;
vcpu->arch.kvm_run = kvm_run;
- prev_state = vcpu->arch.state;
+ vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+ vcpu->arch.busy_preempt = TB_NIL;
list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
++vc->n_runnable;
@@ -1027,33 +1295,26 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
* If the vcore is already running, we may be able to start
* this thread straight away and have it join in.
*/
- if (prev_state == KVMPPC_VCPU_STOPPED) {
+ if (!signal_pending(current)) {
if (vc->vcore_state == VCORE_RUNNING &&
VCORE_EXIT_COUNT(vc) == 0) {
vcpu->arch.ptid = vc->n_runnable - 1;
+ kvmppc_create_dtl_entry(vcpu, vc);
kvmppc_start_thread(vcpu);
+ } else if (vc->vcore_state == VCORE_SLEEPING) {
+ wake_up(&vc->wq);
}
- } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST)
- --vc->n_busy;
+ }
while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
!signal_pending(current)) {
- if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) {
+ if (vc->vcore_state != VCORE_INACTIVE) {
spin_unlock(&vc->lock);
kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
spin_lock(&vc->lock);
continue;
}
- vc->runner = vcpu;
- n_ceded = 0;
- list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
- n_ceded += v->arch.ceded;
- if (n_ceded == vc->n_runnable)
- kvmppc_vcore_blocked(vc);
- else
- kvmppc_run_core(vc);
-
list_for_each_entry_safe(v, vn, &vc->runnable_threads,
arch.run_list) {
kvmppc_core_prepare_to_enter(v);
@@ -1065,22 +1326,40 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
wake_up(&v->arch.cpu_run);
}
}
+ if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
+ break;
+ vc->runner = vcpu;
+ n_ceded = 0;
+ list_for_each_entry(v, &vc->runnable_threads, arch.run_list)
+ if (!v->arch.pending_exceptions)
+ n_ceded += v->arch.ceded;
+ if (n_ceded == vc->n_runnable)
+ kvmppc_vcore_blocked(vc);
+ else
+ kvmppc_run_core(vc);
vc->runner = NULL;
}
- if (signal_pending(current)) {
- if (vc->vcore_state == VCORE_RUNNING ||
- vc->vcore_state == VCORE_EXITING) {
- spin_unlock(&vc->lock);
- kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
- spin_lock(&vc->lock);
- }
- if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
- kvmppc_remove_runnable(vc, vcpu);
- vcpu->stat.signal_exits++;
- kvm_run->exit_reason = KVM_EXIT_INTR;
- vcpu->arch.ret = -EINTR;
- }
+ while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
+ (vc->vcore_state == VCORE_RUNNING ||
+ vc->vcore_state == VCORE_EXITING)) {
+ spin_unlock(&vc->lock);
+ kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
+ spin_lock(&vc->lock);
+ }
+
+ if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+ kvmppc_remove_runnable(vc, vcpu);
+ vcpu->stat.signal_exits++;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ vcpu->arch.ret = -EINTR;
+ }
+
+ if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
+ /* Wake up some vcpu to run the core */
+ v = list_first_entry(&vc->runnable_threads,
+ struct kvm_vcpu, arch.run_list);
+ wake_up(&v->arch.cpu_run);
}
spin_unlock(&vc->lock);
@@ -1090,6 +1369,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
int r;
+ int srcu_idx;
if (!vcpu->arch.sane) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -1120,6 +1400,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
flush_vsx_to_thread(current);
vcpu->arch.wqp = &vcpu->arch.vcore->wq;
vcpu->arch.pgdir = current->mm->pgd;
+ vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
do {
r = kvmppc_run_vcpu(run, vcpu);
@@ -1128,10 +1409,16 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
!(vcpu->arch.shregs.msr & MSR_PR)) {
r = kvmppc_pseries_do_hcall(vcpu);
kvmppc_core_prepare_to_enter(vcpu);
+ } else if (r == RESUME_PAGE_FAULT) {
+ srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvmppc_book3s_hv_page_fault(run, vcpu,
+ vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
+ srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
}
} while (r == RESUME_GUEST);
out:
+ vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
atomic_dec(&vcpu->kvm->arch.vcpus_running);
return r;
}
@@ -1273,7 +1560,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
n = kvm_dirty_bitmap_bytes(memslot);
memset(memslot->dirty_bitmap, 0, n);
- r = kvmppc_hv_get_dirty_log(kvm, memslot);
+ r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap);
if (r)
goto out;
@@ -1287,67 +1574,88 @@ out:
return r;
}
-static unsigned long slb_pgsize_encoding(unsigned long psize)
+static void unpin_slot(struct kvm_memory_slot *memslot)
{
- unsigned long senc = 0;
+ unsigned long *physp;
+ unsigned long j, npages, pfn;
+ struct page *page;
- if (psize > 0x1000) {
- senc = SLB_VSID_L;
- if (psize == 0x10000)
- senc |= SLB_VSID_LP_01;
+ physp = memslot->arch.slot_phys;
+ npages = memslot->npages;
+ if (!physp)
+ return;
+ for (j = 0; j < npages; j++) {
+ if (!(physp[j] & KVMPPC_GOT_PAGE))
+ continue;
+ pfn = physp[j] >> PAGE_SHIFT;
+ page = pfn_to_page(pfn);
+ SetPageDirty(page);
+ put_page(page);
+ }
+}
+
+void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+ if (!dont || free->arch.rmap != dont->arch.rmap) {
+ vfree(free->arch.rmap);
+ free->arch.rmap = NULL;
+ }
+ if (!dont || free->arch.slot_phys != dont->arch.slot_phys) {
+ unpin_slot(free);
+ vfree(free->arch.slot_phys);
+ free->arch.slot_phys = NULL;
}
- return senc;
+}
+
+int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
+ if (!slot->arch.rmap)
+ return -ENOMEM;
+ slot->arch.slot_phys = NULL;
+
+ return 0;
}
int kvmppc_core_prepare_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem)
+ struct kvm_memory_slot *memslot,
+ struct kvm_userspace_memory_region *mem)
{
- unsigned long npages;
unsigned long *phys;
- /* Allocate a slot_phys array */
- phys = kvm->arch.slot_phys[mem->slot];
- if (!kvm->arch.using_mmu_notifiers && !phys) {
- npages = mem->memory_size >> PAGE_SHIFT;
- phys = vzalloc(npages * sizeof(unsigned long));
+ /* Allocate a slot_phys array if needed */
+ phys = memslot->arch.slot_phys;
+ if (!kvm->arch.using_mmu_notifiers && !phys && memslot->npages) {
+ phys = vzalloc(memslot->npages * sizeof(unsigned long));
if (!phys)
return -ENOMEM;
- kvm->arch.slot_phys[mem->slot] = phys;
- kvm->arch.slot_npages[mem->slot] = npages;
+ memslot->arch.slot_phys = phys;
}
return 0;
}
-static void unpin_slot(struct kvm *kvm, int slot_id)
+void kvmppc_core_commit_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old)
{
- unsigned long *physp;
- unsigned long j, npages, pfn;
- struct page *page;
+ unsigned long npages = mem->memory_size >> PAGE_SHIFT;
+ struct kvm_memory_slot *memslot;
- physp = kvm->arch.slot_phys[slot_id];
- npages = kvm->arch.slot_npages[slot_id];
- if (physp) {
- spin_lock(&kvm->arch.slot_phys_lock);
- for (j = 0; j < npages; j++) {
- if (!(physp[j] & KVMPPC_GOT_PAGE))
- continue;
- pfn = physp[j] >> PAGE_SHIFT;
- page = pfn_to_page(pfn);
- SetPageDirty(page);
- put_page(page);
- }
- kvm->arch.slot_phys[slot_id] = NULL;
- spin_unlock(&kvm->arch.slot_phys_lock);
- vfree(physp);
+ if (npages && old.npages) {
+ /*
+ * If modifying a memslot, reset all the rmap dirty bits.
+ * If this is a new memslot, we don't need to do anything
+ * since the rmap array starts out as all zeroes,
+ * i.e. no pages are dirty.
+ */
+ memslot = id_to_memslot(kvm->memslots, mem->slot);
+ kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
}
}
-void kvmppc_core_commit_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem)
-{
-}
-
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
{
int err = 0;
@@ -1362,6 +1670,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
unsigned long rmls;
unsigned long *physp;
unsigned long i, npages;
+ int srcu_idx;
mutex_lock(&kvm->lock);
if (kvm->arch.rma_setup_done)
@@ -1377,12 +1686,13 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
}
/* Look up the memslot for guest physical address 0 */
+ srcu_idx = srcu_read_lock(&kvm->srcu);
memslot = gfn_to_memslot(kvm, 0);
/* We must have some memory at 0 by now */
err = -EINVAL;
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
- goto out;
+ goto out_srcu;
/* Look up the VMA for the start of this memory slot */
hva = memslot->userspace_addr;
@@ -1406,14 +1716,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
err = -EPERM;
if (cpu_has_feature(CPU_FTR_ARCH_201)) {
pr_err("KVM: CPU requires an RMO\n");
- goto out;
+ goto out_srcu;
}
/* We can handle 4k, 64k or 16M pages in the VRMA */
err = -EINVAL;
if (!(psize == 0x1000 || psize == 0x10000 ||
psize == 0x1000000))
- goto out;
+ goto out_srcu;
/* Update VRMASD field in the LPCR */
senc = slb_pgsize_encoding(psize);
@@ -1436,7 +1746,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
err = -EINVAL;
if (rmls < 0) {
pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size);
- goto out;
+ goto out_srcu;
}
atomic_inc(&ri->use_count);
kvm->arch.rma = ri;
@@ -1465,17 +1775,24 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
/* Initialize phys addrs of pages in RMO */
npages = ri->npages;
porder = __ilog2(npages);
- physp = kvm->arch.slot_phys[memslot->id];
- spin_lock(&kvm->arch.slot_phys_lock);
- for (i = 0; i < npages; ++i)
- physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder;
- spin_unlock(&kvm->arch.slot_phys_lock);
+ physp = memslot->arch.slot_phys;
+ if (physp) {
+ if (npages > memslot->npages)
+ npages = memslot->npages;
+ spin_lock(&kvm->arch.slot_phys_lock);
+ for (i = 0; i < npages; ++i)
+ physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) +
+ porder;
+ spin_unlock(&kvm->arch.slot_phys_lock);
+ }
}
/* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
smp_wmb();
kvm->arch.rma_setup_done = 1;
err = 0;
+ out_srcu:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
out:
mutex_unlock(&kvm->lock);
return err;
@@ -1496,6 +1813,13 @@ int kvmppc_core_init_vm(struct kvm *kvm)
return -ENOMEM;
kvm->arch.lpid = lpid;
+ /*
+ * Since we don't flush the TLB when tearing down a VM,
+ * and this lpid might have previously been used,
+ * make sure we flush on each core before running the new VM.
+ */
+ cpumask_setall(&kvm->arch.need_tlb_flush);
+
INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
kvm->arch.rma = NULL;
@@ -1523,16 +1847,19 @@ int kvmppc_core_init_vm(struct kvm *kvm)
kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206);
spin_lock_init(&kvm->arch.slot_phys_lock);
+
+ /*
+ * Don't allow secondary CPU threads to come online
+ * while any KVM VMs exist.
+ */
+ inhibit_secondary_onlining();
+
return 0;
}
void kvmppc_core_destroy_vm(struct kvm *kvm)
{
- unsigned long i;
-
- if (!kvm->arch.using_mmu_notifiers)
- for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
- unpin_slot(kvm, i);
+ uninhibit_secondary_onlining();
if (kvm->arch.rma) {
kvm_release_rma(kvm->arch.rma);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index fb4eac290fef..ec0a9e5de100 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -157,8 +157,8 @@ static void __init kvm_linear_init_one(ulong size, int count, int type)
linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info));
for (i = 0; i < count; ++i) {
linear = alloc_bootmem_align(size, size);
- pr_info("Allocated KVM %s at %p (%ld MB)\n", typestr, linear,
- size >> 20);
+ pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear,
+ size >> 20);
linear_info[i].base_virt = linear;
linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT;
linear_info[i].npages = npages;
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
new file mode 100644
index 000000000000..35f3cf0269b3
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -0,0 +1,144 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * Copyright 2012 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/kernel.h>
+#include <asm/opal.h>
+
+/* SRR1 bits for machine check on POWER7 */
+#define SRR1_MC_LDSTERR (1ul << (63-42))
+#define SRR1_MC_IFETCH_SH (63-45)
+#define SRR1_MC_IFETCH_MASK 0x7
+#define SRR1_MC_IFETCH_SLBPAR 2 /* SLB parity error */
+#define SRR1_MC_IFETCH_SLBMULTI 3 /* SLB multi-hit */
+#define SRR1_MC_IFETCH_SLBPARMULTI 4 /* SLB parity + multi-hit */
+#define SRR1_MC_IFETCH_TLBMULTI 5 /* I-TLB multi-hit */
+
+/* DSISR bits for machine check on POWER7 */
+#define DSISR_MC_DERAT_MULTI 0x800 /* D-ERAT multi-hit */
+#define DSISR_MC_TLB_MULTI 0x400 /* D-TLB multi-hit */
+#define DSISR_MC_SLB_PARITY 0x100 /* SLB parity error */
+#define DSISR_MC_SLB_MULTI 0x080 /* SLB multi-hit */
+#define DSISR_MC_SLB_PARMULTI 0x040 /* SLB parity + multi-hit */
+
+/* POWER7 SLB flush and reload */
+static void reload_slb(struct kvm_vcpu *vcpu)
+{
+ struct slb_shadow *slb;
+ unsigned long i, n;
+
+ /* First clear out SLB */
+ asm volatile("slbmte %0,%0; slbia" : : "r" (0));
+
+ /* Do they have an SLB shadow buffer registered? */
+ slb = vcpu->arch.slb_shadow.pinned_addr;
+ if (!slb)
+ return;
+
+ /* Sanity check */
+ n = min_t(u32, slb->persistent, SLB_MIN_SIZE);
+ if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end)
+ return;
+
+ /* Load up the SLB from that */
+ for (i = 0; i < n; ++i) {
+ unsigned long rb = slb->save_area[i].esid;
+ unsigned long rs = slb->save_area[i].vsid;
+
+ rb = (rb & ~0xFFFul) | i; /* insert entry number */
+ asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb));
+ }
+}
+
+/* POWER7 TLB flush */
+static void flush_tlb_power7(struct kvm_vcpu *vcpu)
+{
+ unsigned long i, rb;
+
+ rb = TLBIEL_INVAL_SET_LPID;
+ for (i = 0; i < POWER7_TLB_SETS; ++i) {
+ asm volatile("tlbiel %0" : : "r" (rb));
+ rb += 1 << TLBIEL_INVAL_SET_SHIFT;
+ }
+}
+
+/*
+ * On POWER7, see if we can handle a machine check that occurred inside
+ * the guest in real mode, without switching to the host partition.
+ *
+ * Returns: 0 => exit guest, 1 => deliver machine check to guest
+ */
+static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
+{
+ unsigned long srr1 = vcpu->arch.shregs.msr;
+ struct opal_machine_check_event *opal_evt;
+ long handled = 1;
+
+ if (srr1 & SRR1_MC_LDSTERR) {
+ /* error on load/store */
+ unsigned long dsisr = vcpu->arch.shregs.dsisr;
+
+ if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI |
+ DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) {
+ /* flush and reload SLB; flushes D-ERAT too */
+ reload_slb(vcpu);
+ dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI |
+ DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI);
+ }
+ if (dsisr & DSISR_MC_TLB_MULTI) {
+ flush_tlb_power7(vcpu);
+ dsisr &= ~DSISR_MC_TLB_MULTI;
+ }
+ /* Any other errors we don't understand? */
+ if (dsisr & 0xffffffffUL)
+ handled = 0;
+ }
+
+ switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) {
+ case 0:
+ break;
+ case SRR1_MC_IFETCH_SLBPAR:
+ case SRR1_MC_IFETCH_SLBMULTI:
+ case SRR1_MC_IFETCH_SLBPARMULTI:
+ reload_slb(vcpu);
+ break;
+ case SRR1_MC_IFETCH_TLBMULTI:
+ flush_tlb_power7(vcpu);
+ break;
+ default:
+ handled = 0;
+ }
+
+ /*
+ * See if OPAL has already handled the condition.
+ * We assume that if the condition is recovered then OPAL
+ * will have generated an error log event that we will pick
+ * up and log later.
+ */
+ opal_evt = local_paca->opal_mc_evt;
+ if (opal_evt->version == OpalMCE_V1 &&
+ (opal_evt->severity == OpalMCE_SEV_NO_ERROR ||
+ opal_evt->disposition == OpalMCE_DISPOSITION_RECOVERED))
+ handled = 1;
+
+ if (handled)
+ opal_evt->in_use = 0;
+
+ return handled;
+}
+
+long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
+{
+ if (cpu_has_feature(CPU_FTR_ARCH_206))
+ return kvmppc_realmode_mc_power7(vcpu);
+
+ return 0;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index fb0e821622d4..19c93bae1aea 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -35,6 +35,37 @@ static void *real_vmalloc_addr(void *x)
return __va(addr);
}
+/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
+static int global_invalidates(struct kvm *kvm, unsigned long flags)
+{
+ int global;
+
+ /*
+ * If there is only one vcore, and it's currently running,
+ * we can use tlbiel as long as we mark all other physical
+ * cores as potentially having stale TLB entries for this lpid.
+ * If we're not using MMU notifiers, we never take pages away
+ * from the guest, so we can use tlbiel if requested.
+ * Otherwise, don't use tlbiel.
+ */
+ if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
+ global = 0;
+ else if (kvm->arch.using_mmu_notifiers)
+ global = 1;
+ else
+ global = !(flags & H_LOCAL);
+
+ if (!global) {
+ /* any other core might now have stale TLB entries... */
+ smp_wmb();
+ cpumask_setall(&kvm->arch.need_tlb_flush);
+ cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
+ &kvm->arch.need_tlb_flush);
+ }
+
+ return global;
+}
+
/*
* Add this HPTE into the chain for the real page.
* Must be called with the chain locked; it unlocks the chain.
@@ -59,13 +90,24 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
head->back = pte_index;
} else {
rev->forw = rev->back = pte_index;
- i = pte_index;
+ *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
+ pte_index | KVMPPC_RMAP_PRESENT;
}
- smp_wmb();
- *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */
+ unlock_rmap(rmap);
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
+/*
+ * Note modification of an HPTE; set the HPTE modified bit
+ * if anyone is interested.
+ */
+static inline void note_hpte_modification(struct kvm *kvm,
+ struct revmap_entry *rev)
+{
+ if (atomic_read(&kvm->arch.hpte_mod_interest))
+ rev->guest_rpte |= HPTE_GR_MODIFIED;
+}
+
/* Remove this HPTE from the chain for a real page */
static void remove_revmap_chain(struct kvm *kvm, long pte_index,
struct revmap_entry *rev,
@@ -81,7 +123,7 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
ptel = rev->guest_rpte |= rcbits;
gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
- if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+ if (!memslot)
return;
rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
@@ -103,14 +145,14 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
unlock_rmap(rmap);
}
-static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva,
+static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
int writing, unsigned long *pte_sizep)
{
pte_t *ptep;
unsigned long ps = *pte_sizep;
unsigned int shift;
- ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift);
+ ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
if (!ptep)
return __pte(0);
if (shift)
@@ -130,15 +172,15 @@ static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
hpte[0] = hpte_v;
}
-long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
- long pte_index, unsigned long pteh, unsigned long ptel)
+long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
+ long pte_index, unsigned long pteh, unsigned long ptel,
+ pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
{
- struct kvm *kvm = vcpu->kvm;
unsigned long i, pa, gpa, gfn, psize;
unsigned long slot_fn, hva;
unsigned long *hpte;
struct revmap_entry *rev;
- unsigned long g_ptel = ptel;
+ unsigned long g_ptel;
struct kvm_memory_slot *memslot;
unsigned long *physp, pte_size;
unsigned long is_io;
@@ -147,13 +189,14 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
unsigned int writing;
unsigned long mmu_seq;
unsigned long rcbits;
- bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING;
psize = hpte_page_size(pteh, ptel);
if (!psize)
return H_PARAMETER;
writing = hpte_is_writable(ptel);
pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
+ ptel &= ~HPTE_GR_RESERVED;
+ g_ptel = ptel;
/* used later to detect if we might have been invalidated */
mmu_seq = kvm->mmu_notifier_seq;
@@ -183,7 +226,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
rmap = &memslot->arch.rmap[slot_fn];
if (!kvm->arch.using_mmu_notifiers) {
- physp = kvm->arch.slot_phys[memslot->id];
+ physp = memslot->arch.slot_phys;
if (!physp)
return H_PARAMETER;
physp += slot_fn;
@@ -201,7 +244,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
/* Look up the Linux PTE for the backing page */
pte_size = psize;
- pte = lookup_linux_pte(vcpu, hva, writing, &pte_size);
+ pte = lookup_linux_pte(pgdir, hva, writing, &pte_size);
if (pte_present(pte)) {
if (writing && !pte_write(pte))
/* make the actual HPTE be read-only */
@@ -210,6 +253,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
pa = pte_pfn(pte) << PAGE_SHIFT;
}
}
+
if (pte_size < psize)
return H_PARAMETER;
if (pa && pte_size > psize)
@@ -287,8 +331,10 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
rev = &kvm->arch.revmap[pte_index];
if (realmode)
rev = real_vmalloc_addr(rev);
- if (rev)
+ if (rev) {
rev->guest_rpte = g_ptel;
+ note_hpte_modification(kvm, rev);
+ }
/* Link HPTE into reverse-map chain */
if (pteh & HPTE_V_VALID) {
@@ -297,7 +343,7 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
lock_rmap(rmap);
/* Check for pending invalidations under the rmap chain lock */
if (kvm->arch.using_mmu_notifiers &&
- mmu_notifier_retry(vcpu, mmu_seq)) {
+ mmu_notifier_retry(kvm, mmu_seq)) {
/* inval in progress, write a non-present HPTE */
pteh |= HPTE_V_ABSENT;
pteh &= ~HPTE_V_VALID;
@@ -318,10 +364,17 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
hpte[0] = pteh;
asm volatile("ptesync" : : : "memory");
- vcpu->arch.gpr[4] = pte_index;
+ *pte_idx_ret = pte_index;
return H_SUCCESS;
}
-EXPORT_SYMBOL_GPL(kvmppc_h_enter);
+EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);
+
+long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
+ long pte_index, unsigned long pteh, unsigned long ptel)
+{
+ return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
+ vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]);
+}
#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token))
@@ -343,11 +396,10 @@ static inline int try_lock_tlbie(unsigned int *lock)
return old == 0;
}
-long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
- unsigned long pte_index, unsigned long avpn,
- unsigned long va)
+long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+ unsigned long pte_index, unsigned long avpn,
+ unsigned long *hpret)
{
- struct kvm *kvm = vcpu->kvm;
unsigned long *hpte;
unsigned long v, r, rb;
struct revmap_entry *rev;
@@ -369,7 +421,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
if (v & HPTE_V_VALID) {
hpte[0] &= ~HPTE_V_VALID;
rb = compute_tlbie_rb(v, hpte[1], pte_index);
- if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) {
+ if (global_invalidates(kvm, flags)) {
while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
cpu_relax();
asm volatile("ptesync" : : : "memory");
@@ -385,13 +437,22 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
/* Read PTE low word after tlbie to get final R/C values */
remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
}
- r = rev->guest_rpte;
+ r = rev->guest_rpte & ~HPTE_GR_RESERVED;
+ note_hpte_modification(kvm, rev);
unlock_hpte(hpte, 0);
- vcpu->arch.gpr[4] = v;
- vcpu->arch.gpr[5] = r;
+ hpret[0] = v;
+ hpret[1] = r;
return H_SUCCESS;
}
+EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);
+
+long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
+ unsigned long pte_index, unsigned long avpn)
+{
+ return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
+ &vcpu->arch.gpr[4]);
+}
long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
{
@@ -459,6 +520,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
args[j] = ((0x80 | flags) << 56) + pte_index;
rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+ note_hpte_modification(kvm, rev);
if (!(hp[0] & HPTE_V_VALID)) {
/* insert R and C bits from PTE */
@@ -534,8 +596,6 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
return H_NOT_FOUND;
}
- if (atomic_read(&kvm->online_vcpus) == 1)
- flags |= H_LOCAL;
v = hpte[0];
bits = (flags << 55) & HPTE_R_PP0;
bits |= (flags << 48) & HPTE_R_KEY_HI;
@@ -548,6 +608,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
if (rev) {
r = (rev->guest_rpte & ~mask) | bits;
rev->guest_rpte = r;
+ note_hpte_modification(kvm, rev);
}
r = (hpte[1] & ~mask) | bits;
@@ -555,7 +616,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
if (v & HPTE_V_VALID) {
rb = compute_tlbie_rb(v, r, pte_index);
hpte[0] = v & ~HPTE_V_VALID;
- if (!(flags & H_LOCAL)) {
+ if (global_invalidates(kvm, flags)) {
while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
cpu_relax();
asm volatile("ptesync" : : : "memory");
@@ -568,6 +629,28 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
asm volatile("tlbiel %0" : : "r" (rb));
asm volatile("ptesync" : : : "memory");
}
+ /*
+ * If the host has this page as readonly but the guest
+ * wants to make it read/write, reduce the permissions.
+ * Checking the host permissions involves finding the
+ * memslot and then the Linux PTE for the page.
+ */
+ if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) {
+ unsigned long psize, gfn, hva;
+ struct kvm_memory_slot *memslot;
+ pgd_t *pgdir = vcpu->arch.pgdir;
+ pte_t pte;
+
+ psize = hpte_page_size(v, r);
+ gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
+ memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
+ if (memslot) {
+ hva = __gfn_to_hva_memslot(memslot, gfn);
+ pte = lookup_linux_pte(pgdir, hva, 1, &psize);
+ if (pte_present(pte) && !pte_write(pte))
+ r = hpte_make_readonly(r);
+ }
+ }
}
hpte[1] = r;
eieio();
@@ -599,8 +682,10 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
v &= ~HPTE_V_ABSENT;
v |= HPTE_V_VALID;
}
- if (v & HPTE_V_VALID)
+ if (v & HPTE_V_VALID) {
r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
+ r &= ~HPTE_GR_RESERVED;
+ }
vcpu->arch.gpr[4 + i * 2] = v;
vcpu->arch.gpr[5 + i * 2] = r;
}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 74a24bbb9637..10b6c358dd77 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -27,6 +27,7 @@
#include <asm/asm-offsets.h>
#include <asm/exception-64s.h>
#include <asm/kvm_book3s_asm.h>
+#include <asm/mmu-hash64.h>
/*****************************************************************************
* *
@@ -134,8 +135,11 @@ kvm_start_guest:
27: /* XXX should handle hypervisor maintenance interrupts etc. here */
+ /* reload vcpu pointer after clearing the IPI */
+ ld r4,HSTATE_KVM_VCPU(r13)
+ cmpdi r4,0
/* if we have no vcpu to run, go back to sleep */
- beq cr1,kvm_no_guest
+ beq kvm_no_guest
/* were we napping due to cede? */
lbz r0,HSTATE_NAPPING(r13)
@@ -310,7 +314,33 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
mtspr SPRN_SDR1,r6 /* switch to partition page table */
mtspr SPRN_LPID,r7
isync
+
+ /* See if we need to flush the TLB */
+ lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */
+ clrldi r7,r6,64-6 /* extract bit number (6 bits) */
+ srdi r6,r6,6 /* doubleword number */
+ sldi r6,r6,3 /* address offset */
+ add r6,r6,r9
+ addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */
li r0,1
+ sld r0,r0,r7
+ ld r7,0(r6)
+ and. r7,r7,r0
+ beq 22f
+23: ldarx r7,0,r6 /* if set, clear the bit */
+ andc r7,r7,r0
+ stdcx. r7,0,r6
+ bne 23b
+ li r6,128 /* and flush the TLB */
+ mtctr r6
+ li r7,0x800 /* IS field = 0b10 */
+ ptesync
+28: tlbiel r7
+ addi r7,r7,0x1000
+ bdnz 28b
+ ptesync
+
+22: li r0,1
stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */
b 10f
@@ -333,36 +363,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
mr r9,r4
blt hdec_soon
- /*
- * Invalidate the TLB if we could possibly have stale TLB
- * entries for this partition on this core due to the use
- * of tlbiel.
- * XXX maybe only need this on primary thread?
- */
- ld r9,VCPU_KVM(r4) /* pointer to struct kvm */
- lwz r5,VCPU_VCPUID(r4)
- lhz r6,PACAPACAINDEX(r13)
- rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */
- lhz r8,VCPU_LAST_CPU(r4)
- sldi r7,r6,1 /* see if this is the same vcpu */
- add r7,r7,r9 /* as last ran on this pcpu */
- lhz r0,KVM_LAST_VCPU(r7)
- cmpw r6,r8 /* on the same cpu core as last time? */
- bne 3f
- cmpw r0,r5 /* same vcpu as this core last ran? */
- beq 1f
-3: sth r6,VCPU_LAST_CPU(r4) /* if not, invalidate partition TLB */
- sth r5,KVM_LAST_VCPU(r7)
- li r6,128
- mtctr r6
- li r7,0x800 /* IS field = 0b10 */
- ptesync
-2: tlbiel r7
- addi r7,r7,0x1000
- bdnz 2b
- ptesync
-1:
-
/* Save purr/spurr */
mfspr r5,SPRN_PURR
mfspr r6,SPRN_SPURR
@@ -679,8 +679,7 @@ BEGIN_FTR_SECTION
1:
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
-nohpte_cont:
-hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
+guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
/* Save DEC */
mfspr r5,SPRN_DEC
mftb r6
@@ -701,6 +700,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
std r6, VCPU_FAULT_DAR(r9)
stw r7, VCPU_FAULT_DSISR(r9)
+ /* See if it is a machine check */
+ cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+ beq machine_check_realmode
+mc_cont:
+
/* Save guest CTRL register, set runlatch to 1 */
6: mfspr r6,SPRN_CTRLF
stw r6,VCPU_CTRL(r9)
@@ -1113,38 +1117,41 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
/*
* For external and machine check interrupts, we need
* to call the Linux handler to process the interrupt.
- * We do that by jumping to the interrupt vector address
- * which we have in r12. The [h]rfid at the end of the
+ * We do that by jumping to absolute address 0x500 for
+ * external interrupts, or the machine_check_fwnmi label
+ * for machine checks (since firmware might have patched
+ * the vector area at 0x200). The [h]rfid at the end of the
* handler will return to the book3s_hv_interrupts.S code.
* For other interrupts we do the rfid to get back
- * to the book3s_interrupts.S code here.
+ * to the book3s_hv_interrupts.S code here.
*/
ld r8, HSTATE_VMHANDLER(r13)
ld r7, HSTATE_HOST_MSR(r13)
+ cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
+BEGIN_FTR_SECTION
beq 11f
- cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206)
/* RFI into the highmem handler, or branch to interrupt handler */
-12: mfmsr r6
- mtctr r12
+ mfmsr r6
li r0, MSR_RI
andc r6, r6, r0
mtmsrd r6, 1 /* Clear RI in MSR */
mtsrr0 r8
mtsrr1 r7
- beqctr
+ beqa 0x500 /* external interrupt (PPC970) */
+ beq cr1, 13f /* machine check */
RFI
-11:
-BEGIN_FTR_SECTION
- b 12b
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
- mtspr SPRN_HSRR0, r8
+ /* On POWER7, we have external interrupts set to use HSRR0/1 */
+11: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
ba 0x500
+13: b machine_check_fwnmi
+
/*
* Check whether an HDSI is an HPTE not found fault or something else.
* If it is an HPTE not found fault that is due to the guest accessing
@@ -1177,7 +1184,7 @@ kvmppc_hdsi:
cmpdi r3, 0 /* retry the instruction */
beq 6f
cmpdi r3, -1 /* handle in kernel mode */
- beq nohpte_cont
+ beq guest_exit_cont
cmpdi r3, -2 /* MMIO emulation; need instr word */
beq 2f
@@ -1191,6 +1198,7 @@ kvmppc_hdsi:
li r10, BOOK3S_INTERRUPT_DATA_STORAGE
li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
rotldi r11, r11, 63
+fast_interrupt_c_return:
6: ld r7, VCPU_CTR(r9)
lwz r8, VCPU_XER(r9)
mtctr r7
@@ -1223,7 +1231,7 @@ kvmppc_hdsi:
/* Unset guest mode. */
li r0, KVM_GUEST_MODE_NONE
stb r0, HSTATE_IN_GUEST(r13)
- b nohpte_cont
+ b guest_exit_cont
/*
* Similarly for an HISI, reflect it to the guest as an ISI unless
@@ -1249,9 +1257,9 @@ kvmppc_hisi:
ld r11, VCPU_MSR(r9)
li r12, BOOK3S_INTERRUPT_H_INST_STORAGE
cmpdi r3, 0 /* retry the instruction */
- beq 6f
+ beq fast_interrupt_c_return
cmpdi r3, -1 /* handle in kernel mode */
- beq nohpte_cont
+ beq guest_exit_cont
/* Synthesize an ISI for the guest */
mr r11, r3
@@ -1260,12 +1268,7 @@ kvmppc_hisi:
li r10, BOOK3S_INTERRUPT_INST_STORAGE
li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
rotldi r11, r11, 63
-6: ld r7, VCPU_CTR(r9)
- lwz r8, VCPU_XER(r9)
- mtctr r7
- mtxer r8
- mr r4, r9
- b fast_guest_return
+ b fast_interrupt_c_return
3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */
ld r5, KVM_VRMA_SLB_V(r6)
@@ -1281,14 +1284,14 @@ kvmppc_hisi:
hcall_try_real_mode:
ld r3,VCPU_GPR(R3)(r9)
andi. r0,r11,MSR_PR
- bne hcall_real_cont
+ bne guest_exit_cont
clrrdi r3,r3,2
cmpldi r3,hcall_real_table_end - hcall_real_table
- bge hcall_real_cont
+ bge guest_exit_cont
LOAD_REG_ADDR(r4, hcall_real_table)
lwzx r3,r3,r4
cmpwi r3,0
- beq hcall_real_cont
+ beq guest_exit_cont
add r3,r3,r4
mtctr r3
mr r3,r9 /* get vcpu pointer */
@@ -1309,7 +1312,7 @@ hcall_real_fallback:
li r12,BOOK3S_INTERRUPT_SYSCALL
ld r9, HSTATE_KVM_VCPU(r13)
- b hcall_real_cont
+ b guest_exit_cont
.globl hcall_real_table
hcall_real_table:
@@ -1568,6 +1571,21 @@ kvm_cede_exit:
li r3,H_TOO_HARD
blr
+ /* Try to handle a machine check in real mode */
+machine_check_realmode:
+ mr r3, r9 /* get vcpu pointer */
+ bl .kvmppc_realmode_machine_check
+ nop
+ cmpdi r3, 0 /* continue exiting from guest? */
+ ld r9, HSTATE_KVM_VCPU(r13)
+ li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
+ beq mc_cont
+ /* If not, deliver a machine check. SRR0/1 are already set */
+ li r10, BOOK3S_INTERRUPT_MACHINE_CHECK
+ li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */
+ rotldi r11, r11, 63
+ b fast_interrupt_c_return
+
secondary_too_late:
ld r5,HSTATE_KVM_VCORE(r13)
HMT_LOW
@@ -1587,6 +1605,10 @@ secondary_too_late:
.endr
secondary_nap:
+ /* Clear our vcpu pointer so we don't come back in early */
+ li r0, 0
+ std r0, HSTATE_KVM_VCPU(r13)
+ lwsync
/* Clear any pending IPI - assume we're a secondary thread */
ld r5, HSTATE_XICS_PHYS(r13)
li r7, XICS_XIRR
@@ -1612,8 +1634,6 @@ secondary_nap:
kvm_no_guest:
li r0, KVM_HWTHREAD_IN_NAP
stb r0, HSTATE_HWTHREAD_STATE(r13)
- li r0, 0
- std r0, HSTATE_KVM_VCPU(r13)
li r3, LPCR_PECE0
mfspr r4, SPRN_LPCR
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 41cb0017e757..2c86b0d63714 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -114,11 +114,6 @@ static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
hlist_del_init_rcu(&pte->list_vpte);
hlist_del_init_rcu(&pte->list_vpte_long);
- if (pte->pte.may_write)
- kvm_release_pfn_dirty(pte->pfn);
- else
- kvm_release_pfn_clean(pte->pfn);
-
spin_unlock(&vcpu3s->mmu_lock);
vcpu3s->hpte_cache_count--;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 05c28f59f77f..28d38adeca73 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -52,8 +52,6 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
#define MSR_USER32 MSR_USER
#define MSR_USER64 MSR_USER
#define HW_PAGE_SIZE PAGE_SIZE
-#define __hard_irq_disable local_irq_disable
-#define __hard_irq_enable local_irq_enable
#endif
void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -66,7 +64,7 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max;
svcpu_put(svcpu);
#endif
-
+ vcpu->cpu = smp_processor_id();
#ifdef CONFIG_PPC_BOOK3S_32
current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu;
#endif
@@ -83,17 +81,71 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
svcpu_put(svcpu);
#endif
- kvmppc_giveup_ext(vcpu, MSR_FP);
- kvmppc_giveup_ext(vcpu, MSR_VEC);
- kvmppc_giveup_ext(vcpu, MSR_VSX);
+ kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
+ vcpu->cpu = -1;
+}
+
+int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
+{
+ int r = 1; /* Indicate we want to get back into the guest */
+
+ /* We misuse TLB_FLUSH to indicate that we want to clear
+ all shadow cache entries */
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+ kvmppc_mmu_pte_flush(vcpu, 0, 0);
+
+ return r;
+}
+
+/************* MMU Notifiers *************/
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+ trace_kvm_unmap_hva(hva);
+
+ /*
+ * Flush all shadow tlb entries everywhere. This is slow, but
+ * we are 100% sure that we catch the to be unmapped page
+ */
+ kvm_flush_remote_tlbs(kvm);
+
+ return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ /* kvm_unmap_hva flushes everything anyways */
+ kvm_unmap_hva(kvm, start);
+
+ return 0;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ /* XXX could be more clever ;) */
+ return 0;
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ /* XXX could be more clever ;) */
+ return 0;
}
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ /* The page will get remapped properly on its next fault */
+ kvm_unmap_hva(kvm, hva);
+}
+
+/*****************************************/
+
static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
{
ulong smsr = vcpu->arch.shared->msr;
/* Guest MSR values */
- smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE;
+ smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE;
/* Process MSR values */
smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
/* External providers the guest reserved */
@@ -379,10 +431,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
static inline int get_fpr_index(int i)
{
-#ifdef CONFIG_VSX
- i *= 2;
-#endif
- return i;
+ return i * TS_FPRWIDTH;
}
/* Give up external provider (FPU, Altivec, VSX) */
@@ -396,41 +445,49 @@ void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr)
u64 *thread_fpr = (u64*)t->fpr;
int i;
- if (!(vcpu->arch.guest_owned_ext & msr))
+ /*
+ * VSX instructions can access FP and vector registers, so if
+ * we are giving up VSX, make sure we give up FP and VMX as well.
+ */
+ if (msr & MSR_VSX)
+ msr |= MSR_FP | MSR_VEC;
+
+ msr &= vcpu->arch.guest_owned_ext;
+ if (!msr)
return;
#ifdef DEBUG_EXT
printk(KERN_INFO "Giving up ext 0x%lx\n", msr);
#endif
- switch (msr) {
- case MSR_FP:
+ if (msr & MSR_FP) {
+ /*
+ * Note that on CPUs with VSX, giveup_fpu stores
+ * both the traditional FP registers and the added VSX
+ * registers into thread.fpr[].
+ */
giveup_fpu(current);
for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
vcpu_fpr[i] = thread_fpr[get_fpr_index(i)];
vcpu->arch.fpscr = t->fpscr.val;
- break;
- case MSR_VEC:
+
+#ifdef CONFIG_VSX
+ if (cpu_has_feature(CPU_FTR_VSX))
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++)
+ vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
+#endif
+ }
+
#ifdef CONFIG_ALTIVEC
+ if (msr & MSR_VEC) {
giveup_altivec(current);
memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr));
vcpu->arch.vscr = t->vscr;
-#endif
- break;
- case MSR_VSX:
-#ifdef CONFIG_VSX
- __giveup_vsx(current);
- for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
- vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1];
-#endif
- break;
- default:
- BUG();
}
+#endif
- vcpu->arch.guest_owned_ext &= ~msr;
- current->thread.regs->msr &= ~msr;
+ vcpu->arch.guest_owned_ext &= ~(msr | MSR_VSX);
kvmppc_recalc_shadow_msr(vcpu);
}
@@ -490,47 +547,56 @@ static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr,
return RESUME_GUEST;
}
- /* We already own the ext */
- if (vcpu->arch.guest_owned_ext & msr) {
- return RESUME_GUEST;
+ if (msr == MSR_VSX) {
+ /* No VSX? Give an illegal instruction interrupt */
+#ifdef CONFIG_VSX
+ if (!cpu_has_feature(CPU_FTR_VSX))
+#endif
+ {
+ kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
+ return RESUME_GUEST;
+ }
+
+ /*
+ * We have to load up all the FP and VMX registers before
+ * we can let the guest use VSX instructions.
+ */
+ msr = MSR_FP | MSR_VEC | MSR_VSX;
}
+ /* See if we already own all the ext(s) needed */
+ msr &= ~vcpu->arch.guest_owned_ext;
+ if (!msr)
+ return RESUME_GUEST;
+
#ifdef DEBUG_EXT
printk(KERN_INFO "Loading up ext 0x%lx\n", msr);
#endif
current->thread.regs->msr |= msr;
- switch (msr) {
- case MSR_FP:
+ if (msr & MSR_FP) {
for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++)
thread_fpr[get_fpr_index(i)] = vcpu_fpr[i];
-
+#ifdef CONFIG_VSX
+ for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++)
+ thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
+#endif
t->fpscr.val = vcpu->arch.fpscr;
t->fpexc_mode = 0;
kvmppc_load_up_fpu();
- break;
- case MSR_VEC:
+ }
+
+ if (msr & MSR_VEC) {
#ifdef CONFIG_ALTIVEC
memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr));
t->vscr = vcpu->arch.vscr;
t->vrsave = -1;
kvmppc_load_up_altivec();
#endif
- break;
- case MSR_VSX:
-#ifdef CONFIG_VSX
- for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++)
- thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i];
- kvmppc_load_up_vsx();
-#endif
- break;
- default:
- BUG();
}
vcpu->arch.guest_owned_ext |= msr;
-
kvmppc_recalc_shadow_msr(vcpu);
return RESUME_GUEST;
@@ -540,18 +606,18 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int exit_nr)
{
int r = RESUME_HOST;
+ int s;
vcpu->stat.sum_exits++;
run->exit_reason = KVM_EXIT_UNKNOWN;
run->ready_for_interrupt_injection = 1;
- /* We get here with MSR.EE=0, so enable it to be a nice citizen */
- __hard_irq_enable();
+ /* We get here with MSR.EE=1 */
+
+ trace_kvm_exit(exit_nr, vcpu);
+ kvm_guest_exit();
- trace_kvm_book3s_exit(exit_nr, vcpu);
- preempt_enable();
- kvm_resched(vcpu);
switch (exit_nr) {
case BOOK3S_INTERRUPT_INST_STORAGE:
{
@@ -802,7 +868,6 @@ program_interrupt:
}
}
- preempt_disable();
if (!(r & RESUME_HOST)) {
/* To avoid clobbering exit_reason, only check for signals if
* we aren't already exiting to userspace for some other
@@ -814,20 +879,13 @@ program_interrupt:
* and if we really did time things so badly, then we just exit
* again due to a host external interrupt.
*/
- __hard_irq_disable();
- if (signal_pending(current)) {
- __hard_irq_enable();
-#ifdef EXIT_DEBUG
- printk(KERN_EMERG "KVM: Going back to host\n");
-#endif
- vcpu->stat.signal_exits++;
- run->exit_reason = KVM_EXIT_INTR;
- r = -EINTR;
+ local_irq_disable();
+ s = kvmppc_prepare_to_enter(vcpu);
+ if (s <= 0) {
+ local_irq_enable();
+ r = s;
} else {
- /* In case an interrupt came in that was triggered
- * from userspace (like DEC), we need to check what
- * to inject now! */
- kvmppc_core_prepare_to_enter(vcpu);
+ kvmppc_lazy_ee_enable();
}
}
@@ -899,34 +957,59 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
return 0;
}
-int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
{
- int r = -EINVAL;
+ int r = 0;
- switch (reg->id) {
+ switch (id) {
case KVM_REG_PPC_HIOR:
- r = copy_to_user((u64 __user *)(long)reg->addr,
- &to_book3s(vcpu)->hior, sizeof(u64));
+ *val = get_reg_val(id, to_book3s(vcpu)->hior);
break;
+#ifdef CONFIG_VSX
+ case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: {
+ long int i = id - KVM_REG_PPC_VSR0;
+
+ if (!cpu_has_feature(CPU_FTR_VSX)) {
+ r = -ENXIO;
+ break;
+ }
+ val->vsxval[0] = vcpu->arch.fpr[i];
+ val->vsxval[1] = vcpu->arch.vsr[i];
+ break;
+ }
+#endif /* CONFIG_VSX */
default:
+ r = -EINVAL;
break;
}
return r;
}
-int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
+int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val)
{
- int r = -EINVAL;
+ int r = 0;
- switch (reg->id) {
+ switch (id) {
case KVM_REG_PPC_HIOR:
- r = copy_from_user(&to_book3s(vcpu)->hior,
- (u64 __user *)(long)reg->addr, sizeof(u64));
- if (!r)
- to_book3s(vcpu)->hior_explicit = true;
+ to_book3s(vcpu)->hior = set_reg_val(id, *val);
+ to_book3s(vcpu)->hior_explicit = true;
+ break;
+#ifdef CONFIG_VSX
+ case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: {
+ long int i = id - KVM_REG_PPC_VSR0;
+
+ if (!cpu_has_feature(CPU_FTR_VSX)) {
+ r = -ENXIO;
+ break;
+ }
+ vcpu->arch.fpr[i] = val->vsxval[0];
+ vcpu->arch.vsr[i] = val->vsxval[1];
break;
+ }
+#endif /* CONFIG_VSX */
default:
+ r = -EINVAL;
break;
}
@@ -1020,8 +1103,6 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
#endif
ulong ext_msr;
- preempt_disable();
-
/* Check if we can run the vcpu at all */
if (!vcpu->arch.sane) {
kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -1029,21 +1110,16 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
goto out;
}
- kvmppc_core_prepare_to_enter(vcpu);
-
/*
* Interrupts could be timers for the guest which we have to inject
* again, so let's postpone them until we're in the guest and if we
* really did time things so badly, then we just exit again due to
* a host external interrupt.
*/
- __hard_irq_disable();
-
- /* No need to go into the guest when all we do is going out */
- if (signal_pending(current)) {
- __hard_irq_enable();
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ret = -EINTR;
+ local_irq_disable();
+ ret = kvmppc_prepare_to_enter(vcpu);
+ if (ret <= 0) {
+ local_irq_enable();
goto out;
}
@@ -1070,7 +1146,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
/* Save VSX state in stack */
used_vsr = current->thread.used_vsr;
if (used_vsr && (current->thread.regs->msr & MSR_VSX))
- __giveup_vsx(current);
+ __giveup_vsx(current);
#endif
/* Remember the MSR with disabled extensions */
@@ -1080,20 +1156,19 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
if (vcpu->arch.shared->msr & MSR_FP)
kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP);
- kvm_guest_enter();
+ kvmppc_lazy_ee_enable();
ret = __kvmppc_vcpu_run(kvm_run, vcpu);
- kvm_guest_exit();
-
- current->thread.regs->msr = ext_msr;
+ /* No need for kvm_guest_exit. It's done in handle_exit.
+ We also get here with interrupts enabled. */
/* Make sure we save the guest FPU/Altivec/VSX state */
- kvmppc_giveup_ext(vcpu, MSR_FP);
- kvmppc_giveup_ext(vcpu, MSR_VEC);
- kvmppc_giveup_ext(vcpu, MSR_VSX);
+ kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX);
+
+ current->thread.regs->msr = ext_msr;
- /* Restore FPU state from stack */
+ /* Restore FPU/VSX state from stack */
memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
current->thread.fpscr.val = fpscr;
current->thread.fpexc_mode = fpexc_mode;
@@ -1113,7 +1188,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
#endif
out:
- preempt_enable();
+ vcpu->mode = OUTSIDE_GUEST_MODE;
return ret;
}
@@ -1181,14 +1256,31 @@ int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info)
}
#endif /* CONFIG_PPC64 */
+void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+}
+
+int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ return 0;
+}
+
int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
struct kvm_userspace_memory_region *mem)
{
return 0;
}
void kvmppc_core_commit_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem)
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old)
+{
+}
+
+void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
}
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index 9ecf6e35cd8d..8f7633e3afb8 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -170,20 +170,21 @@ kvmppc_handler_skip_ins:
* Call kvmppc_handler_trampoline_enter in real mode
*
* On entry, r4 contains the guest shadow MSR
+ * MSR.EE has to be 0 when calling this function
*/
_GLOBAL(kvmppc_entry_trampoline)
mfmsr r5
LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter)
toreal(r7)
- li r9, MSR_RI
- ori r9, r9, MSR_EE
- andc r9, r5, r9 /* Clear EE and RI in MSR value */
li r6, MSR_IR | MSR_DR
- ori r6, r6, MSR_EE
- andc r6, r5, r6 /* Clear EE, DR and IR in MSR value */
- MTMSR_EERI(r9) /* Clear EE and RI in MSR */
- mtsrr0 r7 /* before we set srr0/1 */
+ andc r6, r5, r6 /* Clear DR and IR in MSR value */
+ /*
+ * Set EE in HOST_MSR so that it's enabled when we get into our
+ * C exit handler function
+ */
+ ori r5, r5, MSR_EE
+ mtsrr0 r7
mtsrr1 r6
RFI
@@ -233,8 +234,5 @@ define_load_up(fpu)
#ifdef CONFIG_ALTIVEC
define_load_up(altivec)
#endif
-#ifdef CONFIG_VSX
-define_load_up(vsx)
-#endif
#include "book3s_segment.S"
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index d25a097c852b..69f114015780 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -36,9 +36,11 @@
#include <asm/dbell.h>
#include <asm/hw_irq.h>
#include <asm/irq.h>
+#include <asm/time.h>
#include "timing.h"
#include "booke.h"
+#include "trace.h"
unsigned long kvmppc_booke_handlers;
@@ -62,6 +64,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "doorbell", VCPU_STAT(dbell_exits) },
{ "guest doorbell", VCPU_STAT(gdbell_exits) },
+ { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ NULL }
};
@@ -120,6 +123,16 @@ static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu)
}
#endif
+static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu)
+{
+#if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV)
+ /* We always treat the FP bit as enabled from the host
+ perspective, so only need to adjust the shadow MSR */
+ vcpu->arch.shadow_msr &= ~MSR_FP;
+ vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP;
+#endif
+}
+
/*
* Helper function for "full" MSR writes. No need to call this if only
* EE/CE/ME/DE/RI are changing.
@@ -136,11 +149,13 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
kvmppc_mmu_msr_notify(vcpu, old_msr);
kvmppc_vcpu_sync_spe(vcpu);
+ kvmppc_vcpu_sync_fpu(vcpu);
}
static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
unsigned int priority)
{
+ trace_kvm_booke_queue_irqprio(vcpu, priority);
set_bit(priority, &vcpu->arch.pending_exceptions);
}
@@ -206,6 +221,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
}
+static void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
+{
+ kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
+}
+
+static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
+{
+ clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
+}
+
static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
{
#ifdef CONFIG_KVM_BOOKE_HV
@@ -287,6 +312,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
bool crit;
bool keep_irq = false;
enum int_class int_class;
+ ulong new_msr = vcpu->arch.shared->msr;
/* Truncate crit indicators in 32 bit mode */
if (!(vcpu->arch.shared->msr & MSR_SF)) {
@@ -325,6 +351,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
msr_mask = MSR_CE | MSR_ME | MSR_DE;
int_class = INT_CLASS_NONCRIT;
break;
+ case BOOKE_IRQPRIO_WATCHDOG:
case BOOKE_IRQPRIO_CRITICAL:
case BOOKE_IRQPRIO_DBELL_CRIT:
allowed = vcpu->arch.shared->msr & MSR_CE;
@@ -381,7 +408,13 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
set_guest_esr(vcpu, vcpu->arch.queued_esr);
if (update_dear == true)
set_guest_dear(vcpu, vcpu->arch.queued_dear);
- kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask);
+
+ new_msr &= msr_mask;
+#if defined(CONFIG_64BIT)
+ if (vcpu->arch.epcr & SPRN_EPCR_ICM)
+ new_msr |= MSR_CM;
+#endif
+ kvmppc_set_msr(vcpu, new_msr);
if (!keep_irq)
clear_bit(priority, &vcpu->arch.pending_exceptions);
@@ -404,12 +437,121 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
return allowed;
}
+/*
+ * Return the number of jiffies until the next timeout. If the timeout is
+ * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA
+ * because the larger value can break the timer APIs.
+ */
+static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
+{
+ u64 tb, wdt_tb, wdt_ticks = 0;
+ u64 nr_jiffies = 0;
+ u32 period = TCR_GET_WP(vcpu->arch.tcr);
+
+ wdt_tb = 1ULL << (63 - period);
+ tb = get_tb();
+ /*
+ * The watchdog timeout will hapeen when TB bit corresponding
+ * to watchdog will toggle from 0 to 1.
+ */
+ if (tb & wdt_tb)
+ wdt_ticks = wdt_tb;
+
+ wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
+
+ /* Convert timebase ticks to jiffies */
+ nr_jiffies = wdt_ticks;
+
+ if (do_div(nr_jiffies, tb_ticks_per_jiffy))
+ nr_jiffies++;
+
+ return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA);
+}
+
+static void arm_next_watchdog(struct kvm_vcpu *vcpu)
+{
+ unsigned long nr_jiffies;
+ unsigned long flags;
+
+ /*
+ * If TSR_ENW and TSR_WIS are not set then no need to exit to
+ * userspace, so clear the KVM_REQ_WATCHDOG request.
+ */
+ if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS))
+ clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests);
+
+ spin_lock_irqsave(&vcpu->arch.wdt_lock, flags);
+ nr_jiffies = watchdog_next_timeout(vcpu);
+ /*
+ * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA
+ * then do not run the watchdog timer as this can break timer APIs.
+ */
+ if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
+ mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
+ else
+ del_timer(&vcpu->arch.wdt_timer);
+ spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags);
+}
+
+void kvmppc_watchdog_func(unsigned long data)
+{
+ struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+ u32 tsr, new_tsr;
+ int final;
+
+ do {
+ new_tsr = tsr = vcpu->arch.tsr;
+ final = 0;
+
+ /* Time out event */
+ if (tsr & TSR_ENW) {
+ if (tsr & TSR_WIS)
+ final = 1;
+ else
+ new_tsr = tsr | TSR_WIS;
+ } else {
+ new_tsr = tsr | TSR_ENW;
+ }
+ } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
+
+ if (new_tsr & TSR_WIS) {
+ smp_wmb();
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+
+ /*
+ * If this is final watchdog expiry and some action is required
+ * then exit to userspace.
+ */
+ if (final && (vcpu->arch.tcr & TCR_WRC_MASK) &&
+ vcpu->arch.watchdog_enabled) {
+ smp_wmb();
+ kvm_make_request(KVM_REQ_WATCHDOG, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+
+ /*
+ * Stop running the watchdog timer after final expiration to
+ * prevent the host from being flooded with timers if the
+ * guest sets a short period.
+ * Timers will resume when TSR/TCR is updated next time.
+ */
+ if (!final)
+ arm_next_watchdog(vcpu);
+}
+
static void update_timer_ints(struct kvm_vcpu *vcpu)
{
if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
kvmppc_core_queue_dec(vcpu);
else
kvmppc_core_dequeue_dec(vcpu);
+
+ if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
+ kvmppc_core_queue_watchdog(vcpu);
+ else
+ kvmppc_core_dequeue_watchdog(vcpu);
}
static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
@@ -417,13 +559,6 @@ static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
unsigned long *pending = &vcpu->arch.pending_exceptions;
unsigned int priority;
- if (vcpu->requests) {
- if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) {
- smp_mb();
- update_timer_ints(vcpu);
- }
- }
-
priority = __ffs(*pending);
while (priority < BOOKE_IRQPRIO_MAX) {
if (kvmppc_booke_irqprio_deliver(vcpu, priority))
@@ -459,37 +594,20 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu)
return r;
}
-/*
- * Common checks before entering the guest world. Call with interrupts
- * disabled.
- *
- * returns !0 if a signal is pending and check_signal is true
- */
-static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
+int kvmppc_core_check_requests(struct kvm_vcpu *vcpu)
{
- int r = 0;
+ int r = 1; /* Indicate we want to get back into the guest */
- WARN_ON_ONCE(!irqs_disabled());
- while (true) {
- if (need_resched()) {
- local_irq_enable();
- cond_resched();
- local_irq_disable();
- continue;
- }
-
- if (signal_pending(current)) {
- r = 1;
- break;
- }
-
- if (kvmppc_core_prepare_to_enter(vcpu)) {
- /* interrupts got enabled in between, so we
- are back at square 1 */
- continue;
- }
+ if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu))
+ update_timer_ints(vcpu);
+#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
+ kvmppc_core_flush_tlb(vcpu);
+#endif
- break;
+ if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_WATCHDOG;
+ r = 0;
}
return r;
@@ -497,7 +615,7 @@ static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
- int ret;
+ int ret, s;
#ifdef CONFIG_PPC_FPU
unsigned int fpscr;
int fpexc_mode;
@@ -510,11 +628,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
}
local_irq_disable();
- if (kvmppc_prepare_to_enter(vcpu)) {
- kvm_run->exit_reason = KVM_EXIT_INTR;
- ret = -EINTR;
+ s = kvmppc_prepare_to_enter(vcpu);
+ if (s <= 0) {
+ local_irq_enable();
+ ret = s;
goto out;
}
+ kvmppc_lazy_ee_enable();
kvm_guest_enter();
@@ -542,6 +662,9 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
ret = __kvmppc_vcpu_run(kvm_run, vcpu);
+ /* No need for kvm_guest_exit. It's done in handle_exit.
+ We also get here with interrupts enabled. */
+
#ifdef CONFIG_PPC_FPU
kvmppc_save_guest_fp(vcpu);
@@ -557,10 +680,8 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
current->thread.fpexc_mode = fpexc_mode;
#endif
- kvm_guest_exit();
-
out:
- local_irq_enable();
+ vcpu->mode = OUTSIDE_GUEST_MODE;
return ret;
}
@@ -668,6 +789,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int exit_nr)
{
int r = RESUME_HOST;
+ int s;
/* update before a new last_exit_type is rewritten */
kvmppc_update_timing_stats(vcpu);
@@ -677,6 +799,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
local_irq_enable();
+ trace_kvm_exit(exit_nr, vcpu);
+ kvm_guest_exit();
+
run->exit_reason = KVM_EXIT_UNKNOWN;
run->ready_for_interrupt_injection = 1;
@@ -971,10 +1096,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
*/
if (!(r & RESUME_HOST)) {
local_irq_disable();
- if (kvmppc_prepare_to_enter(vcpu)) {
- run->exit_reason = KVM_EXIT_INTR;
- r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
- kvmppc_account_exit(vcpu, SIGNAL_EXITS);
+ s = kvmppc_prepare_to_enter(vcpu);
+ if (s <= 0) {
+ local_irq_enable();
+ r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
+ } else {
+ kvmppc_lazy_ee_enable();
}
}
@@ -1011,6 +1138,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
return r;
}
+int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+ /* setup watchdog timer once */
+ spin_lock_init(&vcpu->arch.wdt_lock);
+ setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func,
+ (unsigned long)vcpu);
+
+ return 0;
+}
+
+void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ del_timer_sync(&vcpu->arch.wdt_timer);
+}
+
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
int i;
@@ -1106,7 +1248,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
}
if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
+ u32 old_tsr = vcpu->arch.tsr;
+
vcpu->arch.tsr = sregs->u.e.tsr;
+
+ if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS))
+ arm_next_watchdog(vcpu);
+
update_timer_ints(vcpu);
}
@@ -1221,12 +1369,70 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
{
- return -EINVAL;
+ int r = -EINVAL;
+
+ switch (reg->id) {
+ case KVM_REG_PPC_IAC1:
+ case KVM_REG_PPC_IAC2:
+ case KVM_REG_PPC_IAC3:
+ case KVM_REG_PPC_IAC4: {
+ int iac = reg->id - KVM_REG_PPC_IAC1;
+ r = copy_to_user((u64 __user *)(long)reg->addr,
+ &vcpu->arch.dbg_reg.iac[iac], sizeof(u64));
+ break;
+ }
+ case KVM_REG_PPC_DAC1:
+ case KVM_REG_PPC_DAC2: {
+ int dac = reg->id - KVM_REG_PPC_DAC1;
+ r = copy_to_user((u64 __user *)(long)reg->addr,
+ &vcpu->arch.dbg_reg.dac[dac], sizeof(u64));
+ break;
+ }
+#if defined(CONFIG_64BIT)
+ case KVM_REG_PPC_EPCR:
+ r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr);
+ break;
+#endif
+ default:
+ break;
+ }
+ return r;
}
int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg)
{
- return -EINVAL;
+ int r = -EINVAL;
+
+ switch (reg->id) {
+ case KVM_REG_PPC_IAC1:
+ case KVM_REG_PPC_IAC2:
+ case KVM_REG_PPC_IAC3:
+ case KVM_REG_PPC_IAC4: {
+ int iac = reg->id - KVM_REG_PPC_IAC1;
+ r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac],
+ (u64 __user *)(long)reg->addr, sizeof(u64));
+ break;
+ }
+ case KVM_REG_PPC_DAC1:
+ case KVM_REG_PPC_DAC2: {
+ int dac = reg->id - KVM_REG_PPC_DAC1;
+ r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac],
+ (u64 __user *)(long)reg->addr, sizeof(u64));
+ break;
+ }
+#if defined(CONFIG_64BIT)
+ case KVM_REG_PPC_EPCR: {
+ u32 new_epcr;
+ r = get_user(new_epcr, (u32 __user *)(long)reg->addr);
+ if (r == 0)
+ kvmppc_set_epcr(vcpu, new_epcr);
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+ return r;
}
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
@@ -1253,20 +1459,50 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
return -ENOTSUPP;
}
+void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
+ struct kvm_memory_slot *dont)
+{
+}
+
+int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ return 0;
+}
+
int kvmppc_core_prepare_memory_region(struct kvm *kvm,
+ struct kvm_memory_slot *memslot,
struct kvm_userspace_memory_region *mem)
{
return 0;
}
void kvmppc_core_commit_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem)
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old)
+{
+}
+
+void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot)
+{
+}
+
+void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr)
{
+#if defined(CONFIG_64BIT)
+ vcpu->arch.epcr = new_epcr;
+#ifdef CONFIG_KVM_BOOKE_HV
+ vcpu->arch.shadow_epcr &= ~SPRN_EPCR_GICM;
+ if (vcpu->arch.epcr & SPRN_EPCR_ICM)
+ vcpu->arch.shadow_epcr |= SPRN_EPCR_GICM;
+#endif
+#endif
}
void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
{
vcpu->arch.tcr = new_tcr;
+ arm_next_watchdog(vcpu);
update_timer_ints(vcpu);
}
@@ -1281,6 +1517,14 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
{
clear_bits(tsr_bits, &vcpu->arch.tsr);
+
+ /*
+ * We may have stopped the watchdog due to
+ * being stuck on final expiration.
+ */
+ if (tsr_bits & (TSR_ENW | TSR_WIS))
+ arm_next_watchdog(vcpu);
+
update_timer_ints(vcpu);
}
@@ -1298,12 +1542,14 @@ void kvmppc_decrementer_func(unsigned long data)
void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
+ vcpu->cpu = smp_processor_id();
current->thread.kvm_vcpu = vcpu;
}
void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu)
{
current->thread.kvm_vcpu = NULL;
+ vcpu->cpu = -1;
}
int __init kvmppc_booke_init(void)
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index ba61974c1e20..e9b88e433f64 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -69,6 +69,7 @@ extern unsigned long kvmppc_booke_handlers;
void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr);
void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr);
+void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr);
void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr);
void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 12834bb608ab..4685b8cf2249 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -133,10 +133,10 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
vcpu->arch.csrr1 = spr_val;
break;
case SPRN_DBCR0:
- vcpu->arch.dbcr0 = spr_val;
+ vcpu->arch.dbg_reg.dbcr0 = spr_val;
break;
case SPRN_DBCR1:
- vcpu->arch.dbcr1 = spr_val;
+ vcpu->arch.dbg_reg.dbcr1 = spr_val;
break;
case SPRN_DBSR:
vcpu->arch.dbsr &= ~spr_val;
@@ -145,6 +145,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
kvmppc_clr_tsr_bits(vcpu, spr_val);
break;
case SPRN_TCR:
+ /*
+ * WRC is a 2-bit field that is supposed to preserve its
+ * value once written to non-zero.
+ */
+ if (vcpu->arch.tcr & TCR_WRC_MASK) {
+ spr_val &= ~TCR_WRC_MASK;
+ spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
+ }
kvmppc_set_tcr(vcpu, spr_val);
break;
@@ -229,7 +237,17 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
case SPRN_IVOR15:
vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val;
break;
-
+ case SPRN_MCSR:
+ vcpu->arch.mcsr &= ~spr_val;
+ break;
+#if defined(CONFIG_64BIT)
+ case SPRN_EPCR:
+ kvmppc_set_epcr(vcpu, spr_val);
+#ifdef CONFIG_KVM_BOOKE_HV
+ mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr);
+#endif
+ break;
+#endif
default:
emulated = EMULATE_FAIL;
}
@@ -258,10 +276,10 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
*spr_val = vcpu->arch.csrr1;
break;
case SPRN_DBCR0:
- *spr_val = vcpu->arch.dbcr0;
+ *spr_val = vcpu->arch.dbg_reg.dbcr0;
break;
case SPRN_DBCR1:
- *spr_val = vcpu->arch.dbcr1;
+ *spr_val = vcpu->arch.dbg_reg.dbcr1;
break;
case SPRN_DBSR:
*spr_val = vcpu->arch.dbsr;
@@ -321,6 +339,14 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
case SPRN_IVOR15:
*spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
break;
+ case SPRN_MCSR:
+ *spr_val = vcpu->arch.mcsr;
+ break;
+#if defined(CONFIG_64BIT)
+ case SPRN_EPCR:
+ *spr_val = vcpu->arch.epcr;
+ break;
+#endif
default:
emulated = EMULATE_FAIL;
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 099fe8272b57..e8ed7d659c55 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -16,6 +16,7 @@
*
* Author: Varun Sethi <varun.sethi@freescale.com>
* Author: Scott Wood <scotwood@freescale.com>
+ * Author: Mihai Caraman <mihai.caraman@freescale.com>
*
* This file is derived from arch/powerpc/kvm/booke_interrupts.S
*/
@@ -30,31 +31,33 @@
#include <asm/bitsperlong.h>
#include <asm/thread_info.h>
+#ifdef CONFIG_64BIT
+#include <asm/exception-64e.h>
+#else
#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */
-
-#define GET_VCPU(vcpu, thread) \
- PPC_LL vcpu, THREAD_KVM_VCPU(thread)
+#endif
#define LONGBYTES (BITS_PER_LONG / 8)
#define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES))
/* The host stack layout: */
-#define HOST_R1 (0 * LONGBYTES) /* Implied by stwu. */
-#define HOST_CALLEE_LR (1 * LONGBYTES)
-#define HOST_RUN (2 * LONGBYTES) /* struct kvm_run */
+#define HOST_R1 0 /* Implied by stwu. */
+#define HOST_CALLEE_LR PPC_LR_STKOFF
+#define HOST_RUN (HOST_CALLEE_LR + LONGBYTES)
/*
* r2 is special: it holds 'current', and it made nonvolatile in the
* kernel with the -ffixed-r2 gcc option.
*/
-#define HOST_R2 (3 * LONGBYTES)
-#define HOST_CR (4 * LONGBYTES)
-#define HOST_NV_GPRS (5 * LONGBYTES)
+#define HOST_R2 (HOST_RUN + LONGBYTES)
+#define HOST_CR (HOST_R2 + LONGBYTES)
+#define HOST_NV_GPRS (HOST_CR + LONGBYTES)
#define __HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES))
#define HOST_NV_GPR(n) __HOST_NV_GPR(__REG_##n)
#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES)
#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */
-#define HOST_STACK_LR (HOST_STACK_SIZE + LONGBYTES) /* In caller stack frame. */
+/* LR in caller stack frame. */
+#define HOST_STACK_LR (HOST_STACK_SIZE + PPC_LR_STKOFF)
#define NEED_EMU 0x00000001 /* emulation -- save nv regs */
#define NEED_DEAR 0x00000002 /* save faulting DEAR */
@@ -201,12 +204,128 @@
b kvmppc_resume_host
.endm
+#ifdef CONFIG_64BIT
+/* Exception types */
+#define EX_GEN 1
+#define EX_GDBELL 2
+#define EX_DBG 3
+#define EX_MC 4
+#define EX_CRIT 5
+#define EX_TLB 6
+
+/*
+ * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
+ */
+.macro kvm_handler intno type scratch, paca_ex, ex_r10, ex_r11, srr0, srr1, flags
+ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
+ mr r11, r4
+ /*
+ * Get vcpu from Paca: paca->__current.thread->kvm_vcpu
+ */
+ PPC_LL r4, PACACURRENT(r13)
+ PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4)
+ stw r10, VCPU_CR(r4)
+ PPC_STL r11, VCPU_GPR(R4)(r4)
+ PPC_STL r5, VCPU_GPR(R5)(r4)
+ .if \type == EX_CRIT
+ PPC_LL r5, (\paca_ex + EX_R13)(r13)
+ .else
+ mfspr r5, \scratch
+ .endif
+ PPC_STL r6, VCPU_GPR(R6)(r4)
+ PPC_STL r8, VCPU_GPR(R8)(r4)
+ PPC_STL r9, VCPU_GPR(R9)(r4)
+ PPC_STL r5, VCPU_GPR(R13)(r4)
+ PPC_LL r6, (\paca_ex + \ex_r10)(r13)
+ PPC_LL r8, (\paca_ex + \ex_r11)(r13)
+ PPC_STL r3, VCPU_GPR(R3)(r4)
+ PPC_STL r7, VCPU_GPR(R7)(r4)
+ PPC_STL r12, VCPU_GPR(R12)(r4)
+ PPC_STL r6, VCPU_GPR(R10)(r4)
+ PPC_STL r8, VCPU_GPR(R11)(r4)
+ mfctr r5
+ PPC_STL r5, VCPU_CTR(r4)
+ mfspr r5, \srr0
+ mfspr r6, \srr1
+ kvm_handler_common \intno, \srr0, \flags
+.endm
+
+#define EX_PARAMS(type) \
+ EX_##type, \
+ SPRN_SPRG_##type##_SCRATCH, \
+ PACA_EX##type, \
+ EX_R10, \
+ EX_R11
+
+#define EX_PARAMS_TLB \
+ EX_TLB, \
+ SPRN_SPRG_GEN_SCRATCH, \
+ PACA_EXTLB, \
+ EX_TLB_R10, \
+ EX_TLB_R11
+
+kvm_handler BOOKE_INTERRUPT_CRITICAL, EX_PARAMS(CRIT), \
+ SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_MACHINE_CHECK, EX_PARAMS(MC), \
+ SPRN_MCSRR0, SPRN_MCSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1,(NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_INST_STORAGE, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, NEED_ESR
+kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1,NEED_ESR
+kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DECREMENTER, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_FIT, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_WATCHDOG, EX_PARAMS(CRIT),\
+ SPRN_CSRR0, SPRN_CSRR1, 0
+/*
+ * Only bolted TLB miss exception handlers are supported for now
+ */
+kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \
+ SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \
+ SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, EX_PARAMS(CRIT), \
+ SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_HV_PRIV, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, NEED_EMU
+kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, EX_PARAMS(GEN), \
+ SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, EX_PARAMS(GDBELL), \
+ SPRN_GSRR0, SPRN_GSRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, EX_PARAMS(CRIT), \
+ SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \
+ SPRN_DSRR0, SPRN_DSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \
+ SPRN_CSRR0, SPRN_CSRR1, 0
+#else
/*
* For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
*/
.macro kvm_handler intno srr0, srr1, flags
_GLOBAL(kvmppc_handler_\intno\()_\srr1)
- GET_VCPU(r11, r10)
+ PPC_LL r11, THREAD_KVM_VCPU(r10)
PPC_STL r3, VCPU_GPR(R3)(r11)
mfspr r3, SPRN_SPRG_RSCRATCH0
PPC_STL r4, VCPU_GPR(R4)(r11)
@@ -233,7 +352,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
.macro kvm_lvl_handler intno scratch srr0, srr1, flags
_GLOBAL(kvmppc_handler_\intno\()_\srr1)
mfspr r10, SPRN_SPRG_THREAD
- GET_VCPU(r11, r10)
+ PPC_LL r11, THREAD_KVM_VCPU(r10)
PPC_STL r3, VCPU_GPR(R3)(r11)
mfspr r3, \scratch
PPC_STL r4, VCPU_GPR(R4)(r11)
@@ -295,7 +414,7 @@ kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0
-
+#endif
/* Registers:
* SPRG_SCRATCH0: guest r10
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index aa8b81428bf4..c70d37ed770a 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -27,8 +27,7 @@
#define E500_TLB_NUM 2
#define E500_TLB_VALID 1
-#define E500_TLB_DIRTY 2
-#define E500_TLB_BITMAP 4
+#define E500_TLB_BITMAP 2
struct tlbe_ref {
pfn_t pfn;
@@ -130,9 +129,9 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500,
ulong value);
int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu);
int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu);
-int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb);
-int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb);
-int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb);
+int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea);
+int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea);
+int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea);
int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500);
void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
@@ -155,7 +154,7 @@ get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe)
static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe)
{
- return tlbe->mas2 & 0xfffff000;
+ return tlbe->mas2 & MAS2_EPN;
}
static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe)
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index e04b0ef55ce0..e78f353a836a 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -89,6 +89,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
int ra = get_ra(inst);
int rb = get_rb(inst);
int rt = get_rt(inst);
+ gva_t ea;
switch (get_op(inst)) {
case 31:
@@ -113,15 +114,20 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
case XOP_TLBSX:
- emulated = kvmppc_e500_emul_tlbsx(vcpu,rb);
+ ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
+ emulated = kvmppc_e500_emul_tlbsx(vcpu, ea);
break;
- case XOP_TLBILX:
- emulated = kvmppc_e500_emul_tlbilx(vcpu, rt, ra, rb);
+ case XOP_TLBILX: {
+ int type = rt & 0x3;
+ ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
+ emulated = kvmppc_e500_emul_tlbilx(vcpu, type, ea);
break;
+ }
case XOP_TLBIVAX:
- emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb);
+ ea = kvmppc_get_ea_indexed(vcpu, ra, rb);
+ emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
break;
default:
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index ff38b664195d..cf3f18012371 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -304,17 +304,13 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref,
ref->flags = E500_TLB_VALID;
if (tlbe_is_writable(gtlbe))
- ref->flags |= E500_TLB_DIRTY;
+ kvm_set_pfn_dirty(pfn);
}
static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
{
if (ref->flags & E500_TLB_VALID) {
- if (ref->flags & E500_TLB_DIRTY)
- kvm_release_pfn_dirty(ref->pfn);
- else
- kvm_release_pfn_clean(ref->pfn);
-
+ trace_kvm_booke206_ref_release(ref->pfn, ref->flags);
ref->flags = 0;
}
}
@@ -357,6 +353,13 @@ static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
clear_tlb_privs(vcpu_e500);
}
+void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+ clear_tlb_refs(vcpu_e500);
+ clear_tlb1_bitmap(vcpu_e500);
+}
+
static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
unsigned int eaddr, int as)
{
@@ -412,7 +415,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
struct tlbe_ref *ref)
{
struct kvm_memory_slot *slot;
- unsigned long pfn, hva;
+ unsigned long pfn = 0; /* silence GCC warning */
+ unsigned long hva;
int pfnmap = 0;
int tsize = BOOK3E_PAGESZ_4K;
@@ -521,7 +525,7 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
if (likely(!pfnmap)) {
unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
pfn = gfn_to_pfn_memslot(slot, gfn);
- if (is_error_pfn(pfn)) {
+ if (is_error_noslot_pfn(pfn)) {
printk(KERN_ERR "Couldn't get real page for gfn %lx!\n",
(long)gfn);
return;
@@ -541,6 +545,9 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
/* Clear i-cache for new pages */
kvmppc_mmu_flush_icache(pfn);
+
+ /* Drop refcount on page, so that mmu notifiers can clear it */
+ kvm_release_pfn_clean(pfn);
}
/* XXX only map the one-one case, for now use TLB0 */
@@ -682,14 +689,11 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
return EMULATE_DONE;
}
-int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
+int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea)
{
struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
unsigned int ia;
int esel, tlbsel;
- gva_t ea;
-
- ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb);
ia = (ea >> 2) & 0x1;
@@ -716,7 +720,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
}
static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
- int pid, int rt)
+ int pid, int type)
{
struct kvm_book3e_206_tlb_entry *tlbe;
int tid, esel;
@@ -725,7 +729,7 @@ static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) {
tlbe = get_entry(vcpu_e500, tlbsel, esel);
tid = get_tlb_tid(tlbe);
- if (rt == 0 || tid == pid) {
+ if (type == 0 || tid == pid) {
inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
}
@@ -733,14 +737,9 @@ static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
}
static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid,
- int ra, int rb)
+ gva_t ea)
{
int tlbsel, esel;
- gva_t ea;
-
- ea = kvmppc_get_gpr(&vcpu_e500->vcpu, rb);
- if (ra)
- ea += kvmppc_get_gpr(&vcpu_e500->vcpu, ra);
for (tlbsel = 0; tlbsel < 2; tlbsel++) {
esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1);
@@ -752,16 +751,16 @@ static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid,
}
}
-int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb)
+int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea)
{
struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
int pid = get_cur_spid(vcpu);
- if (rt == 0 || rt == 1) {
- tlbilx_all(vcpu_e500, 0, pid, rt);
- tlbilx_all(vcpu_e500, 1, pid, rt);
- } else if (rt == 3) {
- tlbilx_one(vcpu_e500, pid, ra, rb);
+ if (type == 0 || type == 1) {
+ tlbilx_all(vcpu_e500, 0, pid, type);
+ tlbilx_all(vcpu_e500, 1, pid, type);
+ } else if (type == 3) {
+ tlbilx_one(vcpu_e500, pid, ea);
}
return EMULATE_DONE;
@@ -786,16 +785,13 @@ int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
return EMULATE_DONE;
}
-int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb)
+int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
{
struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
int as = !!get_cur_sas(vcpu);
unsigned int pid = get_cur_spid(vcpu);
int esel, tlbsel;
struct kvm_book3e_206_tlb_entry *gtlbe = NULL;
- gva_t ea;
-
- ea = kvmppc_get_gpr(vcpu, rb);
for (tlbsel = 0; tlbsel < 2; tlbsel++) {
esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as);
@@ -875,6 +871,8 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
gtlbe->mas1 = vcpu->arch.shared->mas1;
gtlbe->mas2 = vcpu->arch.shared->mas2;
+ if (!(vcpu->arch.shared->msr & MSR_CM))
+ gtlbe->mas2 &= 0xffffffffUL;
gtlbe->mas7_3 = vcpu->arch.shared->mas7_3;
trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1,
@@ -1039,8 +1037,12 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
sesel = 0; /* unused */
priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
- kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
- &priv->ref, eaddr, &stlbe);
+ /* Only triggers after clear_tlb_refs */
+ if (unlikely(!(priv->ref.flags & E500_TLB_VALID)))
+ kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe);
+ else
+ kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
+ &priv->ref, eaddr, &stlbe);
break;
case 1: {
@@ -1060,6 +1062,49 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
}
+/************* MMU Notifiers *************/
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+ trace_kvm_unmap_hva(hva);
+
+ /*
+ * Flush all shadow tlb entries everywhere. This is slow, but
+ * we are 100% sure that we catch the to be unmapped page
+ */
+ kvm_flush_remote_tlbs(kvm);
+
+ return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ /* kvm_unmap_hva flushes everything anyways */
+ kvm_unmap_hva(kvm, start);
+
+ return 0;
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ /* XXX could be more clever ;) */
+ return 0;
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ /* XXX could be more clever ;) */
+ return 0;
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ /* The page will get remapped properly on its next fault */
+ kvm_unmap_hva(kvm, hva);
+}
+
+/*****************************************/
+
static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
{
int i;
@@ -1081,6 +1126,8 @@ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
}
vcpu_e500->num_shared_tlb_pages = 0;
+
+ kfree(vcpu_e500->shared_tlb_pages);
vcpu_e500->shared_tlb_pages = NULL;
} else {
kfree(vcpu_e500->gtlb_arch);
@@ -1178,21 +1225,27 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
}
virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL);
- if (!virt)
+ if (!virt) {
+ ret = -ENOMEM;
goto err_put_page;
+ }
privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0],
GFP_KERNEL);
privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1],
GFP_KERNEL);
- if (!privs[0] || !privs[1])
- goto err_put_page;
+ if (!privs[0] || !privs[1]) {
+ ret = -ENOMEM;
+ goto err_privs;
+ }
g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
GFP_KERNEL);
- if (!g2h_bitmap)
- goto err_put_page;
+ if (!g2h_bitmap) {
+ ret = -ENOMEM;
+ goto err_privs;
+ }
free_gtlb(vcpu_e500);
@@ -1232,10 +1285,11 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
kvmppc_recalc_tlb1map_range(vcpu_e500);
return 0;
-err_put_page:
+err_privs:
kfree(privs[0]);
kfree(privs[1]);
+err_put_page:
for (i = 0; i < num_pages; i++)
put_page(pages[i]);
@@ -1332,7 +1386,7 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
if (!vcpu_e500->gtlb_priv[1])
goto err;
- vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(unsigned int) *
+ vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) *
vcpu_e500->gtlb_params[1].entries,
GFP_KERNEL);
if (!vcpu_e500->g2h_tlb1_map)
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index ee04abaefe23..b0855e5d8905 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -131,6 +131,125 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb)
return vcpu->arch.dec - jd;
}
+static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+ enum emulation_result emulated = EMULATE_DONE;
+ ulong spr_val = kvmppc_get_gpr(vcpu, rs);
+
+ switch (sprn) {
+ case SPRN_SRR0:
+ vcpu->arch.shared->srr0 = spr_val;
+ break;
+ case SPRN_SRR1:
+ vcpu->arch.shared->srr1 = spr_val;
+ break;
+
+ /* XXX We need to context-switch the timebase for
+ * watchdog and FIT. */
+ case SPRN_TBWL: break;
+ case SPRN_TBWU: break;
+
+ case SPRN_MSSSR0: break;
+
+ case SPRN_DEC:
+ vcpu->arch.dec = spr_val;
+ kvmppc_emulate_dec(vcpu);
+ break;
+
+ case SPRN_SPRG0:
+ vcpu->arch.shared->sprg0 = spr_val;
+ break;
+ case SPRN_SPRG1:
+ vcpu->arch.shared->sprg1 = spr_val;
+ break;
+ case SPRN_SPRG2:
+ vcpu->arch.shared->sprg2 = spr_val;
+ break;
+ case SPRN_SPRG3:
+ vcpu->arch.shared->sprg3 = spr_val;
+ break;
+
+ default:
+ emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
+ spr_val);
+ if (emulated == EMULATE_FAIL)
+ printk(KERN_INFO "mtspr: unknown spr "
+ "0x%x\n", sprn);
+ break;
+ }
+
+ kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
+
+ return emulated;
+}
+
+static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+ enum emulation_result emulated = EMULATE_DONE;
+ ulong spr_val = 0;
+
+ switch (sprn) {
+ case SPRN_SRR0:
+ spr_val = vcpu->arch.shared->srr0;
+ break;
+ case SPRN_SRR1:
+ spr_val = vcpu->arch.shared->srr1;
+ break;
+ case SPRN_PVR:
+ spr_val = vcpu->arch.pvr;
+ break;
+ case SPRN_PIR:
+ spr_val = vcpu->vcpu_id;
+ break;
+ case SPRN_MSSSR0:
+ spr_val = 0;
+ break;
+
+ /* Note: mftb and TBRL/TBWL are user-accessible, so
+ * the guest can always access the real TB anyways.
+ * In fact, we probably will never see these traps. */
+ case SPRN_TBWL:
+ spr_val = get_tb() >> 32;
+ break;
+ case SPRN_TBWU:
+ spr_val = get_tb();
+ break;
+
+ case SPRN_SPRG0:
+ spr_val = vcpu->arch.shared->sprg0;
+ break;
+ case SPRN_SPRG1:
+ spr_val = vcpu->arch.shared->sprg1;
+ break;
+ case SPRN_SPRG2:
+ spr_val = vcpu->arch.shared->sprg2;
+ break;
+ case SPRN_SPRG3:
+ spr_val = vcpu->arch.shared->sprg3;
+ break;
+ /* Note: SPRG4-7 are user-readable, so we don't get
+ * a trap. */
+
+ case SPRN_DEC:
+ spr_val = kvmppc_get_dec(vcpu, get_tb());
+ break;
+ default:
+ emulated = kvmppc_core_emulate_mfspr(vcpu, sprn,
+ &spr_val);
+ if (unlikely(emulated == EMULATE_FAIL)) {
+ printk(KERN_INFO "mfspr: unknown spr "
+ "0x%x\n", sprn);
+ }
+ break;
+ }
+
+ if (emulated == EMULATE_DONE)
+ kvmppc_set_gpr(vcpu, rt, spr_val);
+ kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
+
+ return emulated;
+}
+
/* XXX to do:
* lhax
* lhaux
@@ -156,7 +275,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
int sprn = get_sprn(inst);
enum emulation_result emulated = EMULATE_DONE;
int advance = 1;
- ulong spr_val = 0;
/* this default type might be overwritten by subcategories */
kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
@@ -236,62 +354,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
break;
case OP_31_XOP_MFSPR:
- switch (sprn) {
- case SPRN_SRR0:
- spr_val = vcpu->arch.shared->srr0;
- break;
- case SPRN_SRR1:
- spr_val = vcpu->arch.shared->srr1;
- break;
- case SPRN_PVR:
- spr_val = vcpu->arch.pvr;
- break;
- case SPRN_PIR:
- spr_val = vcpu->vcpu_id;
- break;
- case SPRN_MSSSR0:
- spr_val = 0;
- break;
-
- /* Note: mftb and TBRL/TBWL are user-accessible, so
- * the guest can always access the real TB anyways.
- * In fact, we probably will never see these traps. */
- case SPRN_TBWL:
- spr_val = get_tb() >> 32;
- break;
- case SPRN_TBWU:
- spr_val = get_tb();
- break;
-
- case SPRN_SPRG0:
- spr_val = vcpu->arch.shared->sprg0;
- break;
- case SPRN_SPRG1:
- spr_val = vcpu->arch.shared->sprg1;
- break;
- case SPRN_SPRG2:
- spr_val = vcpu->arch.shared->sprg2;
- break;
- case SPRN_SPRG3:
- spr_val = vcpu->arch.shared->sprg3;
- break;
- /* Note: SPRG4-7 are user-readable, so we don't get
- * a trap. */
-
- case SPRN_DEC:
- spr_val = kvmppc_get_dec(vcpu, get_tb());
- break;
- default:
- emulated = kvmppc_core_emulate_mfspr(vcpu, sprn,
- &spr_val);
- if (unlikely(emulated == EMULATE_FAIL)) {
- printk(KERN_INFO "mfspr: unknown spr "
- "0x%x\n", sprn);
- }
- break;
- }
- kvmppc_set_gpr(vcpu, rt, spr_val);
- kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
+ emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt);
break;
case OP_31_XOP_STHX:
@@ -308,49 +371,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
break;
case OP_31_XOP_MTSPR:
- spr_val = kvmppc_get_gpr(vcpu, rs);
- switch (sprn) {
- case SPRN_SRR0:
- vcpu->arch.shared->srr0 = spr_val;
- break;
- case SPRN_SRR1:
- vcpu->arch.shared->srr1 = spr_val;
- break;
-
- /* XXX We need to context-switch the timebase for
- * watchdog and FIT. */
- case SPRN_TBWL: break;
- case SPRN_TBWU: break;
-
- case SPRN_MSSSR0: break;
-
- case SPRN_DEC:
- vcpu->arch.dec = spr_val;
- kvmppc_emulate_dec(vcpu);
- break;
-
- case SPRN_SPRG0:
- vcpu->arch.shared->sprg0 = spr_val;
- break;
- case SPRN_SPRG1:
- vcpu->arch.shared->sprg1 = spr_val;
- break;
- case SPRN_SPRG2:
- vcpu->arch.shared->sprg2 = spr_val;
- break;
- case SPRN_SPRG3:
- vcpu->arch.shared->sprg3 = spr_val;
- break;
-
- default:
- emulated = kvmppc_core_emulate_mtspr(vcpu, sprn,
- spr_val);
- if (emulated == EMULATE_FAIL)
- printk(KERN_INFO "mtspr: unknown spr "
- "0x%x\n", sprn);
- break;
- }
- kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
+ emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
break;
case OP_31_XOP_DCBI:
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 4d213b8b0fb5..70739a089560 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -30,6 +30,7 @@
#include <asm/kvm_ppc.h>
#include <asm/tlbflush.h>
#include <asm/cputhreads.h>
+#include <asm/irqflags.h>
#include "timing.h"
#include "../mm/mmu_decl.h"
@@ -38,8 +39,7 @@
int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
{
- return !(v->arch.shared->msr & MSR_WE) ||
- !!(v->arch.pending_exceptions) ||
+ return !!(v->arch.pending_exceptions) ||
v->requests;
}
@@ -48,6 +48,85 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
return 1;
}
+#ifndef CONFIG_KVM_BOOK3S_64_HV
+/*
+ * Common checks before entering the guest world. Call with interrupts
+ * disabled.
+ *
+ * returns:
+ *
+ * == 1 if we're ready to go into guest state
+ * <= 0 if we need to go back to the host with return value
+ */
+int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
+{
+ int r = 1;
+
+ WARN_ON_ONCE(!irqs_disabled());
+ while (true) {
+ if (need_resched()) {
+ local_irq_enable();
+ cond_resched();
+ local_irq_disable();
+ continue;
+ }
+
+ if (signal_pending(current)) {
+ kvmppc_account_exit(vcpu, SIGNAL_EXITS);
+ vcpu->run->exit_reason = KVM_EXIT_INTR;
+ r = -EINTR;
+ break;
+ }
+
+ vcpu->mode = IN_GUEST_MODE;
+
+ /*
+ * Reading vcpu->requests must happen after setting vcpu->mode,
+ * so we don't miss a request because the requester sees
+ * OUTSIDE_GUEST_MODE and assumes we'll be checking requests
+ * before next entering the guest (and thus doesn't IPI).
+ */
+ smp_mb();
+
+ if (vcpu->requests) {
+ /* Make sure we process requests preemptable */
+ local_irq_enable();
+ trace_kvm_check_requests(vcpu);
+ r = kvmppc_core_check_requests(vcpu);
+ local_irq_disable();
+ if (r > 0)
+ continue;
+ break;
+ }
+
+ if (kvmppc_core_prepare_to_enter(vcpu)) {
+ /* interrupts got enabled in between, so we
+ are back at square 1 */
+ continue;
+ }
+
+#ifdef CONFIG_PPC64
+ /* lazy EE magic */
+ hard_irq_disable();
+ if (lazy_irq_pending()) {
+ /* Got an interrupt in between, try again */
+ local_irq_enable();
+ local_irq_disable();
+ kvm_guest_exit();
+ continue;
+ }
+
+ trace_hardirqs_on();
+#endif
+
+ kvm_guest_enter();
+ break;
+ }
+
+ return r;
+}
+#endif /* CONFIG_KVM_BOOK3S_64_HV */
+
int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
{
int nr = kvmppc_get_gpr(vcpu, 11);
@@ -67,18 +146,18 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
}
switch (nr) {
- case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE:
+ case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE):
{
vcpu->arch.magic_page_pa = param1;
vcpu->arch.magic_page_ea = param2;
r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7;
- r = HC_EV_SUCCESS;
+ r = EV_SUCCESS;
break;
}
- case HC_VENDOR_KVM | KVM_HC_FEATURES:
- r = HC_EV_SUCCESS;
+ case KVM_HCALL_TOKEN(KVM_HC_FEATURES):
+ r = EV_SUCCESS;
#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2)
/* XXX Missing magic page on 44x */
r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
@@ -86,8 +165,13 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
/* Second return value is in r4 */
break;
+ case EV_HCALL_TOKEN(EV_IDLE):
+ r = EV_SUCCESS;
+ kvm_vcpu_block(vcpu);
+ clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
+ break;
default:
- r = HC_EV_UNIMPLEMENTED;
+ r = EV_UNIMPLEMENTED;
break;
}
@@ -220,6 +304,7 @@ int kvm_dev_ioctl_check_extension(long ext)
switch (ext) {
#ifdef CONFIG_BOOKE
case KVM_CAP_PPC_BOOKE_SREGS:
+ case KVM_CAP_PPC_BOOKE_WATCHDOG:
#else
case KVM_CAP_PPC_SEGSTATE:
case KVM_CAP_PPC_HIOR:
@@ -229,6 +314,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_PPC_IRQ_LEVEL:
case KVM_CAP_ENABLE_CAP:
case KVM_CAP_ONE_REG:
+ case KVM_CAP_IOEVENTFD:
r = 1;
break;
#ifndef CONFIG_KVM_BOOK3S_64_HV
@@ -260,10 +346,22 @@ int kvm_dev_ioctl_check_extension(long ext)
if (cpu_has_feature(CPU_FTR_ARCH_201))
r = 2;
break;
+#endif
case KVM_CAP_SYNC_MMU:
+#ifdef CONFIG_KVM_BOOK3S_64_HV
r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0;
+#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+ r = 1;
+#else
+ r = 0;
+ break;
+#endif
+#ifdef CONFIG_KVM_BOOK3S_64_HV
+ case KVM_CAP_PPC_HTAB_FD:
+ r = 1;
break;
#endif
+ break;
case KVM_CAP_NR_VCPUS:
/*
* Recommending a number of CPUs is somewhat arbitrary; we
@@ -302,19 +400,12 @@ long kvm_arch_dev_ioctl(struct file *filp,
void kvm_arch_free_memslot(struct kvm_memory_slot *free,
struct kvm_memory_slot *dont)
{
- if (!dont || free->arch.rmap != dont->arch.rmap) {
- vfree(free->arch.rmap);
- free->arch.rmap = NULL;
- }
+ kvmppc_core_free_memslot(free, dont);
}
int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
{
- slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
- if (!slot->arch.rmap)
- return -ENOMEM;
-
- return 0;
+ return kvmppc_core_create_memslot(slot, npages);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -323,7 +414,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
int user_alloc)
{
- return kvmppc_core_prepare_memory_region(kvm, mem);
+ return kvmppc_core_prepare_memory_region(kvm, memslot, mem);
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
@@ -331,7 +422,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
struct kvm_memory_slot old,
int user_alloc)
{
- kvmppc_core_commit_memory_region(kvm, mem);
+ kvmppc_core_commit_memory_region(kvm, mem, old);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -341,6 +432,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
+ kvmppc_core_flush_memslot(kvm, slot);
}
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
@@ -354,6 +446,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
return vcpu;
}
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
/* Make sure we're not using the vcpu anymore */
@@ -390,6 +487,8 @@ enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
+ int ret;
+
hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu);
vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup;
@@ -398,13 +497,14 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
#ifdef CONFIG_KVM_EXIT_TIMING
mutex_init(&vcpu->arch.exit_timing_lock);
#endif
-
- return 0;
+ ret = kvmppc_subarch_vcpu_init(vcpu);
+ return ret;
}
void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
kvmppc_mmu_destroy(vcpu);
+ kvmppc_subarch_vcpu_uninit(vcpu);
}
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -420,7 +520,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
#endif
kvmppc_core_vcpu_load(vcpu, cpu);
- vcpu->cpu = smp_processor_id();
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -429,7 +528,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
#ifdef CONFIG_BOOKE
vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
#endif
- vcpu->cpu = -1;
}
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
@@ -527,6 +625,13 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
vcpu->mmio_is_write = 0;
vcpu->arch.mmio_sign_extend = 0;
+ if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
+ bytes, &run->mmio.data)) {
+ kvmppc_complete_mmio_load(vcpu, run);
+ vcpu->mmio_needed = 0;
+ return EMULATE_DONE;
+ }
+
return EMULATE_DO_MMIO;
}
@@ -536,8 +641,8 @@ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu,
{
int r;
- r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian);
vcpu->arch.mmio_sign_extend = 1;
+ r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian);
return r;
}
@@ -575,6 +680,13 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
}
+ if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
+ bytes, &run->mmio.data)) {
+ kvmppc_complete_mmio_load(vcpu, run);
+ vcpu->mmio_needed = 0;
+ return EMULATE_DONE;
+ }
+
return EMULATE_DO_MMIO;
}
@@ -649,6 +761,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
r = 0;
vcpu->arch.papr_enabled = true;
break;
+#ifdef CONFIG_BOOKE
+ case KVM_CAP_PPC_BOOKE_WATCHDOG:
+ r = 0;
+ vcpu->arch.watchdog_enabled = true;
+ break;
+#endif
#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
case KVM_CAP_SW_TLB: {
struct kvm_config_tlb cfg;
@@ -751,9 +869,16 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
{
+ u32 inst_nop = 0x60000000;
+#ifdef CONFIG_KVM_BOOKE_HV
+ u32 inst_sc1 = 0x44000022;
+ pvinfo->hcall[0] = inst_sc1;
+ pvinfo->hcall[1] = inst_nop;
+ pvinfo->hcall[2] = inst_nop;
+ pvinfo->hcall[3] = inst_nop;
+#else
u32 inst_lis = 0x3c000000;
u32 inst_ori = 0x60000000;
- u32 inst_nop = 0x60000000;
u32 inst_sc = 0x44000002;
u32 inst_imm_mask = 0xffff;
@@ -770,6 +895,9 @@ static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo)
pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask);
pvinfo->hcall[2] = inst_sc;
pvinfo->hcall[3] = inst_nop;
+#endif
+
+ pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE;
return 0;
}
@@ -832,6 +960,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
+
+ case KVM_PPC_GET_HTAB_FD: {
+ struct kvm *kvm = filp->private_data;
+ struct kvm_get_htab_fd ghf;
+
+ r = -EFAULT;
+ if (copy_from_user(&ghf, argp, sizeof(ghf)))
+ break;
+ r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
+ break;
+ }
#endif /* CONFIG_KVM_BOOK3S_64_HV */
#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index ddb6a2149d44..e326489a5420 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -31,6 +31,126 @@ TRACE_EVENT(kvm_ppc_instr,
__entry->inst, __entry->pc, __entry->emulate)
);
+#ifdef CONFIG_PPC_BOOK3S
+#define kvm_trace_symbol_exit \
+ {0x100, "SYSTEM_RESET"}, \
+ {0x200, "MACHINE_CHECK"}, \
+ {0x300, "DATA_STORAGE"}, \
+ {0x380, "DATA_SEGMENT"}, \
+ {0x400, "INST_STORAGE"}, \
+ {0x480, "INST_SEGMENT"}, \
+ {0x500, "EXTERNAL"}, \
+ {0x501, "EXTERNAL_LEVEL"}, \
+ {0x502, "EXTERNAL_HV"}, \
+ {0x600, "ALIGNMENT"}, \
+ {0x700, "PROGRAM"}, \
+ {0x800, "FP_UNAVAIL"}, \
+ {0x900, "DECREMENTER"}, \
+ {0x980, "HV_DECREMENTER"}, \
+ {0xc00, "SYSCALL"}, \
+ {0xd00, "TRACE"}, \
+ {0xe00, "H_DATA_STORAGE"}, \
+ {0xe20, "H_INST_STORAGE"}, \
+ {0xe40, "H_EMUL_ASSIST"}, \
+ {0xf00, "PERFMON"}, \
+ {0xf20, "ALTIVEC"}, \
+ {0xf40, "VSX"}
+#else
+#define kvm_trace_symbol_exit \
+ {0, "CRITICAL"}, \
+ {1, "MACHINE_CHECK"}, \
+ {2, "DATA_STORAGE"}, \
+ {3, "INST_STORAGE"}, \
+ {4, "EXTERNAL"}, \
+ {5, "ALIGNMENT"}, \
+ {6, "PROGRAM"}, \
+ {7, "FP_UNAVAIL"}, \
+ {8, "SYSCALL"}, \
+ {9, "AP_UNAVAIL"}, \
+ {10, "DECREMENTER"}, \
+ {11, "FIT"}, \
+ {12, "WATCHDOG"}, \
+ {13, "DTLB_MISS"}, \
+ {14, "ITLB_MISS"}, \
+ {15, "DEBUG"}, \
+ {32, "SPE_UNAVAIL"}, \
+ {33, "SPE_FP_DATA"}, \
+ {34, "SPE_FP_ROUND"}, \
+ {35, "PERFORMANCE_MONITOR"}, \
+ {36, "DOORBELL"}, \
+ {37, "DOORBELL_CRITICAL"}, \
+ {38, "GUEST_DBELL"}, \
+ {39, "GUEST_DBELL_CRIT"}, \
+ {40, "HV_SYSCALL"}, \
+ {41, "HV_PRIV"}
+#endif
+
+TRACE_EVENT(kvm_exit,
+ TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
+ TP_ARGS(exit_nr, vcpu),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, exit_nr )
+ __field( unsigned long, pc )
+ __field( unsigned long, msr )
+ __field( unsigned long, dar )
+#ifdef CONFIG_KVM_BOOK3S_PR
+ __field( unsigned long, srr1 )
+#endif
+ __field( unsigned long, last_inst )
+ ),
+
+ TP_fast_assign(
+#ifdef CONFIG_KVM_BOOK3S_PR
+ struct kvmppc_book3s_shadow_vcpu *svcpu;
+#endif
+ __entry->exit_nr = exit_nr;
+ __entry->pc = kvmppc_get_pc(vcpu);
+ __entry->dar = kvmppc_get_fault_dar(vcpu);
+ __entry->msr = vcpu->arch.shared->msr;
+#ifdef CONFIG_KVM_BOOK3S_PR
+ svcpu = svcpu_get(vcpu);
+ __entry->srr1 = svcpu->shadow_srr1;
+ svcpu_put(svcpu);
+#endif
+ __entry->last_inst = vcpu->arch.last_inst;
+ ),
+
+ TP_printk("exit=%s"
+ " | pc=0x%lx"
+ " | msr=0x%lx"
+ " | dar=0x%lx"
+#ifdef CONFIG_KVM_BOOK3S_PR
+ " | srr1=0x%lx"
+#endif
+ " | last_inst=0x%lx"
+ ,
+ __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit),
+ __entry->pc,
+ __entry->msr,
+ __entry->dar,
+#ifdef CONFIG_KVM_BOOK3S_PR
+ __entry->srr1,
+#endif
+ __entry->last_inst
+ )
+);
+
+TRACE_EVENT(kvm_unmap_hva,
+ TP_PROTO(unsigned long hva),
+ TP_ARGS(hva),
+
+ TP_STRUCT__entry(
+ __field( unsigned long, hva )
+ ),
+
+ TP_fast_assign(
+ __entry->hva = hva;
+ ),
+
+ TP_printk("unmap hva 0x%lx\n", __entry->hva)
+);
+
TRACE_EVENT(kvm_stlb_inval,
TP_PROTO(unsigned int stlb_index),
TP_ARGS(stlb_index),
@@ -98,41 +218,31 @@ TRACE_EVENT(kvm_gtlb_write,
__entry->word1, __entry->word2)
);
-
-/*************************************************************************
- * Book3S trace points *
- *************************************************************************/
-
-#ifdef CONFIG_KVM_BOOK3S_PR
-
-TRACE_EVENT(kvm_book3s_exit,
- TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
- TP_ARGS(exit_nr, vcpu),
+TRACE_EVENT(kvm_check_requests,
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu),
TP_STRUCT__entry(
- __field( unsigned int, exit_nr )
- __field( unsigned long, pc )
- __field( unsigned long, msr )
- __field( unsigned long, dar )
- __field( unsigned long, srr1 )
+ __field( __u32, cpu_nr )
+ __field( __u32, requests )
),
TP_fast_assign(
- struct kvmppc_book3s_shadow_vcpu *svcpu;
- __entry->exit_nr = exit_nr;
- __entry->pc = kvmppc_get_pc(vcpu);
- __entry->dar = kvmppc_get_fault_dar(vcpu);
- __entry->msr = vcpu->arch.shared->msr;
- svcpu = svcpu_get(vcpu);
- __entry->srr1 = svcpu->shadow_srr1;
- svcpu_put(svcpu);
+ __entry->cpu_nr = vcpu->vcpu_id;
+ __entry->requests = vcpu->requests;
),
- TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx",
- __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar,
- __entry->srr1)
+ TP_printk("vcpu=%x requests=%x",
+ __entry->cpu_nr, __entry->requests)
);
+
+/*************************************************************************
+ * Book3S trace points *
+ *************************************************************************/
+
+#ifdef CONFIG_KVM_BOOK3S_PR
+
TRACE_EVENT(kvm_book3s_reenter,
TP_PROTO(int r, struct kvm_vcpu *vcpu),
TP_ARGS(r, vcpu),
@@ -395,6 +505,44 @@ TRACE_EVENT(kvm_booke206_gtlb_write,
__entry->mas2, __entry->mas7_3)
);
+TRACE_EVENT(kvm_booke206_ref_release,
+ TP_PROTO(__u64 pfn, __u32 flags),
+ TP_ARGS(pfn, flags),
+
+ TP_STRUCT__entry(
+ __field( __u64, pfn )
+ __field( __u32, flags )
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = pfn;
+ __entry->flags = flags;
+ ),
+
+ TP_printk("pfn=%llx flags=%x",
+ __entry->pfn, __entry->flags)
+);
+
+TRACE_EVENT(kvm_booke_queue_irqprio,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority),
+ TP_ARGS(vcpu, priority),
+
+ TP_STRUCT__entry(
+ __field( __u32, cpu_nr )
+ __field( __u32, priority )
+ __field( unsigned long, pending )
+ ),
+
+ TP_fast_assign(
+ __entry->cpu_nr = vcpu->vcpu_id;
+ __entry->priority = priority;
+ __entry->pending = vcpu->arch.pending_exceptions;
+ ),
+
+ TP_printk("vcpu=%x prio=%x pending=%lx",
+ __entry->cpu_nr, __entry->priority, __entry->pending)
+);
+
#endif
#endif /* _TRACE_KVM_H */
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index e7a896acd982..48a920d51489 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -90,6 +90,7 @@ config MPIC
config PPC_EPAPR_HV_PIC
bool
default n
+ select EPAPR_PARAVIRT
config MPIC_WEIRD
bool
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index 51ffafae561e..63c5f04ea580 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -236,7 +236,6 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc)
u32 intr_index;
u32 have_shift = 0;
struct fsl_msi_cascade_data *cascade_data;
- unsigned int ret;
cascade_data = irq_get_handler_data(irq);
msi_data = cascade_data->msi_data;
@@ -268,7 +267,9 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc)
case FSL_PIC_IP_IPIC:
msir_value = fsl_msi_read(msi_data->msi_regs, msir_index * 0x4);
break;
- case FSL_PIC_IP_VMPIC:
+#ifdef CONFIG_EPAPR_PARAVIRT
+ case FSL_PIC_IP_VMPIC: {
+ unsigned int ret;
ret = fh_vmpic_get_msir(virq_to_hw(irq), &msir_value);
if (ret) {
pr_err("fsl-msi: fh_vmpic_get_msir() failed for "
@@ -277,6 +278,8 @@ static void fsl_msi_cascade(unsigned int irq, struct irq_desc *desc)
}
break;
}
+#endif
+ }
while (msir_value) {
intr_index = ffs(msir_value) - 1;
@@ -508,10 +511,12 @@ static const struct of_device_id fsl_of_msi_ids[] = {
.compatible = "fsl,ipic-msi",
.data = &ipic_msi_feature,
},
+#ifdef CONFIG_EPAPR_PARAVIRT
{
.compatible = "fsl,vmpic-msi",
.data = &vmpic_msi_feature,
},
+#endif
{}
};
diff --git a/arch/powerpc/sysdev/fsl_soc.c b/arch/powerpc/sysdev/fsl_soc.c
index c449dbd1c938..97118dc3d285 100644
--- a/arch/powerpc/sysdev/fsl_soc.c
+++ b/arch/powerpc/sysdev/fsl_soc.c
@@ -253,6 +253,7 @@ struct platform_diu_data_ops diu_ops;
EXPORT_SYMBOL(diu_ops);
#endif
+#ifdef CONFIG_EPAPR_PARAVIRT
/*
* Restart the current partition
*
@@ -278,3 +279,4 @@ void fsl_hv_halt(void)
pr_info("hv exit\n");
fh_partition_stop(-1);
}
+#endif
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index ff1e2f8ef94a..c30615e605ac 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -629,10 +629,27 @@ int kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
break;
case KVM_S390_SIGP_STOP:
case KVM_S390_RESTART:
+ VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
+ inti->type = s390int->type;
+ break;
case KVM_S390_INT_EXTERNAL_CALL:
+ if (s390int->parm & 0xffff0000) {
+ kfree(inti);
+ return -EINVAL;
+ }
+ VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u",
+ s390int->parm);
+ inti->type = s390int->type;
+ inti->extcall.code = s390int->parm;
+ break;
case KVM_S390_INT_EMERGENCY:
- VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type);
+ if (s390int->parm & 0xffff0000) {
+ kfree(inti);
+ return -EINVAL;
+ }
+ VCPU_EVENT(vcpu, 3, "inject: emergency %u\n", s390int->parm);
inti->type = s390int->type;
+ inti->emerg.code = s390int->parm;
break;
case KVM_S390_INT_VIRTIO:
case KVM_S390_INT_SERVICE:
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index d91a95568002..c9011bfaabbe 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -355,6 +355,11 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags);
}
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH |
@@ -993,7 +998,7 @@ static int __init kvm_s390_init(void)
}
memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
facilities[0] &= 0xff00fff3f47c0000ULL;
- facilities[1] &= 0x201c000000000000ULL;
+ facilities[1] &= 0x001c000000000000ULL;
return 0;
}
diff --git a/arch/x86/include/asm/clocksource.h b/arch/x86/include/asm/clocksource.h
index 0bdbbb3b9ce7..16a57f4ed64d 100644
--- a/arch/x86/include/asm/clocksource.h
+++ b/arch/x86/include/asm/clocksource.h
@@ -8,6 +8,7 @@
#define VCLOCK_NONE 0 /* No vDSO clock available. */
#define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */
#define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */
+#define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */
struct arch_clocksource_data {
int vclock_mode;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index da40b1e2228e..2d9075e863a0 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -202,6 +202,7 @@
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
+#define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */
#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */
#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */
#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4da3c0c4c974..a09c28571064 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -19,6 +19,7 @@
#include <asm/acpi.h>
#include <asm/apicdef.h>
#include <asm/page.h>
+#include <asm/pvclock.h>
#ifdef CONFIG_X86_32
#include <linux/threads.h>
#include <asm/kmap_types.h>
@@ -81,6 +82,10 @@ enum fixed_addresses {
VVAR_PAGE,
VSYSCALL_HPET,
#endif
+#ifdef CONFIG_PARAVIRT_CLOCK
+ PVCLOCK_FIXMAP_BEGIN,
+ PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1,
+#endif
FIX_DBGP_BASE,
FIX_EARLYCON_MEM_BASE,
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
index 317ff1703d0b..6080d2694bad 100644
--- a/arch/x86/include/asm/kexec.h
+++ b/arch/x86/include/asm/kexec.h
@@ -163,6 +163,9 @@ struct kimage_arch {
};
#endif
+typedef void crash_vmclear_fn(void);
+extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;
+
#endif /* __ASSEMBLY__ */
#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h
new file mode 100644
index 000000000000..a92b1763c419
--- /dev/null
+++ b/arch/x86/include/asm/kvm_guest.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_KVM_GUEST_H
+#define _ASM_X86_KVM_GUEST_H
+
+int kvm_setup_vsyscall_timeinfo(void);
+
+#endif /* _ASM_X86_KVM_GUEST_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b2e11f452435..dc87b65e9c3a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -22,6 +22,8 @@
#include <linux/kvm_para.h>
#include <linux/kvm_types.h>
#include <linux/perf_event.h>
+#include <linux/pvclock_gtod.h>
+#include <linux/clocksource.h>
#include <asm/pvclock-abi.h>
#include <asm/desc.h>
@@ -442,6 +444,7 @@ struct kvm_vcpu_arch {
s8 virtual_tsc_shift;
u32 virtual_tsc_mult;
u32 virtual_tsc_khz;
+ s64 ia32_tsc_adjust_msr;
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
@@ -559,6 +562,12 @@ struct kvm_arch {
u64 cur_tsc_write;
u64 cur_tsc_offset;
u8 cur_tsc_generation;
+ int nr_vcpus_matched_tsc;
+
+ spinlock_t pvclock_gtod_sync_lock;
+ bool use_master_clock;
+ u64 master_kernel_ns;
+ cycle_t master_cycle_now;
struct kvm_xen_hvm_config xen_hvm_config;
@@ -612,6 +621,12 @@ struct kvm_vcpu_stat {
struct x86_instruction_info;
+struct msr_data {
+ bool host_initiated;
+ u32 index;
+ u64 data;
+};
+
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -634,7 +649,7 @@ struct kvm_x86_ops {
void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu);
int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
- int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr);
u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
void (*get_segment)(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -697,10 +712,11 @@ struct kvm_x86_ops {
bool (*has_wbinvd_exit)(void);
void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale);
+ u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
- u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu);
+ u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
@@ -785,7 +801,7 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
void kvm_enable_efer_bits(u64);
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr);
struct x86_emulate_ctxt;
@@ -812,7 +828,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr);
unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e400cdb2dd65..6e930b218724 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -236,6 +236,7 @@
#define MSR_IA32_EBL_CR_POWERON 0x0000002a
#define MSR_EBC_FREQUENCY_ID 0x0000002c
#define MSR_IA32_FEATURE_CONTROL 0x0000003a
+#define MSR_IA32_TSC_ADJUST 0x0000003b
#define FEATURE_CONTROL_LOCKED (1<<0)
#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index c59cc97fe6c1..109a9dd5d454 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -6,6 +6,7 @@
/* some helper functions for xen and kvm pv clock sources */
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
void pvclock_set_flags(u8 flags);
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
@@ -56,4 +57,50 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
return product;
}
+static __always_inline
+u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
+{
+ u64 delta = __native_read_tsc() - src->tsc_timestamp;
+ return pvclock_scale_delta(delta, src->tsc_to_system_mul,
+ src->tsc_shift);
+}
+
+static __always_inline
+unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
+ cycle_t *cycles, u8 *flags)
+{
+ unsigned version;
+ cycle_t ret, offset;
+ u8 ret_flags;
+
+ version = src->version;
+ /* Note: emulated platforms which do not advertise SSE2 support
+ * result in kvmclock not using the necessary RDTSC barriers.
+ * Without barriers, it is possible that RDTSC instruction reads from
+ * the time stamp counter outside rdtsc_barrier protected section
+ * below, resulting in violation of monotonicity.
+ */
+ rdtsc_barrier();
+ offset = pvclock_get_nsec_offset(src);
+ ret = src->system_time + offset;
+ ret_flags = src->flags;
+ rdtsc_barrier();
+
+ *cycles = ret;
+ *flags = ret_flags;
+ return version;
+}
+
+struct pvclock_vsyscall_time_info {
+ struct pvclock_vcpu_time_info pvti;
+ u32 migrate_count;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
+#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1)
+
+int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
+ int size);
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu);
+
#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 36ec21c36d68..c2d56b34830d 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -445,8 +445,7 @@ enum vmcs_field {
#define VMX_EPTP_WB_BIT (1ull << 14)
#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
-#define VMX_EPT_AD_BIT (1ull << 21)
-#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
+#define VMX_EPT_AD_BIT (1ull << 21)
#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index eaea1d31f753..80f80955cfd8 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -33,6 +33,26 @@ extern void map_vsyscall(void);
*/
extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address);
+#ifdef CONFIG_X86_64
+
+#define VGETCPU_CPU_MASK 0xfff
+
+static inline unsigned int __getcpu(void)
+{
+ unsigned int p;
+
+ if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
+ /* Load per CPU data from RDTSCP */
+ native_read_tscp(&p);
+ } else {
+ /* Load per CPU data from GDT */
+ asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
+ }
+
+ return p;
+}
+#endif /* CONFIG_X86_64 */
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 13ad89971d47..74467feb4dc5 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -16,6 +16,7 @@
#include <linux/delay.h>
#include <linux/elf.h>
#include <linux/elfcore.h>
+#include <linux/module.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -30,6 +31,27 @@
int in_crash_kexec;
+/*
+ * This is used to VMCLEAR all VMCSs loaded on the
+ * processor. And when loading kvm_intel module, the
+ * callback function pointer will be assigned.
+ *
+ * protected by rcu.
+ */
+crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL;
+EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss);
+
+static inline void cpu_crash_vmclear_loaded_vmcss(void)
+{
+ crash_vmclear_fn *do_vmclear_operation = NULL;
+
+ rcu_read_lock();
+ do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss);
+ if (do_vmclear_operation)
+ do_vmclear_operation();
+ rcu_read_unlock();
+}
+
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
@@ -46,6 +68,11 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
#endif
crash_save_cpu(regs, cpu);
+ /*
+ * VMCLEAR VMCSs loaded on all cpus if needed.
+ */
+ cpu_crash_vmclear_loaded_vmcss();
+
/* Disable VMX or SVM if needed.
*
* We need to disable virtualization on all CPUs.
@@ -88,6 +115,11 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
kdump_nmi_shootdown_cpus();
+ /*
+ * VMCLEAR VMCSs loaded on this cpu if needed.
+ */
+ cpu_crash_vmclear_loaded_vmcss();
+
/* Booting kdump kernel with VMX or SVM enabled won't work,
* because (among other limitations) we can't disable paging
* with the virt flags.
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 4180a874c764..08b973f64032 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -42,6 +42,7 @@
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
+#include <asm/kvm_guest.h>
static int kvmapf = 1;
@@ -62,6 +63,15 @@ static int parse_no_stealacc(char *arg)
early_param("no-steal-acc", parse_no_stealacc);
+static int kvmclock_vsyscall = 1;
+static int parse_no_kvmclock_vsyscall(char *arg)
+{
+ kvmclock_vsyscall = 0;
+ return 0;
+}
+
+early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
+
static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;
@@ -110,11 +120,6 @@ void kvm_async_pf_task_wait(u32 token)
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
struct kvm_task_sleep_node n, *e;
DEFINE_WAIT(wait);
- int cpu, idle;
-
- cpu = get_cpu();
- idle = idle_cpu(cpu);
- put_cpu();
spin_lock(&b->lock);
e = _find_apf_task(b, token);
@@ -128,7 +133,7 @@ void kvm_async_pf_task_wait(u32 token)
n.token = token;
n.cpu = smp_processor_id();
- n.halted = idle || preempt_count() > 1;
+ n.halted = is_idle_task(current) || preempt_count() > 1;
init_waitqueue_head(&n.wq);
hlist_add_head(&n.link, &b->list);
spin_unlock(&b->lock);
@@ -471,6 +476,9 @@ void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
+ if (kvmclock_vsyscall)
+ kvm_setup_vsyscall_timeinfo();
+
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
register_cpu_notifier(&kvm_cpu_notifier);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f1b42b3a186c..220a360010f8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -23,6 +23,7 @@
#include <asm/apic.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/memblock.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
@@ -39,7 +40,7 @@ static int parse_no_kvmclock(char *arg)
early_param("no-kvmclock", parse_no_kvmclock);
/* The hypervisor will put information about time periodically here */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock);
+static struct pvclock_vsyscall_time_info *hv_clock;
static struct pvclock_wall_clock wall_clock;
/*
@@ -52,15 +53,20 @@ static unsigned long kvm_get_wallclock(void)
struct pvclock_vcpu_time_info *vcpu_time;
struct timespec ts;
int low, high;
+ int cpu;
low = (int)__pa_symbol(&wall_clock);
high = ((u64)__pa_symbol(&wall_clock) >> 32);
native_write_msr(msr_kvm_wall_clock, low, high);
- vcpu_time = &get_cpu_var(hv_clock);
+ preempt_disable();
+ cpu = smp_processor_id();
+
+ vcpu_time = &hv_clock[cpu].pvti;
pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
- put_cpu_var(hv_clock);
+
+ preempt_enable();
return ts.tv_sec;
}
@@ -74,9 +80,11 @@ static cycle_t kvm_clock_read(void)
{
struct pvclock_vcpu_time_info *src;
cycle_t ret;
+ int cpu;
preempt_disable_notrace();
- src = &__get_cpu_var(hv_clock);
+ cpu = smp_processor_id();
+ src = &hv_clock[cpu].pvti;
ret = pvclock_clocksource_read(src);
preempt_enable_notrace();
return ret;
@@ -99,8 +107,15 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
static unsigned long kvm_get_tsc_khz(void)
{
struct pvclock_vcpu_time_info *src;
- src = &per_cpu(hv_clock, 0);
- return pvclock_tsc_khz(src);
+ int cpu;
+ unsigned long tsc_khz;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+ src = &hv_clock[cpu].pvti;
+ tsc_khz = pvclock_tsc_khz(src);
+ preempt_enable();
+ return tsc_khz;
}
static void kvm_get_preset_lpj(void)
@@ -119,10 +134,14 @@ bool kvm_check_and_clear_guest_paused(void)
{
bool ret = false;
struct pvclock_vcpu_time_info *src;
+ int cpu = smp_processor_id();
- src = &__get_cpu_var(hv_clock);
+ if (!hv_clock)
+ return ret;
+
+ src = &hv_clock[cpu].pvti;
if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) {
- __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED);
+ src->flags &= ~PVCLOCK_GUEST_STOPPED;
ret = true;
}
@@ -141,9 +160,10 @@ int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high, ret;
+ struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti;
- low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
- high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+ low = (int)__pa(src) | 1;
+ high = ((u64)__pa(src) >> 32);
ret = native_write_msr_safe(msr_kvm_system_time, low, high);
printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
cpu, high, low, txt);
@@ -197,6 +217,8 @@ static void kvm_shutdown(void)
void __init kvmclock_init(void)
{
+ unsigned long mem;
+
if (!kvm_para_available())
return;
@@ -209,8 +231,18 @@ void __init kvmclock_init(void)
printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
msr_kvm_system_time, msr_kvm_wall_clock);
- if (kvm_register_clock("boot clock"))
+ mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS,
+ PAGE_SIZE);
+ if (!mem)
+ return;
+ hv_clock = __va(mem);
+
+ if (kvm_register_clock("boot clock")) {
+ hv_clock = NULL;
+ memblock_free(mem,
+ sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
return;
+ }
pv_time_ops.sched_clock = kvm_clock_read;
x86_platform.calibrate_tsc = kvm_get_tsc_khz;
x86_platform.get_wallclock = kvm_get_wallclock;
@@ -233,3 +265,37 @@ void __init kvmclock_init(void)
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
}
+
+int __init kvm_setup_vsyscall_timeinfo(void)
+{
+#ifdef CONFIG_X86_64
+ int cpu;
+ int ret;
+ u8 flags;
+ struct pvclock_vcpu_time_info *vcpu_time;
+ unsigned int size;
+
+ size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+
+ vcpu_time = &hv_clock[cpu].pvti;
+ flags = pvclock_read_flags(vcpu_time);
+
+ if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
+ preempt_enable();
+ return 1;
+ }
+
+ if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
+ preempt_enable();
+ return ret;
+ }
+
+ preempt_enable();
+
+ kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
+#endif
+ return 0;
+}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 42eb3300dfc6..85c39590c1a4 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -17,23 +17,13 @@
#include <linux/kernel.h>
#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/bootmem.h>
+#include <asm/fixmap.h>
#include <asm/pvclock.h>
-/*
- * These are perodically updated
- * xen: magic shared_info page
- * kvm: gpa registered via msr
- * and then copied here.
- */
-struct pvclock_shadow_time {
- u64 tsc_timestamp; /* TSC at last update of time vals. */
- u64 system_timestamp; /* Time, in nanosecs, since boot. */
- u32 tsc_to_nsec_mul;
- int tsc_shift;
- u32 version;
- u8 flags;
-};
-
static u8 valid_flags __read_mostly = 0;
void pvclock_set_flags(u8 flags)
@@ -41,34 +31,6 @@ void pvclock_set_flags(u8 flags)
valid_flags = flags;
}
-static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
-{
- u64 delta = native_read_tsc() - shadow->tsc_timestamp;
- return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
- shadow->tsc_shift);
-}
-
-/*
- * Reads a consistent set of time-base values from hypervisor,
- * into a shadow data area.
- */
-static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
- struct pvclock_vcpu_time_info *src)
-{
- do {
- dst->version = src->version;
- rmb(); /* fetch version before data */
- dst->tsc_timestamp = src->tsc_timestamp;
- dst->system_timestamp = src->system_time;
- dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
- dst->tsc_shift = src->tsc_shift;
- dst->flags = src->flags;
- rmb(); /* test version after fetching data */
- } while ((src->version & 1) || (dst->version != src->version));
-
- return dst->version;
-}
-
unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
{
u64 pv_tsc_khz = 1000000ULL << 32;
@@ -88,23 +50,32 @@ void pvclock_resume(void)
atomic64_set(&last_value, 0);
}
+u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
+{
+ unsigned version;
+ cycle_t ret;
+ u8 flags;
+
+ do {
+ version = __pvclock_read_cycles(src, &ret, &flags);
+ } while ((src->version & 1) || version != src->version);
+
+ return flags & valid_flags;
+}
+
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
- struct pvclock_shadow_time shadow;
unsigned version;
- cycle_t ret, offset;
+ cycle_t ret;
u64 last;
+ u8 flags;
do {
- version = pvclock_get_time_values(&shadow, src);
- barrier();
- offset = pvclock_get_nsec_offset(&shadow);
- ret = shadow.system_timestamp + offset;
- barrier();
- } while (version != src->version);
+ version = __pvclock_read_cycles(src, &ret, &flags);
+ } while ((src->version & 1) || version != src->version);
if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
- (shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+ (flags & PVCLOCK_TSC_STABLE_BIT))
return ret;
/*
@@ -156,3 +127,71 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
+
+static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
+
+static struct pvclock_vsyscall_time_info *
+pvclock_get_vsyscall_user_time_info(int cpu)
+{
+ if (!pvclock_vdso_info) {
+ BUG();
+ return NULL;
+ }
+
+ return &pvclock_vdso_info[cpu];
+}
+
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
+{
+ return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
+}
+
+#ifdef CONFIG_X86_64
+static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
+ void *v)
+{
+ struct task_migration_notifier *mn = v;
+ struct pvclock_vsyscall_time_info *pvti;
+
+ pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
+
+ /* this is NULL when pvclock vsyscall is not initialized */
+ if (unlikely(pvti == NULL))
+ return NOTIFY_DONE;
+
+ pvti->migrate_count++;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block pvclock_migrate = {
+ .notifier_call = pvclock_task_migrate,
+};
+
+/*
+ * Initialize the generic pvclock vsyscall state. This will allocate
+ * a/some page(s) for the per-vcpu pvclock information, set up a
+ * fixmap mapping for the page(s)
+ */
+
+int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
+ int size)
+{
+ int idx;
+
+ WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
+
+ pvclock_vdso_info = i;
+
+ for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
+ __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
+ __pa_symbol(i) + (idx*PAGE_SIZE),
+ PAGE_KERNEL_VVAR);
+ }
+
+
+ register_task_migration_notifier(&pvclock_migrate);
+
+ return 0;
+}
+#endif
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index ec79e773342e..a20ecb5b6cbf 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -320,6 +320,8 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
if (index == 0) {
entry->ebx &= kvm_supported_word9_x86_features;
cpuid_mask(&entry->ebx, 9);
+ // TSC_ADJUST is emulated
+ entry->ebx |= F(TSC_ADJUST);
} else
entry->ebx = 0;
entry->eax = 0;
@@ -659,6 +661,7 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
} else
*eax = *ebx = *ecx = *edx = 0;
}
+EXPORT_SYMBOL_GPL(kvm_cpuid);
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 58fc51488828..b7fd07984888 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -31,6 +31,14 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
return best && (best->ecx & bit(X86_FEATURE_XSAVE));
}
+static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST));
+}
+
static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index bba39bfa1c4b..a27e76371108 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -676,8 +676,9 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
addr.seg);
if (!usable)
goto bad;
- /* code segment or read-only data segment */
- if (((desc.type & 8) || !(desc.type & 2)) && write)
+ /* code segment in protected mode or read-only data segment */
+ if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8))
+ || !(desc.type & 2)) && write)
goto bad;
/* unreadable code segment */
if (!fetch && (desc.type & 8) && !(desc.type & 2))
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 43e9fadca5d0..9392f527f107 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1011,7 +1011,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
local_irq_save(flags);
now = apic->lapic_timer.timer.base->get_time();
- guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+ guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6f85fe0bf958..01d7c2ad05f5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2382,12 +2382,20 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
|| (!vcpu->arch.mmu.direct_map && write_fault
&& !is_write_protection(vcpu) && !user_fault)) {
+ /*
+ * There are two cases:
+ * - the one is other vcpu creates new sp in the window
+ * between mapping_level() and acquiring mmu-lock.
+ * - the another case is the new sp is created by itself
+ * (page-fault path) when guest uses the target gfn as
+ * its page table.
+ * Both of these cases can be fixed by allowing guest to
+ * retry the access, it will refault, then we can establish
+ * the mapping by using small page.
+ */
if (level > PT_PAGE_TABLE_LEVEL &&
- has_wrprotected_page(vcpu->kvm, gfn, level)) {
- ret = 1;
- drop_spte(vcpu->kvm, sptep);
+ has_wrprotected_page(vcpu->kvm, gfn, level))
goto done;
- }
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@ -2505,6 +2513,14 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
mmu_free_roots(vcpu);
}
+static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+{
+ int bit7;
+
+ bit7 = (gpte >> 7) & 1;
+ return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
{
@@ -2517,6 +2533,26 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
return gfn_to_pfn_memslot_atomic(slot, gfn);
}
+static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ u64 gpte)
+{
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!is_present_gpte(gpte))
+ goto no_present;
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte);
+ return true;
+}
+
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *start, u64 *end)
@@ -2671,7 +2707,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
* PT_PAGE_TABLE_LEVEL and there would be no adjustment done
* here.
*/
- if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+ if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
PageTransCompound(pfn_to_page(pfn)) &&
!has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
@@ -2699,18 +2735,13 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
}
}
-static bool mmu_invalid_pfn(pfn_t pfn)
-{
- return unlikely(is_invalid_pfn(pfn));
-}
-
static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
pfn_t pfn, unsigned access, int *ret_val)
{
bool ret = true;
/* The pfn is invalid, report the error! */
- if (unlikely(is_invalid_pfn(pfn))) {
+ if (unlikely(is_error_pfn(pfn))) {
*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
goto exit;
}
@@ -2862,7 +2893,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
return r;
spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
if (likely(!force_pt_level))
@@ -3331,7 +3362,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
return r;
spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
if (likely(!force_pt_level))
@@ -3399,14 +3430,6 @@ static void paging_free(struct kvm_vcpu *vcpu)
nonpaging_free(vcpu);
}
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
-{
- int bit7;
-
- bit7 = (gpte >> 7) & 1;
- return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
-}
-
static inline void protect_clean_gpte(unsigned *access, unsigned gpte)
{
unsigned mask;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 714e2c01a6fe..891eb6d93b8b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -305,51 +305,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
addr, access);
}
-static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- pt_element_t gpte)
+static bool
+FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, pt_element_t gpte, bool no_dirty_log)
{
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
- goto no_present;
-
- if (!is_present_gpte(gpte))
- goto no_present;
-
- if (!(gpte & PT_ACCESSED_MASK))
- goto no_present;
-
- return false;
-
-no_present:
- drop_spte(vcpu->kvm, spte);
- return true;
-}
-
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte)
-{
- pt_element_t gpte;
unsigned pte_access;
+ gfn_t gfn;
pfn_t pfn;
- gpte = *(const pt_element_t *)pte;
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
- return;
+ if (prefetch_invalid_gpte(vcpu, sp, spte, gpte))
+ return false;
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
+
+ gfn = gpte_to_gfn(gpte);
pte_access = sp->role.access & gpte_access(vcpu, gpte);
protect_clean_gpte(&pte_access, gpte);
- pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
- if (mmu_invalid_pfn(pfn))
- return;
+ pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+ no_dirty_log && (pte_access & ACC_WRITE_MASK));
+ if (is_error_pfn(pfn))
+ return false;
/*
- * we call mmu_set_spte() with host_writable = true because that
- * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
+ * we call mmu_set_spte() with host_writable = true because
+ * pte_prefetch_gfn_to_pfn always gets a writable pfn.
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- NULL, PT_PAGE_TABLE_LEVEL,
- gpte_to_gfn(gpte), pfn, true, true);
+ NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true);
+
+ return true;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, const void *pte)
+{
+ pt_element_t gpte = *(const pt_element_t *)pte;
+
+ FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
}
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
@@ -395,53 +387,34 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
spte = sp->spt + i;
for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
- pt_element_t gpte;
- unsigned pte_access;
- gfn_t gfn;
- pfn_t pfn;
-
if (spte == sptep)
continue;
if (is_shadow_present_pte(*spte))
continue;
- gpte = gptep[i];
-
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
- continue;
-
- pte_access = sp->role.access & gpte_access(vcpu, gpte);
- protect_clean_gpte(&pte_access, gpte);
- gfn = gpte_to_gfn(gpte);
- pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
- pte_access & ACC_WRITE_MASK);
- if (mmu_invalid_pfn(pfn))
+ if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
break;
-
- mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- NULL, PT_PAGE_TABLE_LEVEL, gfn,
- pfn, true, true);
}
}
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
+ * If the guest tries to write a write-protected page, we need to
+ * emulate this operation, return 1 to indicate this case.
*/
-static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
- int *emulate, pfn_t pfn, bool map_writable,
- bool prefault)
+ pfn_t pfn, bool map_writable, bool prefault)
{
- unsigned access = gw->pt_access;
struct kvm_mmu_page *sp = NULL;
- int top_level;
- unsigned direct_access;
struct kvm_shadow_walk_iterator it;
+ unsigned direct_access, access = gw->pt_access;
+ int top_level, emulate = 0;
if (!is_present_gpte(gw->ptes[gw->level - 1]))
- return NULL;
+ return 0;
direct_access = gw->pte_access;
@@ -505,17 +478,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
clear_sp_write_flooding_count(it.sptep);
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
- user_fault, write_fault, emulate, it.level,
+ user_fault, write_fault, &emulate, it.level,
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
- return it.sptep;
+ return emulate;
out_gpte_changed:
if (sp)
kvm_mmu_put_page(sp, it.sptep);
kvm_release_pfn_clean(pfn);
- return NULL;
+ return 0;
}
/*
@@ -538,8 +511,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int write_fault = error_code & PFERR_WRITE_MASK;
int user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
- u64 *sptep;
- int emulate = 0;
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
@@ -594,24 +565,20 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return r;
spin_lock(&vcpu->kvm->mmu_lock);
- if (mmu_notifier_retry(vcpu, mmu_seq))
+ if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
- sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &emulate, pfn, map_writable, prefault);
- (void)sptep;
- pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
- sptep, *sptep, emulate);
-
+ r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
+ level, pfn, map_writable, prefault);
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
spin_unlock(&vcpu->kvm->mmu_lock);
- return emulate;
+ return r;
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -757,7 +724,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sizeof(pt_element_t)))
return -EINVAL;
- if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+ if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
continue;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d017df3899ef..d29d3cd1c156 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -20,6 +20,7 @@
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
+#include "cpuid.h"
#include <linux/module.h>
#include <linux/mod_devicetable.h>
@@ -630,15 +631,12 @@ static int svm_hardware_enable(void *garbage)
return -EBUSY;
if (!has_svm()) {
- printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
- me);
+ pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
return -EINVAL;
}
sd = per_cpu(svm_data, me);
-
if (!sd) {
- printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
- me);
+ pr_err("%s: svm_data is NULL on %d\n", __func__, me);
return -EINVAL;
}
@@ -1012,6 +1010,13 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
svm->tsc_ratio = ratio;
}
+static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ return svm->vmcb->control.tsc_offset;
+}
+
static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1189,6 +1194,8 @@ static void init_vmcb(struct vcpu_svm *svm)
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u32 dummy;
+ u32 eax = 1;
init_vmcb(svm);
@@ -1197,8 +1204,9 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
}
- vcpu->arch.regs_avail = ~0;
- vcpu->arch.regs_dirty = ~0;
+
+ kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
+ kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
return 0;
}
@@ -1254,11 +1262,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
init_vmcb(svm);
- kvm_write_tsc(&svm->vcpu, 0);
-
- err = fx_init(&svm->vcpu);
- if (err)
- goto free_page4;
svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (kvm_vcpu_is_bsp(&svm->vcpu))
@@ -1268,8 +1271,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
return &svm->vcpu;
-free_page4:
- __free_page(hsave_page);
free_page3:
__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
free_page2:
@@ -3008,11 +3009,11 @@ static int cr8_write_interception(struct vcpu_svm *svm)
return 0;
}
-u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu)
+u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
return vmcb->control.tsc_offset +
- svm_scale_tsc(vcpu, native_read_tsc());
+ svm_scale_tsc(vcpu, host_tsc);
}
static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
@@ -3131,13 +3132,15 @@ static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
+static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ u32 ecx = msr->index;
+ u64 data = msr->data;
switch (ecx) {
case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, data);
+ kvm_write_tsc(vcpu, msr);
break;
case MSR_STAR:
svm->vmcb->save.star = data;
@@ -3192,20 +3195,24 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
default:
- return kvm_set_msr_common(vcpu, ecx, data);
+ return kvm_set_msr_common(vcpu, msr);
}
return 0;
}
static int wrmsr_interception(struct vcpu_svm *svm)
{
+ struct msr_data msr;
u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+ msr.data = data;
+ msr.index = ecx;
+ msr.host_initiated = false;
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (svm_set_msr(&svm->vcpu, ecx, data)) {
+ if (svm_set_msr(&svm->vcpu, &msr)) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(&svm->vcpu, 0);
} else {
@@ -4302,6 +4309,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.has_wbinvd_exit = svm_has_wbinvd_exit,
.set_tsc_khz = svm_set_tsc_khz,
+ .read_tsc_offset = svm_read_tsc_offset,
.write_tsc_offset = svm_write_tsc_offset,
.adjust_tsc_offset = svm_adjust_tsc_offset,
.compute_tsc_offset = svm_compute_tsc_offset,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index bca63f04dccb..fe5e00ed7036 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -4,6 +4,7 @@
#include <linux/tracepoint.h>
#include <asm/vmx.h>
#include <asm/svm.h>
+#include <asm/clocksource.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
@@ -754,6 +755,68 @@ TRACE_EVENT(
__entry->write ? "Write" : "Read",
__entry->gpa_match ? "GPA" : "GVA")
);
+
+#ifdef CONFIG_X86_64
+
+#define host_clocks \
+ {VCLOCK_NONE, "none"}, \
+ {VCLOCK_TSC, "tsc"}, \
+ {VCLOCK_HPET, "hpet"} \
+
+TRACE_EVENT(kvm_update_master_clock,
+ TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched),
+ TP_ARGS(use_master_clock, host_clock, offset_matched),
+
+ TP_STRUCT__entry(
+ __field( bool, use_master_clock )
+ __field( unsigned int, host_clock )
+ __field( bool, offset_matched )
+ ),
+
+ TP_fast_assign(
+ __entry->use_master_clock = use_master_clock;
+ __entry->host_clock = host_clock;
+ __entry->offset_matched = offset_matched;
+ ),
+
+ TP_printk("masterclock %d hostclock %s offsetmatched %u",
+ __entry->use_master_clock,
+ __print_symbolic(__entry->host_clock, host_clocks),
+ __entry->offset_matched)
+);
+
+TRACE_EVENT(kvm_track_tsc,
+ TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched,
+ unsigned int online_vcpus, bool use_master_clock,
+ unsigned int host_clock),
+ TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock,
+ host_clock),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( unsigned int, nr_vcpus_matched_tsc )
+ __field( unsigned int, online_vcpus )
+ __field( bool, use_master_clock )
+ __field( unsigned int, host_clock )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->nr_vcpus_matched_tsc = nr_matched;
+ __entry->online_vcpus = online_vcpus;
+ __entry->use_master_clock = use_master_clock;
+ __entry->host_clock = host_clock;
+ ),
+
+ TP_printk("vcpu_id %u masterclock %u offsetmatched %u nr_online %u"
+ " hostclock %s",
+ __entry->vcpu_id, __entry->use_master_clock,
+ __entry->nr_vcpus_matched_tsc, __entry->online_vcpus,
+ __print_symbolic(__entry->host_clock, host_clocks))
+);
+
+#endif /* CONFIG_X86_64 */
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f85815945fc6..9120ae1901e4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -42,6 +42,7 @@
#include <asm/i387.h>
#include <asm/xcr.h>
#include <asm/perf_event.h>
+#include <asm/kexec.h>
#include "trace.h"
@@ -802,11 +803,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void)
return vmx_capability.ept & VMX_EPT_AD_BIT;
}
-static inline bool cpu_has_vmx_invept_individual_addr(void)
-{
- return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
-}
-
static inline bool cpu_has_vmx_invept_context(void)
{
return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
@@ -992,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs)
vmcs, phys_addr);
}
+#ifdef CONFIG_KEXEC
+/*
+ * This bitmap is used to indicate whether the vmclear
+ * operation is enabled on all cpus. All disabled by
+ * default.
+ */
+static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
+
+static inline void crash_enable_local_vmclear(int cpu)
+{
+ cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline void crash_disable_local_vmclear(int cpu)
+{
+ cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline int crash_local_vmclear_enabled(int cpu)
+{
+ return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static void crash_vmclear_local_loaded_vmcss(void)
+{
+ int cpu = raw_smp_processor_id();
+ struct loaded_vmcs *v;
+
+ if (!crash_local_vmclear_enabled(cpu))
+ return;
+
+ list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+ loaded_vmcss_on_cpu_link)
+ vmcs_clear(v->vmcs);
+}
+#else
+static inline void crash_enable_local_vmclear(int cpu) { }
+static inline void crash_disable_local_vmclear(int cpu) { }
+#endif /* CONFIG_KEXEC */
+
static void __loaded_vmcs_clear(void *arg)
{
struct loaded_vmcs *loaded_vmcs = arg;
@@ -1001,15 +1037,28 @@ static void __loaded_vmcs_clear(void *arg)
return; /* vcpu migration can race with cpu offline */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
+ crash_disable_local_vmclear(cpu);
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+
+ /*
+ * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
+ * is before setting loaded_vmcs->vcpu to -1 which is done in
+ * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
+ * then adds the vmcs into percpu list before it is deleted.
+ */
+ smp_wmb();
+
loaded_vmcs_init(loaded_vmcs);
+ crash_enable_local_vmclear(cpu);
}
static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
{
- if (loaded_vmcs->cpu != -1)
- smp_call_function_single(
- loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
+ int cpu = loaded_vmcs->cpu;
+
+ if (cpu != -1)
+ smp_call_function_single(cpu,
+ __loaded_vmcs_clear, loaded_vmcs, 1);
}
static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -1051,17 +1100,6 @@ static inline void ept_sync_context(u64 eptp)
}
}
-static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
-{
- if (enable_ept) {
- if (cpu_has_vmx_invept_individual_addr())
- __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
- eptp, gpa);
- else
- ept_sync_context(eptp);
- }
-}
-
static __always_inline unsigned long vmcs_readl(unsigned long field)
{
unsigned long value;
@@ -1535,8 +1573,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
local_irq_disable();
+ crash_disable_local_vmclear(cpu);
+
+ /*
+ * Read loaded_vmcs->cpu should be before fetching
+ * loaded_vmcs->loaded_vmcss_on_cpu_link.
+ * See the comments in __loaded_vmcs_clear().
+ */
+ smp_rmb();
+
list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
&per_cpu(loaded_vmcss_on_cpu, cpu));
+ crash_enable_local_vmclear(cpu);
local_irq_enable();
/*
@@ -1839,11 +1887,10 @@ static u64 guest_read_tsc(void)
* Like guest_read_tsc, but always returns L1's notion of the timestamp
* counter, even if a nested guest (L2) is currently running.
*/
-u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
+u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
- u64 host_tsc, tsc_offset;
+ u64 tsc_offset;
- rdtscll(host_tsc);
tsc_offset = is_guest_mode(vcpu) ?
to_vmx(vcpu)->nested.vmcs01_tsc_offset :
vmcs_read64(TSC_OFFSET);
@@ -1866,6 +1913,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
WARN(1, "user requested TSC rate below hardware speed\n");
}
+static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
+{
+ return vmcs_read64(TSC_OFFSET);
+}
+
/*
* writes 'offset' into guest's timestamp counter offset register
*/
@@ -2202,15 +2254,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct shared_msr_entry *msr;
int ret = 0;
+ u32 msr_index = msr_info->index;
+ u64 data = msr_info->data;
switch (msr_index) {
case MSR_EFER:
- ret = kvm_set_msr_common(vcpu, msr_index, data);
+ ret = kvm_set_msr_common(vcpu, msr_info);
break;
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
@@ -2236,7 +2290,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, data);
+ kvm_write_tsc(vcpu, msr_info);
break;
case MSR_IA32_CR_PAT:
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -2244,7 +2298,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
vcpu->arch.pat = data;
break;
}
- ret = kvm_set_msr_common(vcpu, msr_index, data);
+ ret = kvm_set_msr_common(vcpu, msr_info);
+ break;
+ case MSR_IA32_TSC_ADJUST:
+ ret = kvm_set_msr_common(vcpu, msr_info);
break;
case MSR_TSC_AUX:
if (!vmx->rdtscp_enabled)
@@ -2267,7 +2324,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
}
break;
}
- ret = kvm_set_msr_common(vcpu, msr_index, data);
+ ret = kvm_set_msr_common(vcpu, msr_info);
}
return ret;
@@ -2341,6 +2398,18 @@ static int hardware_enable(void *garbage)
return -EBUSY;
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+ /*
+ * Now we can enable the vmclear operation in kdump
+ * since the loaded_vmcss_on_cpu list on this cpu
+ * has been initialized.
+ *
+ * Though the cpu is not in VMX operation now, there
+ * is no problem to enable the vmclear operation
+ * for the loaded_vmcss_on_cpu list is empty!
+ */
+ crash_enable_local_vmclear(cpu);
+
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
test_bits = FEATURE_CONTROL_LOCKED;
@@ -2697,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment
if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
tmp.base = vmcs_readl(sf->base);
tmp.selector = vmcs_read16(sf->selector);
+ tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
tmp.s = 1;
}
vmx_set_segment(vcpu, &tmp, seg);
@@ -3246,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
* unrestricted guest like Westmere to older host that don't have
* unrestricted guest like Nehelem.
*/
- if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
switch (seg) {
case VCPU_SREG_CS:
vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
@@ -3897,8 +3967,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
set_cr4_guest_host_mask(vmx);
- kvm_write_tsc(&vmx->vcpu, 0);
-
return 0;
}
@@ -3908,8 +3976,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
u64 msr;
int ret;
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
-
vmx->rmode.vm86_active = 0;
vmx->soft_vnmi_blocked = 0;
@@ -3921,10 +3987,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
msr |= MSR_IA32_APICBASE_BSP;
kvm_set_apic_base(&vmx->vcpu, msr);
- ret = fx_init(&vmx->vcpu);
- if (ret != 0)
- goto out;
-
vmx_segment_cache_clear(vmx);
seg_setup(VCPU_SREG_CS);
@@ -3965,7 +4027,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_rip_write(vcpu, 0xfff0);
else
kvm_rip_write(vcpu, 0);
- kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
vmcs_writel(GUEST_GDTR_BASE, 0);
vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4015,7 +4076,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
/* HACK: Don't enable emulation on guest boot/reset */
vmx->emulation_required = 0;
-out:
return ret;
}
@@ -4287,16 +4347,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (is_machine_check(intr_info))
return handle_machine_check(vcpu);
- if ((vect_info & VECTORING_INFO_VALID_MASK) &&
- !is_page_fault(intr_info)) {
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
- vcpu->run->internal.ndata = 2;
- vcpu->run->internal.data[0] = vect_info;
- vcpu->run->internal.data[1] = intr_info;
- return 0;
- }
-
if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
return 1; /* already handled by vmx_vcpu_run() */
@@ -4315,6 +4365,22 @@ static int handle_exception(struct kvm_vcpu *vcpu)
error_code = 0;
if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
+ /*
+ * The #PF with PFEC.RSVD = 1 indicates the guest is accessing
+ * MMIO, it is better to report an internal error.
+ * See the comments in vmx_handle_exit.
+ */
+ if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+ !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = vect_info;
+ vcpu->run->internal.data[1] = intr_info;
+ return 0;
+ }
+
if (is_page_fault(intr_info)) {
/* EPT won't cause page fault directly */
BUG_ON(enable_ept);
@@ -4626,11 +4692,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
static int handle_wrmsr(struct kvm_vcpu *vcpu)
{
+ struct msr_data msr;
u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
- if (vmx_set_msr(vcpu, ecx, data) != 0) {
+ msr.data = data;
+ msr.index = ecx;
+ msr.host_initiated = false;
+ if (vmx_set_msr(vcpu, &msr) != 0) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
@@ -4827,11 +4897,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- if (exit_qualification & (1 << 6)) {
- printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
- return -EINVAL;
- }
-
gla_validity = (exit_qualification >> 7) & 0x3;
if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
@@ -5979,13 +6044,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
return 0;
}
+ /*
+ * Note:
+ * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
+ * delivery event since it indicates guest is accessing MMIO.
+ * The vm-exit can be triggered again after return to guest that
+ * will cause infinite loop.
+ */
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
exit_reason != EXIT_REASON_EPT_VIOLATION &&
- exit_reason != EXIT_REASON_TASK_SWITCH))
- printk(KERN_WARNING "%s: unexpected, valid vectoring info "
- "(0x%x) and exit reason is 0x%x\n",
- __func__, vectoring_info, exit_reason);
+ exit_reason != EXIT_REASON_TASK_SWITCH)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = vectoring_info;
+ vcpu->run->internal.data[1] = exit_reason;
+ return 0;
+ }
if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
!(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
@@ -7309,6 +7385,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
.set_tsc_khz = vmx_set_tsc_khz,
+ .read_tsc_offset = vmx_read_tsc_offset,
.write_tsc_offset = vmx_write_tsc_offset,
.adjust_tsc_offset = vmx_adjust_tsc_offset,
.compute_tsc_offset = vmx_compute_tsc_offset,
@@ -7367,6 +7444,11 @@ static int __init vmx_init(void)
if (r)
goto out3;
+#ifdef CONFIG_KEXEC
+ rcu_assign_pointer(crash_vmclear_loaded_vmcss,
+ crash_vmclear_local_loaded_vmcss);
+#endif
+
vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -7404,6 +7486,11 @@ static void __exit vmx_exit(void)
free_page((unsigned long)vmx_io_bitmap_b);
free_page((unsigned long)vmx_io_bitmap_a);
+#ifdef CONFIG_KEXEC
+ rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
+ synchronize_rcu();
+#endif
+
kvm_exit();
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4f7641756be2..76f54461f7cb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,8 @@
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/pci.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/pvclock_gtod.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
u64 __read_mostly host_xcr0;
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
@@ -633,7 +637,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
if (is_long_mode(vcpu)) {
- if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
return 1;
} else
@@ -827,6 +831,7 @@ static u32 msrs_to_save[] = {
static unsigned num_msrs_to_save;
static const u32 emulated_msrs[] = {
+ MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
@@ -886,9 +891,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
- return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+ return kvm_x86_ops->set_msr(vcpu, msr);
}
/*
@@ -896,9 +901,63 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
*/
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- return kvm_set_msr(vcpu, index, *data);
+ struct msr_data msr;
+
+ msr.data = *data;
+ msr.index = index;
+ msr.host_initiated = true;
+ return kvm_set_msr(vcpu, &msr);
}
+#ifdef CONFIG_X86_64
+struct pvclock_gtod_data {
+ seqcount_t seq;
+
+ struct { /* extract of a clocksource struct */
+ int vclock_mode;
+ cycle_t cycle_last;
+ cycle_t mask;
+ u32 mult;
+ u32 shift;
+ } clock;
+
+ /* open coded 'struct timespec' */
+ u64 monotonic_time_snsec;
+ time_t monotonic_time_sec;
+};
+
+static struct pvclock_gtod_data pvclock_gtod_data;
+
+static void update_pvclock_gtod(struct timekeeper *tk)
+{
+ struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+
+ write_seqcount_begin(&vdata->seq);
+
+ /* copy pvclock gtod data */
+ vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
+ vdata->clock.cycle_last = tk->clock->cycle_last;
+ vdata->clock.mask = tk->clock->mask;
+ vdata->clock.mult = tk->mult;
+ vdata->clock.shift = tk->shift;
+
+ vdata->monotonic_time_sec = tk->xtime_sec
+ + tk->wall_to_monotonic.tv_sec;
+ vdata->monotonic_time_snsec = tk->xtime_nsec
+ + (tk->wall_to_monotonic.tv_nsec
+ << tk->shift);
+ while (vdata->monotonic_time_snsec >=
+ (((u64)NSEC_PER_SEC) << tk->shift)) {
+ vdata->monotonic_time_snsec -=
+ ((u64)NSEC_PER_SEC) << tk->shift;
+ vdata->monotonic_time_sec++;
+ }
+
+ write_seqcount_end(&vdata->seq);
+}
+#endif
+
+
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
int version;
@@ -995,6 +1054,10 @@ static inline u64 get_kernel_ns(void)
return timespec_to_ns(&ts);
}
+#ifdef CONFIG_X86_64
+static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
+
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
unsigned long max_tsc_khz;
@@ -1046,12 +1109,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
return tsc;
}
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
+void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ bool vcpus_matched;
+ bool do_request = false;
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+ vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&vcpu->kvm->online_vcpus));
+
+ if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
+ if (!ka->use_master_clock)
+ do_request = 1;
+
+ if (!vcpus_matched && ka->use_master_clock)
+ do_request = 1;
+
+ if (do_request)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+ trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
+ atomic_read(&vcpu->kvm->online_vcpus),
+ ka->use_master_clock, gtod->clock.vclock_mode);
+#endif
+}
+
+static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
+{
+ u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+ vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
+}
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
s64 usdiff;
+ bool matched;
+ u64 data = msr->data;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
@@ -1094,6 +1192,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
+ matched = true;
} else {
/*
* We split periods of matched TSC writes into generations.
@@ -1108,6 +1207,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
kvm->arch.cur_tsc_nsec = ns;
kvm->arch.cur_tsc_write = data;
kvm->arch.cur_tsc_offset = offset;
+ matched = false;
pr_debug("kvm: new tsc generation %u, clock %llu\n",
kvm->arch.cur_tsc_generation, data);
}
@@ -1129,26 +1229,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+ if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
+ update_ia32_tsc_adjust_msr(vcpu, offset);
kvm_x86_ops->write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+ if (matched)
+ kvm->arch.nr_vcpus_matched_tsc++;
+ else
+ kvm->arch.nr_vcpus_matched_tsc = 0;
+
+ kvm_track_tsc_matching(vcpu);
+ spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
}
EXPORT_SYMBOL_GPL(kvm_write_tsc);
+#ifdef CONFIG_X86_64
+
+static cycle_t read_tsc(void)
+{
+ cycle_t ret;
+ u64 last;
+
+ /*
+ * Empirically, a fence (of type that depends on the CPU)
+ * before rdtsc is enough to ensure that rdtsc is ordered
+ * with respect to loads. The various CPU manuals are unclear
+ * as to whether rdtsc can be reordered with later loads,
+ * but no one has ever seen it happen.
+ */
+ rdtsc_barrier();
+ ret = (cycle_t)vget_cycles();
+
+ last = pvclock_gtod_data.clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a funciton of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
+}
+
+static inline u64 vgettsc(cycle_t *cycle_now)
+{
+ long v;
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+ *cycle_now = read_tsc();
+
+ v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
+ return v * gtod->clock.mult;
+}
+
+static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+{
+ unsigned long seq;
+ u64 ns;
+ int mode;
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+ ts->tv_nsec = 0;
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ mode = gtod->clock.vclock_mode;
+ ts->tv_sec = gtod->monotonic_time_sec;
+ ns = gtod->monotonic_time_snsec;
+ ns += vgettsc(cycle_now);
+ ns >>= gtod->clock.shift;
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ timespec_add_ns(ts, ns);
+
+ return mode;
+}
+
+/* returns true if host is using tsc clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
+{
+ struct timespec ts;
+
+ /* checked again under seqlock below */
+ if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+ return false;
+
+ if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
+ return false;
+
+ monotonic_to_bootbased(&ts);
+ *kernel_ns = timespec_to_ns(&ts);
+
+ return true;
+}
+#endif
+
+/*
+ *
+ * Assuming a stable TSC across physical CPUS, and a stable TSC
+ * across virtual CPUs, the following condition is possible.
+ * Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ * VCPU0 on CPU0 | VCPU1 on CPU1
+ *
+ * 1. read timespec0,tsc0
+ * 2. | timespec1 = timespec0 + N
+ * | tsc1 = tsc0 + M
+ * 3. transition to guest | transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0) |
+ * 5. | ret1 = timespec1 + (rdtsc - tsc1)
+ * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ * - ret0 < ret1
+ * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ * ...
+ * - 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller then the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
+ *
+ */
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ struct kvm_arch *ka = &kvm->arch;
+ int vclock_mode;
+ bool host_tsc_clocksource, vcpus_matched;
+
+ vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&kvm->online_vcpus));
+
+ /*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ host_tsc_clocksource = kvm_get_time_and_clockread(
+ &ka->master_kernel_ns,
+ &ka->master_cycle_now);
+
+ ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
+
+ if (ka->use_master_clock)
+ atomic_set(&kvm_guest_has_master_clock, 1);
+
+ vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+ trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
+ vcpus_matched);
+#endif
+}
+
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
- unsigned long flags;
+ unsigned long flags, this_tsc_khz;
struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct kvm_arch *ka = &v->kvm->arch;
void *shared_kaddr;
- unsigned long this_tsc_khz;
s64 kernel_ns, max_kernel_ns;
- u64 tsc_timestamp;
+ u64 tsc_timestamp, host_tsc;
+ struct pvclock_vcpu_time_info *guest_hv_clock;
u8 pvclock_flags;
+ bool use_master_clock;
+
+ kernel_ns = 0;
+ host_tsc = 0;
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
- kernel_ns = get_kernel_ns();
this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
if (unlikely(this_tsc_khz == 0)) {
local_irq_restore(flags);
@@ -1157,6 +1426,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
}
/*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ use_master_clock = ka->use_master_clock;
+ if (use_master_clock) {
+ host_tsc = ka->master_cycle_now;
+ kernel_ns = ka->master_kernel_ns;
+ }
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+ if (!use_master_clock) {
+ host_tsc = native_read_tsc();
+ kernel_ns = get_kernel_ns();
+ }
+
+ tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+
+ /*
* We may have to catch up the TSC to match elapsed wall clock
* time for two reasons, even if kvmclock is used.
* 1) CPU could have been running below the maximum TSC rate
@@ -1217,23 +1504,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hw_tsc_khz = this_tsc_khz;
}
- if (max_kernel_ns > kernel_ns)
- kernel_ns = max_kernel_ns;
-
+ /* with a master <monotonic time, tsc value> tuple,
+ * pvclock clock reads always increase at the (scaled) rate
+ * of guest TSC - no need to deal with sampling errors.
+ */
+ if (!use_master_clock) {
+ if (max_kernel_ns > kernel_ns)
+ kernel_ns = max_kernel_ns;
+ }
/* With all the info we got, fill in the values */
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_kernel_ns = kernel_ns;
vcpu->last_guest_tsc = tsc_timestamp;
- pvclock_flags = 0;
- if (vcpu->pvclock_set_guest_stopped_request) {
- pvclock_flags |= PVCLOCK_GUEST_STOPPED;
- vcpu->pvclock_set_guest_stopped_request = false;
- }
-
- vcpu->hv_clock.flags = pvclock_flags;
-
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
@@ -1243,6 +1527,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
shared_kaddr = kmap_atomic(vcpu->time_page);
+ guest_hv_clock = shared_kaddr + vcpu->time_offset;
+
+ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+ pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
+
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ /* If the host uses TSC clocksource, then it is stable */
+ if (use_master_clock)
+ pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
+ vcpu->hv_clock.flags = pvclock_flags;
+
memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
sizeof(vcpu->hv_clock));
@@ -1572,9 +1872,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
}
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
bool pr = false;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
switch (msr) {
case MSR_EFER:
@@ -1625,6 +1927,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_IA32_TSCDEADLINE:
kvm_set_lapic_tscdeadline_msr(vcpu, data);
break;
+ case MSR_IA32_TSC_ADJUST:
+ if (guest_cpuid_has_tsc_adjust(vcpu)) {
+ if (!msr_info->host_initiated) {
+ u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+ kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
+ }
+ vcpu->arch.ia32_tsc_adjust_msr = data;
+ }
+ break;
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
@@ -1984,6 +2295,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_TSCDEADLINE:
data = kvm_get_lapic_tscdeadline_msr(vcpu);
break;
+ case MSR_IA32_TSC_ADJUST:
+ data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+ break;
case MSR_IA32_MISC_ENABLE:
data = vcpu->arch.ia32_misc_enable_msr;
break;
@@ -2342,7 +2656,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_x86_ops->write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ /*
+ * On a host with synchronized TSC, there is no need to update
+ * kvmclock on vcpu->cpu migration
+ */
+ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != cpu)
kvm_migrate_timers(vcpu);
vcpu->cpu = cpu;
@@ -2691,15 +3010,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!vcpu->arch.apic)
goto out;
u.lapic = memdup_user(argp, sizeof(*u.lapic));
- if (IS_ERR(u.lapic)) {
- r = PTR_ERR(u.lapic);
- goto out;
- }
+ if (IS_ERR(u.lapic))
+ return PTR_ERR(u.lapic);
r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_INTERRUPT: {
@@ -2709,16 +3023,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&irq, argp, sizeof irq))
goto out;
r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_NMI: {
r = kvm_vcpu_ioctl_nmi(vcpu);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_SET_CPUID: {
@@ -2729,8 +3037,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
goto out;
r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
- if (r)
- goto out;
break;
}
case KVM_SET_CPUID2: {
@@ -2742,8 +3048,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
goto out;
r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
cpuid_arg->entries);
- if (r)
- goto out;
break;
}
case KVM_GET_CPUID2: {
@@ -2875,10 +3179,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_SET_XSAVE: {
u.xsave = memdup_user(argp, sizeof(*u.xsave));
- if (IS_ERR(u.xsave)) {
- r = PTR_ERR(u.xsave);
- goto out;
- }
+ if (IS_ERR(u.xsave))
+ return PTR_ERR(u.xsave);
r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
break;
@@ -2900,10 +3202,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_SET_XCRS: {
u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
- if (IS_ERR(u.xcrs)) {
- r = PTR_ERR(u.xcrs);
- goto out;
- }
+ if (IS_ERR(u.xcrs))
+ return PTR_ERR(u.xcrs);
r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
break;
@@ -2951,7 +3251,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
int ret;
if (addr > (unsigned int)(-3 * PAGE_SIZE))
- return -1;
+ return -EINVAL;
ret = kvm_x86_ops->set_tss_addr(kvm, addr);
return ret;
}
@@ -3212,8 +3512,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
switch (ioctl) {
case KVM_SET_TSS_ADDR:
r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
- if (r < 0)
- goto out;
break;
case KVM_SET_IDENTITY_MAP_ADDR: {
u64 ident_addr;
@@ -3222,14 +3520,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
goto out;
r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
- if (r < 0)
- goto out;
break;
}
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
- if (r)
- goto out;
break;
case KVM_GET_NR_MMU_PAGES:
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
@@ -3320,8 +3614,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
get_irqchip_out:
kfree(chip);
- if (r)
- goto out;
break;
}
case KVM_SET_IRQCHIP: {
@@ -3343,8 +3635,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
set_irqchip_out:
kfree(chip);
- if (r)
- goto out;
break;
}
case KVM_GET_PIT: {
@@ -3371,9 +3661,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (!kvm->arch.vpit)
goto out;
r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_GET_PIT2: {
@@ -3397,9 +3684,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (!kvm->arch.vpit)
goto out;
r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_REINJECT_CONTROL: {
@@ -3408,9 +3692,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (copy_from_user(&control, argp, sizeof(control)))
goto out;
r = kvm_vm_ioctl_reinject(kvm, &control);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_XEN_HVM_CONFIG: {
@@ -4273,7 +4554,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 data)
{
- return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+ struct msr_data msr;
+
+ msr.data = data;
+ msr.index = msr_index;
+ msr.host_initiated = false;
+ return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -4495,7 +4781,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
* instruction -> ...
*/
pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
- if (!is_error_pfn(pfn)) {
+ if (!is_error_noslot_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return true;
}
@@ -4881,6 +5167,50 @@ static void kvm_set_mmio_spte_mask(void)
kvm_mmu_set_mmio_spte_mask(mask);
}
+#ifdef CONFIG_X86_64
+static void pvclock_gtod_update_fn(struct work_struct *work)
+{
+ struct kvm *kvm;
+
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ raw_spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
+ atomic_set(&kvm_guest_has_master_clock, 0);
+ raw_spin_unlock(&kvm_lock);
+}
+
+static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
+
+/*
+ * Notification about pvclock gtod data update.
+ */
+static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
+ void *priv)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ struct timekeeper *tk = priv;
+
+ update_pvclock_gtod(tk);
+
+ /* disable master clock if host does not trust, or does not
+ * use, TSC clocksource
+ */
+ if (gtod->clock.vclock_mode != VCLOCK_TSC &&
+ atomic_read(&kvm_guest_has_master_clock) != 0)
+ queue_work(system_long_wq, &pvclock_gtod_work);
+
+ return 0;
+}
+
+static struct notifier_block pvclock_gtod_notifier = {
+ .notifier_call = pvclock_gtod_notify,
+};
+#endif
+
int kvm_arch_init(void *opaque)
{
int r;
@@ -4922,6 +5252,10 @@ int kvm_arch_init(void *opaque)
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_lapic_init();
+#ifdef CONFIG_X86_64
+ pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+#endif
+
return 0;
out:
@@ -4936,6 +5270,9 @@ void kvm_arch_exit(void)
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+#ifdef CONFIG_X86_64
+ pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+#endif
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
}
@@ -5059,7 +5396,7 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
char instruction[3];
@@ -5235,6 +5572,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm_arch *ka = &kvm->arch;
+
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ kvm_make_mclock_inprogress_request(kvm);
+ /* no guest entries from this point */
+ pvclock_update_vm_gtod_copy(kvm);
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+ /* guest entries allowed */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
@@ -5247,6 +5607,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_mmu_unload(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
+ if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+ kvm_gen_update_masterclock(vcpu->kvm);
if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
r = kvm_guest_time_update(vcpu);
if (unlikely(r))
@@ -5362,7 +5724,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+ vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
+ native_read_tsc());
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
@@ -5419,7 +5782,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
pr_debug("vcpu %d received sipi with vector # %x\n",
vcpu->vcpu_id, vcpu->arch.sipi_vector);
kvm_lapic_reset(vcpu);
- r = kvm_arch_vcpu_reset(vcpu);
+ r = kvm_vcpu_reset(vcpu);
if (r)
return r;
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -6047,7 +6410,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
r = vcpu_load(vcpu);
if (r)
return r;
- r = kvm_arch_vcpu_reset(vcpu);
+ r = kvm_vcpu_reset(vcpu);
if (r == 0)
r = kvm_mmu_setup(vcpu);
vcpu_put(vcpu);
@@ -6055,6 +6418,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
return r;
}
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ int r;
+ struct msr_data msr;
+
+ r = vcpu_load(vcpu);
+ if (r)
+ return r;
+ msr.data = 0x0;
+ msr.index = MSR_IA32_TSC;
+ msr.host_initiated = true;
+ kvm_write_tsc(vcpu, &msr);
+ vcpu_put(vcpu);
+
+ return r;
+}
+
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
int r;
@@ -6069,7 +6449,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_x86_ops->vcpu_free(vcpu);
}
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
{
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
@@ -6092,6 +6472,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_pmu_reset(vcpu);
+ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+ vcpu->arch.regs_avail = ~0;
+ vcpu->arch.regs_dirty = ~0;
+
return kvm_x86_ops->vcpu_reset(vcpu);
}
@@ -6168,6 +6552,8 @@ int kvm_arch_hardware_enable(void *garbage)
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu->arch.tsc_offset_adjustment += delta_cyc;
vcpu->arch.last_host_tsc = local_tsc;
+ set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+ &vcpu->requests);
}
/*
@@ -6258,10 +6644,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
goto fail_free_mce_banks;
+ r = fx_init(vcpu);
+ if (r)
+ goto fail_free_wbinvd_dirty_mask;
+
+ vcpu->arch.ia32_tsc_adjust_msr = 0x0;
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
return 0;
+fail_free_wbinvd_dirty_mask:
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
fail_free_mce_banks:
kfree(vcpu->arch.mce_banks);
fail_free_lapic:
@@ -6305,6 +6698,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
+ spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+ pvclock_update_vm_gtod_copy(kvm);
return 0;
}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2b5219c12ac8..e224f7a671b6 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -112,7 +112,7 @@ void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 4df6c373421a..205ad328aa52 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,7 @@
#include <asm/hpet.h>
#include <asm/unistd.h>
#include <asm/io.h>
+#include <asm/pvclock.h>
#define gtod (&VVAR(vsyscall_gtod_data))
@@ -62,6 +63,76 @@ static notrace cycle_t vread_hpet(void)
return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
}
+#ifdef CONFIG_PARAVIRT_CLOCK
+
+static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
+{
+ const struct pvclock_vsyscall_time_info *pvti_base;
+ int idx = cpu / (PAGE_SIZE/PVTI_SIZE);
+ int offset = cpu % (PAGE_SIZE/PVTI_SIZE);
+
+ BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END);
+
+ pvti_base = (struct pvclock_vsyscall_time_info *)
+ __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx);
+
+ return &pvti_base[offset];
+}
+
+static notrace cycle_t vread_pvclock(int *mode)
+{
+ const struct pvclock_vsyscall_time_info *pvti;
+ cycle_t ret;
+ u64 last;
+ u32 version;
+ u32 migrate_count;
+ u8 flags;
+ unsigned cpu, cpu1;
+
+
+ /*
+ * When looping to get a consistent (time-info, tsc) pair, we
+ * also need to deal with the possibility we can switch vcpus,
+ * so make sure we always re-fetch time-info for the current vcpu.
+ */
+ do {
+ cpu = __getcpu() & VGETCPU_CPU_MASK;
+ /* TODO: We can put vcpu id into higher bits of pvti.version.
+ * This will save a couple of cycles by getting rid of
+ * __getcpu() calls (Gleb).
+ */
+
+ pvti = get_pvti(cpu);
+
+ migrate_count = pvti->migrate_count;
+
+ version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
+
+ /*
+ * Test we're still on the cpu as well as the version.
+ * We could have been migrated just after the first
+ * vgetcpu but before fetching the version, so we
+ * wouldn't notice a version change.
+ */
+ cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+ } while (unlikely(cpu != cpu1 ||
+ (pvti->pvti.version & 1) ||
+ pvti->pvti.version != version ||
+ pvti->migrate_count != migrate_count));
+
+ if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
+ *mode = VCLOCK_NONE;
+
+ /* refer to tsc.c read_tsc() comment for rationale */
+ last = VVAR(vsyscall_gtod_data).clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ return last;
+}
+#endif
+
notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
{
long ret;
@@ -80,7 +151,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz)
}
-notrace static inline u64 vgetsns(void)
+notrace static inline u64 vgetsns(int *mode)
{
long v;
cycles_t cycles;
@@ -88,6 +159,10 @@ notrace static inline u64 vgetsns(void)
cycles = vread_tsc();
else if (gtod->clock.vclock_mode == VCLOCK_HPET)
cycles = vread_hpet();
+#ifdef CONFIG_PARAVIRT_CLOCK
+ else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK)
+ cycles = vread_pvclock(mode);
+#endif
else
return 0;
v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask;
@@ -107,7 +182,7 @@ notrace static int __always_inline do_realtime(struct timespec *ts)
mode = gtod->clock.vclock_mode;
ts->tv_sec = gtod->wall_time_sec;
ns = gtod->wall_time_snsec;
- ns += vgetsns();
+ ns += vgetsns(&mode);
ns >>= gtod->clock.shift;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
@@ -127,7 +202,7 @@ notrace static int do_monotonic(struct timespec *ts)
mode = gtod->clock.vclock_mode;
ts->tv_sec = gtod->monotonic_time_sec;
ns = gtod->monotonic_time_snsec;
- ns += vgetsns();
+ ns += vgetsns(&mode);
ns >>= gtod->clock.shift;
} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
timespec_add_ns(ts, ns);
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 5463ad558573..2f94b039e55b 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -17,15 +17,10 @@ __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
{
unsigned int p;
- if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
- /* Load per CPU data from RDTSCP */
- native_read_tscp(&p);
- } else {
- /* Load per CPU data from GDT */
- asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
- }
+ p = __getcpu();
+
if (cpu)
- *cpu = p & 0xfff;
+ *cpu = p & VGETCPU_CPU_MASK;
if (node)
*node = p >> 12;
return 0;