summaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig9
-rw-r--r--arch/x86/kvm/Makefile6
-rw-r--r--arch/x86/kvm/cpuid.c17
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/emulate.c303
-rw-r--r--arch/x86/kvm/ioapic.c9
-rw-r--r--arch/x86/kvm/irq_comm.c14
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h5
-rw-r--r--arch/x86/kvm/lapic.c59
-rw-r--r--arch/x86/kvm/lapic.h15
-rw-r--r--arch/x86/kvm/mmu.c678
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/mmu_audit.c20
-rw-r--r--arch/x86/kvm/mmutrace.h2
-rw-r--r--arch/x86/kvm/mtrr.c699
-rw-r--r--arch/x86/kvm/paging_tmpl.h18
-rw-r--r--arch/x86/kvm/pmu.c553
-rw-r--r--arch/x86/kvm/pmu.h118
-rw-r--r--arch/x86/kvm/pmu_amd.c207
-rw-r--r--arch/x86/kvm/pmu_intel.c358
-rw-r--r--arch/x86/kvm/svm.c118
-rw-r--r--arch/x86/kvm/trace.h22
-rw-r--r--arch/x86/kvm/vmx.c370
-rw-r--r--arch/x86/kvm/x86.c958
-rw-r--r--arch/x86/kvm/x86.h8
25 files changed, 3242 insertions, 1334 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 413a7bf9efbb..d8a1d56276e1 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -86,15 +86,16 @@ config KVM_MMU_AUDIT
auditing of KVM MMU events at runtime.
config KVM_DEVICE_ASSIGNMENT
- bool "KVM legacy PCI device assignment support"
+ bool "KVM legacy PCI device assignment support (DEPRECATED)"
depends on KVM && PCI && IOMMU_API
- default y
+ default n
---help---
Provide support for legacy PCI device assignment through KVM. The
kernel now also supports a full featured userspace device driver
- framework through VFIO, which supersedes much of this support.
+ framework through VFIO, which supersedes this support and provides
+ better security.
- If unsure, say Y.
+ If unsure, say N.
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 16e8f962eaad..67d215cb8953 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -12,10 +12,10 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o ioapic.o irq_comm.o cpuid.o pmu.o
+ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o
kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
-kvm-intel-y += vmx.o
-kvm-amd-y += svm.o
+kvm-intel-y += vmx.o pmu_intel.o
+kvm-amd-y += svm.o pmu_amd.o
obj-$(CONFIG_KVM) += kvm.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 1d08ad3582d0..64dd46793099 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -16,14 +16,14 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
-#include <asm/i387.h> /* For use_eager_fpu. Ugh! */
-#include <asm/fpu-internal.h> /* For use_eager_fpu. Ugh! */
+#include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */
#include <asm/user.h>
-#include <asm/xsave.h>
+#include <asm/fpu/xstate.h>
#include "cpuid.h"
#include "lapic.h"
#include "mmu.h"
#include "trace.h"
+#include "pmu.h"
static u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
@@ -97,7 +97,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- vcpu->arch.eager_fpu = guest_cpuid_has_mpx(vcpu);
+ vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu);
/*
* The existing code assumes virtual address is 48-bit in the canonical
@@ -111,7 +111,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
- kvm_pmu_cpuid_update(vcpu);
+ kvm_pmu_refresh(vcpu);
return 0;
}
@@ -415,6 +415,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
break;
}
+ case 6: /* Thermal management */
+ entry->eax = 0x4; /* allow ARAT */
+ entry->ebx = 0;
+ entry->ecx = 0;
+ entry->edx = 0;
+ break;
case 7: {
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* Mask ebx against host capability word 9 */
@@ -591,7 +597,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
case 3: /* Processor serial number */
case 5: /* MONITOR/MWAIT */
- case 6: /* Thermal management */
case 0xC0000002:
case 0xC0000003:
case 0xC0000004:
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 496b3695d3d3..dd05b9cef6ae 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -70,6 +70,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu)
return best && (best->ebx & bit(X86_FEATURE_FSGSBASE));
}
+static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+ return best && (best->edx & bit(X86_FEATURE_LM));
+}
+
static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 630bcb0d7a04..e7a4fde5d631 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -25,6 +25,7 @@
#include <linux/module.h>
#include <asm/kvm_emulate.h>
#include <linux/stringify.h>
+#include <asm/debugreg.h>
#include "x86.h"
#include "tss.h"
@@ -523,13 +524,9 @@ static void masked_increment(ulong *reg, ulong mask, int inc)
static inline void
register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
{
- ulong mask;
+ ulong *preg = reg_rmw(ctxt, reg);
- if (ctxt->ad_bytes == sizeof(unsigned long))
- mask = ~0UL;
- else
- mask = ad_mask(ctxt);
- masked_increment(reg_rmw(ctxt, reg), mask, inc);
+ assign_register(preg, *preg + inc, ctxt->ad_bytes);
}
static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
@@ -2262,6 +2259,260 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt)
return rc;
}
+static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
+{
+ u32 eax, ebx, ecx, edx;
+
+ eax = 0x80000001;
+ ecx = 0;
+ ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx);
+ return edx & bit(X86_FEATURE_LM);
+}
+
+#define GET_SMSTATE(type, smbase, offset) \
+ ({ \
+ type __val; \
+ int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \
+ sizeof(__val), NULL); \
+ if (r != X86EMUL_CONTINUE) \
+ return X86EMUL_UNHANDLEABLE; \
+ __val; \
+ })
+
+static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags)
+{
+ desc->g = (flags >> 23) & 1;
+ desc->d = (flags >> 22) & 1;
+ desc->l = (flags >> 21) & 1;
+ desc->avl = (flags >> 20) & 1;
+ desc->p = (flags >> 15) & 1;
+ desc->dpl = (flags >> 13) & 3;
+ desc->s = (flags >> 12) & 1;
+ desc->type = (flags >> 8) & 15;
+}
+
+static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+ struct desc_struct desc;
+ int offset;
+ u16 selector;
+
+ selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4);
+
+ if (n < 3)
+ offset = 0x7f84 + n * 12;
+ else
+ offset = 0x7f2c + (n - 3) * 12;
+
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, n);
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
+{
+ struct desc_struct desc;
+ int offset;
+ u16 selector;
+ u32 base3;
+
+ offset = 0x7e00 + n * 16;
+
+ selector = GET_SMSTATE(u16, smbase, offset);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4));
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8));
+ base3 = GET_SMSTATE(u32, smbase, offset + 12);
+
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, n);
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
+ u64 cr0, u64 cr4)
+{
+ int bad;
+
+ /*
+ * First enable PAE, long mode needs it before CR0.PG = 1 is set.
+ * Then enable protected mode. However, PCID cannot be enabled
+ * if EFER.LMA=0, so set it separately.
+ */
+ bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ bad = ctxt->ops->set_cr(ctxt, 0, cr0);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+
+ if (cr4 & X86_CR4_PCIDE) {
+ bad = ctxt->ops->set_cr(ctxt, 4, cr4);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ return X86EMUL_CONTINUE;
+}
+
+static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct desc_struct desc;
+ struct desc_ptr dt;
+ u16 selector;
+ u32 val, cr0, cr4;
+ int i;
+
+ cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);
+ ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
+ ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
+ ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0);
+
+ for (i = 0; i < 8; i++)
+ *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4);
+
+ val = GET_SMSTATE(u32, smbase, 0x7fcc);
+ ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+ val = GET_SMSTATE(u32, smbase, 0x7fc8);
+ ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7fc4);
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7fc0);
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80));
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c));
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78));
+ ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR);
+
+ dt.address = GET_SMSTATE(u32, smbase, 0x7f74);
+ dt.size = GET_SMSTATE(u32, smbase, 0x7f70);
+ ctxt->ops->set_gdt(ctxt, &dt);
+
+ dt.address = GET_SMSTATE(u32, smbase, 0x7f58);
+ dt.size = GET_SMSTATE(u32, smbase, 0x7f54);
+ ctxt->ops->set_idt(ctxt, &dt);
+
+ for (i = 0; i < 6; i++) {
+ int r = rsm_load_seg_32(ctxt, smbase, i);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
+ cr4 = GET_SMSTATE(u32, smbase, 0x7f14);
+
+ ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
+
+ return rsm_enter_protected_mode(ctxt, cr0, cr4);
+}
+
+static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct desc_struct desc;
+ struct desc_ptr dt;
+ u64 val, cr0, cr4;
+ u32 base3;
+ u16 selector;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8);
+
+ ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78);
+ ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED;
+
+ val = GET_SMSTATE(u32, smbase, 0x7f68);
+ ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1);
+ val = GET_SMSTATE(u32, smbase, 0x7f60);
+ ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
+
+ cr0 = GET_SMSTATE(u64, smbase, 0x7f58);
+ ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50));
+ cr4 = GET_SMSTATE(u64, smbase, 0x7f48);
+ ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
+ val = GET_SMSTATE(u64, smbase, 0x7ed0);
+ ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7e90);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94));
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98));
+ base3 = GET_SMSTATE(u32, smbase, 0x7e9c);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR);
+
+ dt.size = GET_SMSTATE(u32, smbase, 0x7e84);
+ dt.address = GET_SMSTATE(u64, smbase, 0x7e88);
+ ctxt->ops->set_idt(ctxt, &dt);
+
+ selector = GET_SMSTATE(u32, smbase, 0x7e70);
+ rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8);
+ set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74));
+ set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78));
+ base3 = GET_SMSTATE(u32, smbase, 0x7e7c);
+ ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR);
+
+ dt.size = GET_SMSTATE(u32, smbase, 0x7e64);
+ dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
+ ctxt->ops->set_gdt(ctxt, &dt);
+
+ for (i = 0; i < 6; i++) {
+ int r = rsm_load_seg_64(ctxt, smbase, i);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
+ return rsm_enter_protected_mode(ctxt, cr0, cr4);
+}
+
+static int em_rsm(struct x86_emulate_ctxt *ctxt)
+{
+ unsigned long cr0, cr4, efer;
+ u64 smbase;
+ int ret;
+
+ if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0)
+ return emulate_ud(ctxt);
+
+ /*
+ * Get back to real mode, to prepare a safe state in which to load
+ * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed
+ * to read_std/write_std are not virtual.
+ *
+ * CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
+ */
+ cr0 = ctxt->ops->get_cr(ctxt, 0);
+ if (cr0 & X86_CR0_PE)
+ ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
+ cr4 = ctxt->ops->get_cr(ctxt, 4);
+ if (cr4 & X86_CR4_PAE)
+ ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+ efer = 0;
+ ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
+
+ smbase = ctxt->ops->get_smbase(ctxt);
+ if (emulator_has_longmode(ctxt))
+ ret = rsm_load_state_64(ctxt, smbase + 0x8000);
+ else
+ ret = rsm_load_state_32(ctxt, smbase + 0x8000);
+
+ if (ret != X86EMUL_CONTINUE) {
+ /* FIXME: should triple fault */
+ return X86EMUL_UNHANDLEABLE;
+ }
+
+ if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0)
+ ctxt->ops->set_nmi_mask(ctxt, false);
+
+ ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK;
+ ctxt->emul_flags &= ~X86EMUL_SMM_MASK;
+ return X86EMUL_CONTINUE;
+}
+
static void
setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
struct desc_struct *cs, struct desc_struct *ss)
@@ -2573,6 +2824,30 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
return true;
}
+static void string_registers_quirk(struct x86_emulate_ctxt *ctxt)
+{
+ /*
+ * Intel CPUs mask the counter and pointers in quite strange
+ * manner when ECX is zero due to REP-string optimizations.
+ */
+#ifdef CONFIG_X86_64
+ if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt))
+ return;
+
+ *reg_write(ctxt, VCPU_REGS_RCX) = 0;
+
+ switch (ctxt->b) {
+ case 0xa4: /* movsb */
+ case 0xa5: /* movsd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1;
+ /* fall through */
+ case 0xaa: /* stosb */
+ case 0xab: /* stosd/w */
+ *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1;
+ }
+#endif
+}
+
static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
struct tss_segment_16 *tss)
{
@@ -2849,7 +3124,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ulong old_tss_base =
ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
u32 desc_limit;
- ulong desc_addr;
+ ulong desc_addr, dr7;
/* FIXME: old_tss_base == ~0 ? */
@@ -2934,6 +3209,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
ret = em_push(ctxt);
}
+ ops->get_dr(ctxt, 7, &dr7);
+ ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN));
+
return ret;
}
@@ -3840,7 +4118,7 @@ static const struct opcode group5[] = {
F(DstMem | SrcNone | Lock, em_inc),
F(DstMem | SrcNone | Lock, em_dec),
I(SrcMem | NearBranch, em_call_near_abs),
- I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
+ I(SrcMemFAddr | ImplicitOps, em_call_far),
I(SrcMem | NearBranch, em_jmp_abs),
I(SrcMemFAddr | ImplicitOps, em_jmp_far),
I(SrcMem | Stack, em_push), D(Undefined),
@@ -4173,7 +4451,7 @@ static const struct opcode twobyte_table[256] = {
F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
/* 0xA8 - 0xAF */
I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
- DI(ImplicitOps, rsm),
+ II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm),
F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
@@ -4871,7 +5149,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
fetch_possible_mmx_operand(ctxt, &ctxt->dst);
}
- if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_PRE_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -4900,7 +5178,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_EXCEPT);
if (rc != X86EMUL_CONTINUE)
@@ -4910,6 +5188,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
if (ctxt->rep_prefix && (ctxt->d & String)) {
/* All REP prefixes have the same first termination condition */
if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
+ string_registers_quirk(ctxt);
ctxt->eip = ctxt->_eip;
ctxt->eflags &= ~X86_EFLAGS_RF;
goto done;
@@ -4953,7 +5232,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
special_insn:
- if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) {
+ if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) {
rc = emulator_check_intercept(ctxt, ctxt->intercept,
X86_ICPT_POST_MEMACCESS);
if (rc != X86EMUL_CONTINUE)
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 28146f03c514..856f79105bb5 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -349,6 +349,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
irqe.delivery_mode = entry->fields.delivery_mode << 8;
irqe.level = 1;
irqe.shorthand = 0;
+ irqe.msi_redir_hint = false;
if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
ioapic->irr_delivered |= 1 << irq;
@@ -637,11 +638,9 @@ void kvm_ioapic_destroy(struct kvm *kvm)
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
cancel_delayed_work_sync(&ioapic->eoi_inject);
- if (ioapic) {
- kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
- kvm->arch.vioapic = NULL;
- kfree(ioapic);
- }
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
}
int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 72298b3ac025..9efff9e5b58c 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -31,6 +31,8 @@
#include "ioapic.h"
+#include "lapic.h"
+
static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level,
bool line_status)
@@ -48,11 +50,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
line_status);
}
-inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
-{
- return irq->delivery_mode == APIC_DM_LOWEST;
-}
-
int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_lapic_irq *irq, unsigned long *dest_map)
{
@@ -60,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
struct kvm_vcpu *vcpu, *lowest = NULL;
if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
- kvm_is_dm_lowest_prio(irq)) {
+ kvm_lowest_prio_delivery(irq)) {
printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
irq->delivery_mode = APIC_DM_FIXED;
}
@@ -76,7 +73,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
irq->dest_id, irq->dest_mode))
continue;
- if (!kvm_is_dm_lowest_prio(irq)) {
+ if (!kvm_lowest_prio_delivery(irq)) {
if (r < 0)
r = 0;
r += kvm_apic_set_irq(vcpu, irq, dest_map);
@@ -106,9 +103,10 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
irq->delivery_mode = e->msi.data & 0x700;
+ irq->msi_redir_hint = ((e->msi.address_lo
+ & MSI_ADDR_REDIRECTION_LOWPRI) > 0);
irq->level = 1;
irq->shorthand = 0;
- /* TODO Deal with RH bit of MSI message address */
}
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 544076c4f44b..e1e89ee4af75 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -99,4 +99,9 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
return vcpu->arch.hflags & HF_GUEST_MASK;
}
+static inline bool is_smm(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_SMM_MASK;
+}
+
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4c7deb4f78a1..36e9de1b4127 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -240,6 +240,15 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
recalculate_apic_map(apic->vcpu->kvm);
}
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id)
+{
+ u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+
+ apic_set_reg(apic, APIC_ID, id << 24);
+ apic_set_reg(apic, APIC_LDR, ldr);
+ recalculate_apic_map(apic->vcpu->kvm);
+}
+
static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type)
{
return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED);
@@ -728,7 +737,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
dst = map->logical_map[cid];
- if (irq->delivery_mode == APIC_DM_LOWEST) {
+ if (kvm_lowest_prio_delivery(irq)) {
int l = -1;
for_each_set_bit(i, &bitmap, 16) {
if (!dst[i])
@@ -799,7 +808,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
break;
case APIC_DM_SMI:
- apic_debug("Ignoring guest SMI\n");
+ result = 1;
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ kvm_vcpu_kick(vcpu);
break;
case APIC_DM_NMI:
@@ -914,9 +925,10 @@ static void apic_send_ipi(struct kvm_lapic *apic)
irq.vector = icr_low & APIC_VECTOR_MASK;
irq.delivery_mode = icr_low & APIC_MODE_MASK;
irq.dest_mode = icr_low & APIC_DEST_MASK;
- irq.level = icr_low & APIC_INT_ASSERT;
+ irq.level = (icr_low & APIC_INT_ASSERT) != 0;
irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
irq.shorthand = icr_low & APIC_SHORT_MASK;
+ irq.msi_redir_hint = false;
if (apic_x2apic_mode(apic))
irq.dest_id = icr_high;
else
@@ -926,10 +938,11 @@ static void apic_send_ipi(struct kvm_lapic *apic)
apic_debug("icr_high 0x%x, icr_low 0x%x, "
"short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
- "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
+ "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, "
+ "msi_redir_hint 0x%x\n",
icr_high, icr_low, irq.shorthand, irq.dest_id,
irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
- irq.vector);
+ irq.vector, irq.msi_redir_hint);
kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL);
}
@@ -1541,9 +1554,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
if ((old_value ^ value) & X2APIC_ENABLE) {
if (value & X2APIC_ENABLE) {
- u32 id = kvm_apic_id(apic);
- u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
- kvm_apic_set_ldr(apic, ldr);
+ kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id);
kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true);
} else
kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false);
@@ -1562,7 +1573,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
}
-void kvm_lapic_reset(struct kvm_vcpu *vcpu)
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct kvm_lapic *apic;
int i;
@@ -1576,19 +1587,22 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
- kvm_apic_set_id(apic, vcpu->vcpu_id);
+ if (!init_event)
+ kvm_apic_set_id(apic, vcpu->vcpu_id);
kvm_apic_set_version(apic->vcpu);
for (i = 0; i < APIC_LVT_NUM; i++)
apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
apic_update_lvtt(apic);
- apic_set_reg(apic, APIC_LVT0,
- SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
+ if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED))
+ apic_set_reg(apic, APIC_LVT0,
+ SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
apic_set_reg(apic, APIC_DFR, 0xffffffffU);
apic_set_spiv(apic, 0xff);
apic_set_reg(apic, APIC_TASKPRI, 0);
- kvm_apic_set_ldr(apic, 0);
+ if (!apic_x2apic_mode(apic))
+ kvm_apic_set_ldr(apic, 0);
apic_set_reg(apic, APIC_ESR, 0);
apic_set_reg(apic, APIC_ICR, 0);
apic_set_reg(apic, APIC_ICR2, 0);
@@ -1717,7 +1731,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
- kvm_lapic_reset(vcpu);
+ kvm_lapic_reset(vcpu, false);
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
return 0;
@@ -2049,11 +2063,22 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
return;
- pe = xchg(&apic->pending_events, 0);
+ /*
+ * INITs are latched while in SMM. Because an SMM CPU cannot
+ * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs
+ * and delay processing of INIT until the next RSM.
+ */
+ if (is_smm(vcpu)) {
+ WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
+ if (test_bit(KVM_APIC_SIPI, &apic->pending_events))
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ return;
+ }
+ pe = xchg(&apic->pending_events, 0);
if (test_bit(KVM_APIC_INIT, &pe)) {
- kvm_lapic_reset(vcpu);
- kvm_vcpu_reset(vcpu);
+ kvm_lapic_reset(vcpu, true);
+ kvm_vcpu_reset(vcpu, true);
if (kvm_vcpu_is_bsp(apic->vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 9d28383fc1e7..f2f4e10ab772 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,7 +48,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
-void kvm_lapic_reset(struct kvm_vcpu *vcpu);
+void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
@@ -150,7 +150,18 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.apic->pending_events;
+ return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events;
+}
+
+static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
+{
+ return (irq->delivery_mode == APIC_DM_LOWEST ||
+ irq->msi_redir_hint);
+}
+
+static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
+{
+ return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b73337634214..f807496b62c2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -223,15 +223,15 @@ static unsigned int get_mmio_spte_generation(u64 spte)
return gen;
}
-static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
{
- return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
+ return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
}
-static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
unsigned access)
{
- unsigned int gen = kvm_current_mmio_generation(kvm);
+ unsigned int gen = kvm_current_mmio_generation(vcpu);
u64 mask = generation_mmio_spte_mask(gen);
access &= ACC_WRITE_MASK | ACC_USER_MASK;
@@ -258,22 +258,22 @@ static unsigned get_mmio_spte_access(u64 spte)
return (spte & ~mask) & ~PAGE_MASK;
}
-static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
pfn_t pfn, unsigned access)
{
if (unlikely(is_noslot_pfn(pfn))) {
- mark_mmio_spte(kvm, sptep, gfn, access);
+ mark_mmio_spte(vcpu, sptep, gfn, access);
return true;
}
return false;
}
-static bool check_mmio_spte(struct kvm *kvm, u64 spte)
+static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
{
unsigned int kvm_gen, spte_gen;
- kvm_gen = kvm_current_mmio_generation(kvm);
+ kvm_gen = kvm_current_mmio_generation(vcpu);
spte_gen = get_mmio_spte_generation(spte);
trace_check_mmio_spte(spte, kvm_gen, spte_gen);
@@ -804,30 +804,36 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
return &slot->arch.lpage_info[level - 2][idx];
}
-static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
+ struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
+ gfn_t gfn;
int i;
- slot = gfn_to_memslot(kvm, gfn);
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
linfo->write_count += 1;
}
kvm->arch.indirect_shadow_pages++;
}
-static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
{
+ struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
+ gfn_t gfn;
int i;
- slot = gfn_to_memslot(kvm, gfn);
- for (i = PT_DIRECTORY_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ gfn = sp->gfn;
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
linfo = lpage_info_slot(gfn, slot, i);
linfo->write_count -= 1;
WARN_ON(linfo->write_count < 0);
@@ -835,14 +841,14 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
kvm->arch.indirect_shadow_pages--;
}
-static int has_wrprotected_page(struct kvm *kvm,
+static int has_wrprotected_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
int level)
{
struct kvm_memory_slot *slot;
struct kvm_lpage_info *linfo;
- slot = gfn_to_memslot(kvm, gfn);
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (slot) {
linfo = lpage_info_slot(gfn, slot, level);
return linfo->write_count;
@@ -858,8 +864,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
page_size = kvm_host_page_size(kvm, gfn);
- for (i = PT_PAGE_TABLE_LEVEL;
- i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
+ for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
if (page_size >= KVM_HPAGE_SIZE(i))
ret = i;
else
@@ -875,7 +880,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
{
struct kvm_memory_slot *slot;
- slot = gfn_to_memslot(vcpu->kvm, gfn);
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
(no_dirty_log && slot->dirty_bitmap))
slot = NULL;
@@ -900,7 +905,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
- if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
+ if (has_wrprotected_page(vcpu, large_gfn, level))
break;
return level - 1;
@@ -1042,12 +1047,14 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level,
/*
* Take gfn and return the reverse mapping to it.
*/
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp)
{
+ struct kvm_memslots *slots;
struct kvm_memory_slot *slot;
- slot = gfn_to_memslot(kvm, gfn);
- return __gfn_to_rmap(gfn, level, slot);
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ return __gfn_to_rmap(gfn, sp->role.level, slot);
}
static bool rmap_can_add(struct kvm_vcpu *vcpu)
@@ -1065,7 +1072,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
sp = page_header(__pa(spte));
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
return pte_list_add(vcpu, spte, rmapp);
}
@@ -1077,7 +1084,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
sp = page_header(__pa(spte));
gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
- rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
+ rmapp = gfn_to_rmap(kvm, gfn, sp);
pte_list_remove(spte, rmapp);
}
@@ -1142,6 +1149,11 @@ static u64 *rmap_get_next(struct rmap_iterator *iter)
return NULL;
}
+#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \
+ for (_spte_ = rmap_get_first(*_rmap_, _iter_); \
+ _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \
+ _spte_ = rmap_get_next(_iter_))
+
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
if (mmu_spte_clear_track_bits(sptep))
@@ -1205,12 +1217,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
struct rmap_iterator iter;
bool flush = false;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
flush |= spte_write_protect(kvm, sptep, pt_protect);
- sptep = rmap_get_next(&iter);
- }
return flush;
}
@@ -1232,12 +1240,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp)
struct rmap_iterator iter;
bool flush = false;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
flush |= spte_clear_dirty(kvm, sptep);
- sptep = rmap_get_next(&iter);
- }
return flush;
}
@@ -1259,12 +1263,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp)
struct rmap_iterator iter;
bool flush = false;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
flush |= spte_set_dirty(kvm, sptep);
- sptep = rmap_get_next(&iter);
- }
return flush;
}
@@ -1342,42 +1342,45 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
}
-static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
+static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
struct kvm_memory_slot *slot;
unsigned long *rmapp;
int i;
bool write_protected = false;
- slot = gfn_to_memslot(kvm, gfn);
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- for (i = PT_PAGE_TABLE_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+ for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
rmapp = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmapp, true);
+ write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true);
}
return write_protected;
}
-static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
- struct kvm_memory_slot *slot, gfn_t gfn, int level,
- unsigned long data)
+static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp)
{
u64 *sptep;
struct rmap_iterator iter;
- int need_tlb_flush = 0;
+ bool flush = false;
while ((sptep = rmap_get_first(*rmapp, &iter))) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n",
- sptep, *sptep, gfn, level);
+ rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
drop_spte(kvm, sptep);
- need_tlb_flush = 1;
+ flush = true;
}
- return need_tlb_flush;
+ return flush;
+}
+
+static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ struct kvm_memory_slot *slot, gfn_t gfn, int level,
+ unsigned long data)
+{
+ return kvm_zap_rmapp(kvm, rmapp);
}
static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
@@ -1394,8 +1397,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
WARN_ON(pte_huge(*ptep));
new_pfn = pte_pfn(*ptep);
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!is_shadow_present_pte(*sptep));
+restart:
+ for_each_rmap_spte(rmapp, &iter, sptep) {
rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
sptep, *sptep, gfn, level);
@@ -1403,7 +1406,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (pte_write(*ptep)) {
drop_spte(kvm, sptep);
- sptep = rmap_get_first(*rmapp, &iter);
+ goto restart;
} else {
new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
new_spte |= (u64)new_pfn << PAGE_SHIFT;
@@ -1414,7 +1417,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
mmu_spte_clear_track_bits(sptep);
mmu_spte_set(sptep, new_spte);
- sptep = rmap_get_next(&iter);
}
}
@@ -1424,6 +1426,74 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
return 0;
}
+struct slot_rmap_walk_iterator {
+ /* input fields. */
+ struct kvm_memory_slot *slot;
+ gfn_t start_gfn;
+ gfn_t end_gfn;
+ int start_level;
+ int end_level;
+
+ /* output fields. */
+ gfn_t gfn;
+ unsigned long *rmap;
+ int level;
+
+ /* private field. */
+ unsigned long *end_rmap;
+};
+
+static void
+rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
+{
+ iterator->level = level;
+ iterator->gfn = iterator->start_gfn;
+ iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
+ iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
+ iterator->slot);
+}
+
+static void
+slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
+ struct kvm_memory_slot *slot, int start_level,
+ int end_level, gfn_t start_gfn, gfn_t end_gfn)
+{
+ iterator->slot = slot;
+ iterator->start_level = start_level;
+ iterator->end_level = end_level;
+ iterator->start_gfn = start_gfn;
+ iterator->end_gfn = end_gfn;
+
+ rmap_walk_init_level(iterator, iterator->start_level);
+}
+
+static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
+{
+ return !!iterator->rmap;
+}
+
+static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
+{
+ if (++iterator->rmap <= iterator->end_rmap) {
+ iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
+ return;
+ }
+
+ if (++iterator->level > iterator->end_level) {
+ iterator->rmap = NULL;
+ return;
+ }
+
+ rmap_walk_init_level(iterator, iterator->level);
+}
+
+#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \
+ _start_gfn, _end_gfn, _iter_) \
+ for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \
+ _end_level_, _start_gfn, _end_gfn); \
+ slot_rmap_walk_okay(_iter_); \
+ slot_rmap_walk_next(_iter_))
+
static int kvm_handle_hva_range(struct kvm *kvm,
unsigned long start,
unsigned long end,
@@ -1435,48 +1505,36 @@ static int kvm_handle_hva_range(struct kvm *kvm,
int level,
unsigned long data))
{
- int j;
- int ret = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
+ struct slot_rmap_walk_iterator iterator;
+ int ret = 0;
+ int i;
- slots = kvm_memslots(kvm);
-
- kvm_for_each_memslot(memslot, slots) {
- unsigned long hva_start, hva_end;
- gfn_t gfn_start, gfn_end;
-
- hva_start = max(start, memslot->userspace_addr);
- hva_end = min(end, memslot->userspace_addr +
- (memslot->npages << PAGE_SHIFT));
- if (hva_start >= hva_end)
- continue;
- /*
- * {gfn(page) | page intersects with [hva_start, hva_end)} =
- * {gfn_start, gfn_start+1, ..., gfn_end-1}.
- */
- gfn_start = hva_to_gfn_memslot(hva_start, memslot);
- gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
-
- for (j = PT_PAGE_TABLE_LEVEL;
- j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) {
- unsigned long idx, idx_end;
- unsigned long *rmapp;
- gfn_t gfn = gfn_start;
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(memslot, slots) {
+ unsigned long hva_start, hva_end;
+ gfn_t gfn_start, gfn_end;
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
/*
- * {idx(page_j) | page_j intersects with
- * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}.
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn_start, gfn_start+1, ..., gfn_end-1}.
*/
- idx = gfn_to_index(gfn_start, memslot->base_gfn, j);
- idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j);
-
- rmapp = __gfn_to_rmap(gfn_start, j, memslot);
-
- for (; idx <= idx_end;
- ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j)))
- ret |= handler(kvm, rmapp++, memslot,
- gfn, j, data);
+ gfn_start = hva_to_gfn_memslot(hva_start, memslot);
+ gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+ for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
+ PT_MAX_HUGEPAGE_LEVEL,
+ gfn_start, gfn_end - 1,
+ &iterator)
+ ret |= handler(kvm, iterator.rmap, memslot,
+ iterator.gfn, iterator.level, data);
}
}
@@ -1518,16 +1576,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
BUG_ON(!shadow_accessed_mask);
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;
- sptep = rmap_get_next(&iter)) {
- BUG_ON(!is_shadow_present_pte(*sptep));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
if (*sptep & shadow_accessed_mask) {
young = 1;
clear_bit((ffs(shadow_accessed_mask) - 1),
(unsigned long *)sptep);
}
- }
+
trace_kvm_age_page(gfn, level, slot, young);
return young;
}
@@ -1548,15 +1603,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
if (!shadow_accessed_mask)
goto out;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;
- sptep = rmap_get_next(&iter)) {
- BUG_ON(!is_shadow_present_pte(*sptep));
-
+ for_each_rmap_spte(rmapp, &iter, sptep)
if (*sptep & shadow_accessed_mask) {
young = 1;
break;
}
- }
out:
return young;
}
@@ -1570,7 +1621,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
sp = page_header(__pa(spte));
- rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp);
kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0);
kvm_flush_remote_tlbs(vcpu->kvm);
@@ -1990,7 +2041,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
bool protected = false;
for_each_sp(pages, sp, parents, i)
- protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
+ protected |= rmap_write_protect(vcpu, sp->gfn);
if (protected)
kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2088,12 +2139,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
hlist_add_head(&sp->hash_link,
&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
if (!direct) {
- if (rmap_write_protect(vcpu->kvm, gfn))
+ if (rmap_write_protect(vcpu, gfn))
kvm_flush_remote_tlbs(vcpu->kvm);
if (level > PT_PAGE_TABLE_LEVEL && need_sync)
kvm_sync_pages(vcpu, gfn);
- account_shadowed(vcpu->kvm, gfn);
+ account_shadowed(vcpu->kvm, sp);
}
sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
init_shadow_page_table(sp);
@@ -2274,7 +2325,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
kvm_mmu_unlink_parents(kvm, sp);
if (!sp->role.invalid && !sp->role.direct)
- unaccount_shadowed(kvm, sp->gfn);
+ unaccount_shadowed(kvm, sp);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
@@ -2386,111 +2437,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
-/*
- * The function is based on mtrr_type_lookup() in
- * arch/x86/kernel/cpu/mtrr/generic.c
- */
-static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
- u64 start, u64 end)
-{
- int i;
- u64 base, mask;
- u8 prev_match, curr_match;
- int num_var_ranges = KVM_NR_VAR_MTRR;
-
- if (!mtrr_state->enabled)
- return 0xFF;
-
- /* Make end inclusive end, instead of exclusive */
- end--;
-
- /* Look in fixed ranges. Just return the type as per start */
- if (mtrr_state->have_fixed && (start < 0x100000)) {
- int idx;
-
- if (start < 0x80000) {
- idx = 0;
- idx += (start >> 16);
- return mtrr_state->fixed_ranges[idx];
- } else if (start < 0xC0000) {
- idx = 1 * 8;
- idx += ((start - 0x80000) >> 14);
- return mtrr_state->fixed_ranges[idx];
- } else if (start < 0x1000000) {
- idx = 3 * 8;
- idx += ((start - 0xC0000) >> 12);
- return mtrr_state->fixed_ranges[idx];
- }
- }
-
- /*
- * Look in variable ranges
- * Look of multiple ranges matching this address and pick type
- * as per MTRR precedence
- */
- if (!(mtrr_state->enabled & 2))
- return mtrr_state->def_type;
-
- prev_match = 0xFF;
- for (i = 0; i < num_var_ranges; ++i) {
- unsigned short start_state, end_state;
-
- if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
- continue;
-
- base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
- (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
- mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
- (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
-
- start_state = ((start & mask) == (base & mask));
- end_state = ((end & mask) == (base & mask));
- if (start_state != end_state)
- return 0xFE;
-
- if ((start & mask) != (base & mask))
- continue;
-
- curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
- if (prev_match == 0xFF) {
- prev_match = curr_match;
- continue;
- }
-
- if (prev_match == MTRR_TYPE_UNCACHABLE ||
- curr_match == MTRR_TYPE_UNCACHABLE)
- return MTRR_TYPE_UNCACHABLE;
-
- if ((prev_match == MTRR_TYPE_WRBACK &&
- curr_match == MTRR_TYPE_WRTHROUGH) ||
- (prev_match == MTRR_TYPE_WRTHROUGH &&
- curr_match == MTRR_TYPE_WRBACK)) {
- prev_match = MTRR_TYPE_WRTHROUGH;
- curr_match = MTRR_TYPE_WRTHROUGH;
- }
-
- if (prev_match != curr_match)
- return MTRR_TYPE_UNCACHABLE;
- }
-
- if (prev_match != 0xFF)
- return prev_match;
-
- return mtrr_state->def_type;
-}
-
-u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
-{
- u8 mtrr;
-
- mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
- (gfn << PAGE_SHIFT) + PAGE_SIZE);
- if (mtrr == 0xfe || mtrr == 0xff)
- mtrr = MTRR_TYPE_WRBACK;
- return mtrr;
-}
-EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
-
static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
trace_kvm_mmu_unsync_page(sp);
@@ -2541,7 +2487,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
u64 spte;
int ret = 0;
- if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access))
+ if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;
spte = PT_PRESENT_MASK;
@@ -2578,7 +2524,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* be fixed if guest refault.
*/
if (level > PT_PAGE_TABLE_LEVEL &&
- has_wrprotected_page(vcpu->kvm, gfn, level))
+ has_wrprotected_page(vcpu, gfn, level))
goto done;
spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@ -2602,7 +2548,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
if (pte_access & ACC_WRITE_MASK) {
- mark_page_dirty(vcpu->kvm, gfn);
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
spte |= shadow_dirty_mask;
}
@@ -2692,15 +2638,17 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
u64 *start, u64 *end)
{
struct page *pages[PTE_PREFETCH_NUM];
+ struct kvm_memory_slot *slot;
unsigned access = sp->role.access;
int i, ret;
gfn_t gfn;
gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
- if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
+ slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
+ if (!slot)
return -1;
- ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
+ ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
if (ret <= 0)
return -1;
@@ -2818,7 +2766,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
return 1;
if (pfn == KVM_PFN_ERR_HWPOISON) {
- kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
+ kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
return 0;
}
@@ -2841,7 +2789,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
level == PT_PAGE_TABLE_LEVEL &&
PageTransCompound(pfn_to_page(pfn)) &&
- !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+ !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
unsigned long mask;
/*
* mmu_notifier_retry was successful and we hold the
@@ -2933,7 +2881,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* Compare with set_spte where instead shadow_dirty_mask is set.
*/
if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
- mark_page_dirty(vcpu->kvm, gfn);
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
return true;
}
@@ -3388,7 +3336,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
- if (!check_mmio_spte(vcpu->kvm, spte))
+ if (!check_mmio_spte(vcpu, spte))
return RET_MMIO_PF_INVALID;
if (direct)
@@ -3460,7 +3408,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
arch.direct_map = vcpu->arch.mmu.direct_map;
arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
- return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch);
+ return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
@@ -3475,10 +3423,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable)
{
+ struct kvm_memory_slot *slot;
bool async;
- *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
-
+ slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ async = false;
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
if (!async)
return false; /* *pfn has correct page already */
@@ -3492,11 +3442,20 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
return true;
}
- *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
-
+ *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
return false;
}
+static bool
+check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+{
+ int page_num = KVM_PAGES_PER_HPAGE(level);
+
+ gfn &= ~(page_num - 1);
+
+ return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
+}
+
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
bool prefault)
{
@@ -3522,9 +3481,17 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (r)
return r;
- force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ if (mapping_level_dirty_bitmap(vcpu, gfn) ||
+ !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL))
+ force_pt_level = 1;
+ else
+ force_pt_level = 0;
+
if (likely(!force_pt_level)) {
level = mapping_level(vcpu, gfn);
+ if (level > PT_DIRECTORY_LEVEL &&
+ !check_hugepage_cache_consistency(vcpu, gfn, level))
+ level = PT_DIRECTORY_LEVEL;
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
} else
level = PT_PAGE_TABLE_LEVEL;
@@ -3590,7 +3557,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu,
vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}
-static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
+static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
unsigned access, int *nr_present)
{
if (unlikely(is_mmio_spte(*sptep))) {
@@ -3600,7 +3567,7 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn,
}
(*nr_present)++;
- mark_mmio_spte(kvm, sptep, gfn, access);
+ mark_mmio_spte(vcpu, sptep, gfn, access);
return true;
}
@@ -3878,6 +3845,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
struct kvm_mmu *context = &vcpu->arch.mmu;
context->base_role.word = 0;
+ context->base_role.smm = is_smm(vcpu);
context->page_fault = tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
@@ -3939,6 +3907,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
= smep && !is_write_protection(vcpu);
context->base_role.smap_andnot_wp
= smap && !is_write_protection(vcpu);
+ context->base_role.smm = is_smm(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
@@ -4110,7 +4079,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
*gpa &= ~(gpa_t)7;
*bytes = 8;
- r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
+ r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8);
if (r)
gentry = 0;
new = (const u8 *)&gentry;
@@ -4222,6 +4191,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
mask.nxe = 1;
mask.smep_andnot_wp = 1;
mask.smap_andnot_wp = 1;
+ mask.smm = 1;
/*
* If we don't have indirect shadow pages, it means no page is
@@ -4420,36 +4390,115 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
init_kvm_mmu(vcpu);
}
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap);
+
+/* The caller should hold mmu-lock before calling this function. */
+static bool
+slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
{
- gfn_t last_gfn;
- int i;
+ struct slot_rmap_walk_iterator iterator;
bool flush = false;
- last_gfn = memslot->base_gfn + memslot->npages - 1;
+ for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+ end_gfn, &iterator) {
+ if (iterator.rmap)
+ flush |= fn(kvm, iterator.rmap);
- spin_lock(&kvm->mmu_lock);
+ if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
+ cond_resched_lock(&kvm->mmu_lock);
+ }
+ }
- for (i = PT_PAGE_TABLE_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- unsigned long *rmapp;
- unsigned long last_index, index;
+ if (flush && lock_flush_tlb) {
+ kvm_flush_remote_tlbs(kvm);
+ flush = false;
+ }
- rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
- last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+ return flush;
+}
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= __rmap_write_protect(kvm, rmapp,
- false);
+static bool
+slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, int start_level, int end_level,
+ bool lock_flush_tlb)
+{
+ return slot_handle_level_range(kvm, memslot, fn, start_level,
+ end_level, memslot->base_gfn,
+ memslot->base_gfn + memslot->npages - 1,
+ lock_flush_tlb);
+}
+
+static bool
+slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static bool
+slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static bool
+slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+ slot_level_handler fn, bool lock_flush_tlb)
+{
+ return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+ PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+}
- if (need_resched() || spin_needbreak(&kvm->mmu_lock))
- cond_resched_lock(&kvm->mmu_lock);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int i;
+
+ spin_lock(&kvm->mmu_lock);
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot(memslot, slots) {
+ gfn_t start, end;
+
+ start = max(gfn_start, memslot->base_gfn);
+ end = min(gfn_end, memslot->base_gfn + memslot->npages);
+ if (start >= end)
+ continue;
+
+ slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+ PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
+ start, end - 1, true);
}
}
spin_unlock(&kvm->mmu_lock);
+}
+
+static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp)
+{
+ return __rmap_write_protect(kvm, rmapp, false);
+}
+
+void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+ struct kvm_memory_slot *memslot)
+{
+ bool flush;
+
+ spin_lock(&kvm->mmu_lock);
+ flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
+ false);
+ spin_unlock(&kvm->mmu_lock);
/*
* kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
@@ -4482,9 +4531,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
pfn_t pfn;
struct kvm_mmu_page *sp;
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
- BUG_ON(!(*sptep & PT_PRESENT_MASK));
-
+restart:
+ for_each_rmap_spte(rmapp, &iter, sptep) {
sp = page_header(__pa(sptep));
pfn = spte_to_pfn(*sptep);
@@ -4499,71 +4547,31 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
!kvm_is_reserved_pfn(pfn) &&
PageTransCompound(pfn_to_page(pfn))) {
drop_spte(kvm, sptep);
- sptep = rmap_get_first(*rmapp, &iter);
need_tlb_flush = 1;
- } else
- sptep = rmap_get_next(&iter);
+ goto restart;
+ }
}
return need_tlb_flush;
}
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
- struct kvm_memory_slot *memslot)
+ const struct kvm_memory_slot *memslot)
{
- bool flush = false;
- unsigned long *rmapp;
- unsigned long last_index, index;
-
+ /* FIXME: const-ify all uses of struct kvm_memory_slot. */
spin_lock(&kvm->mmu_lock);
-
- rmapp = memslot->arch.rmap[0];
- last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1,
- memslot->base_gfn, PT_PAGE_TABLE_LEVEL);
-
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp);
-
- if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
- if (flush) {
- kvm_flush_remote_tlbs(kvm);
- flush = false;
- }
- cond_resched_lock(&kvm->mmu_lock);
- }
- }
-
- if (flush)
- kvm_flush_remote_tlbs(kvm);
-
+ slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
+ kvm_mmu_zap_collapsible_spte, true);
spin_unlock(&kvm->mmu_lock);
}
void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- gfn_t last_gfn;
- unsigned long *rmapp;
- unsigned long last_index, index;
- bool flush = false;
-
- last_gfn = memslot->base_gfn + memslot->npages - 1;
+ bool flush;
spin_lock(&kvm->mmu_lock);
-
- rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1];
- last_index = gfn_to_index(last_gfn, memslot->base_gfn,
- PT_PAGE_TABLE_LEVEL);
-
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= __rmap_clear_dirty(kvm, rmapp);
-
- if (need_resched() || spin_needbreak(&kvm->mmu_lock))
- cond_resched_lock(&kvm->mmu_lock);
- }
-
+ flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
spin_unlock(&kvm->mmu_lock);
lockdep_assert_held(&kvm->slots_lock);
@@ -4582,31 +4590,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- gfn_t last_gfn;
- int i;
- bool flush = false;
-
- last_gfn = memslot->base_gfn + memslot->npages - 1;
+ bool flush;
spin_lock(&kvm->mmu_lock);
-
- for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- unsigned long *rmapp;
- unsigned long last_index, index;
-
- rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
- last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
-
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= __rmap_write_protect(kvm, rmapp,
- false);
-
- if (need_resched() || spin_needbreak(&kvm->mmu_lock))
- cond_resched_lock(&kvm->mmu_lock);
- }
- }
+ flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
+ false);
spin_unlock(&kvm->mmu_lock);
/* see kvm_mmu_slot_remove_write_access */
@@ -4620,31 +4608,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
void kvm_mmu_slot_set_dirty(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
- gfn_t last_gfn;
- int i;
- bool flush = false;
-
- last_gfn = memslot->base_gfn + memslot->npages - 1;
+ bool flush;
spin_lock(&kvm->mmu_lock);
-
- for (i = PT_PAGE_TABLE_LEVEL;
- i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- unsigned long *rmapp;
- unsigned long last_index, index;
-
- rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
- last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
-
- for (index = 0; index <= last_index; ++index, ++rmapp) {
- if (*rmapp)
- flush |= __rmap_set_dirty(kvm, rmapp);
-
- if (need_resched() || spin_needbreak(&kvm->mmu_lock))
- cond_resched_lock(&kvm->mmu_lock);
- }
- }
-
+ flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
spin_unlock(&kvm->mmu_lock);
lockdep_assert_held(&kvm->slots_lock);
@@ -4741,13 +4708,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
}
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm)
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
{
/*
* The very rare case: if the generation-number is round,
* zap all shadow pages.
*/
- if (unlikely(kvm_current_mmio_generation(kvm) == 0)) {
+ if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n");
kvm_mmu_invalidate_zap_all_pages(kvm);
}
@@ -4869,15 +4836,18 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
unsigned int nr_pages = 0;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
+ int i;
- slots = kvm_memslots(kvm);
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
- kvm_for_each_memslot(memslot, slots)
- nr_pages += memslot->npages;
+ kvm_for_each_memslot(memslot, slots)
+ nr_pages += memslot->npages;
+ }
nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
nr_mmu_pages = max(nr_mmu_pages,
- (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
+ (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
return nr_mmu_pages;
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 0ada65ecddcf..398d21c0f6dd 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -43,6 +43,7 @@
#define PT_PDPE_LEVEL 3
#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1
+#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1)
static inline u64 rsvd_bits(int s, int e)
{
@@ -170,4 +171,5 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}
void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
+void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
#endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 9ade5cfb5a4c..a4f62e6f2db2 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -114,7 +114,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
return;
gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+ pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn);
if (is_error_pfn(pfn))
return;
@@ -131,12 +131,16 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
unsigned long *rmapp;
struct kvm_mmu_page *rev_sp;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
gfn_t gfn;
rev_sp = page_header(__pa(sptep));
gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
- if (!gfn_to_memslot(kvm, gfn)) {
+ slots = kvm_memslots_for_spte_role(kvm, rev_sp->role);
+ slot = __gfn_to_memslot(slots, gfn);
+ if (!slot) {
if (!__ratelimit(&ratelimit_state))
return;
audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
@@ -146,7 +150,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
return;
}
- rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+ rmapp = __gfn_to_rmap(gfn, rev_sp->role.level, slot);
if (!*rmapp) {
if (!__ratelimit(&ratelimit_state))
return;
@@ -191,19 +195,21 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
unsigned long *rmapp;
u64 *sptep;
struct rmap_iterator iter;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *slot;
if (sp->role.direct || sp->unsync || sp->role.invalid)
return;
- rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL);
+ slots = kvm_memslots_for_spte_role(kvm, sp->role);
+ slot = __gfn_to_memslot(slots, sp->gfn);
+ rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot);
- for (sptep = rmap_get_first(*rmapp, &iter); sptep;
- sptep = rmap_get_next(&iter)) {
+ for_each_rmap_spte(rmapp, &iter, sptep)
if (is_writable_pte(*sptep))
audit_printk(kvm, "shadow page has writable "
"mappings: gfn %llx role %x\n",
sp->gfn, sp->role.word);
- }
}
static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index ce463a9cc8fb..5a24b846a1cb 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -2,7 +2,7 @@
#define _TRACE_KVMMMU_H
#include <linux/tracepoint.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvmmmu
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
new file mode 100644
index 000000000000..de1d2d8062e2
--- /dev/null
+++ b/arch/x86/kvm/mtrr.c
@@ -0,0 +1,699 @@
+/*
+ * vMTRR implementation
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ * Copyright(C) 2015 Intel Corporation.
+ *
+ * Authors:
+ * Yaniv Kamay <yaniv@qumranet.com>
+ * Avi Kivity <avi@qumranet.com>
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ * Paolo Bonzini <pbonzini@redhat.com>
+ * Xiao Guangrong <guangrong.xiao@linux.intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/mtrr.h>
+
+#include "cpuid.h"
+#include "mmu.h"
+
+#define IA32_MTRR_DEF_TYPE_E (1ULL << 11)
+#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10)
+#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff)
+
+static bool msr_mtrr_valid(unsigned msr)
+{
+ switch (msr) {
+ case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
+ case MSR_MTRRfix64K_00000:
+ case MSR_MTRRfix16K_80000:
+ case MSR_MTRRfix16K_A0000:
+ case MSR_MTRRfix4K_C0000:
+ case MSR_MTRRfix4K_C8000:
+ case MSR_MTRRfix4K_D0000:
+ case MSR_MTRRfix4K_D8000:
+ case MSR_MTRRfix4K_E0000:
+ case MSR_MTRRfix4K_E8000:
+ case MSR_MTRRfix4K_F0000:
+ case MSR_MTRRfix4K_F8000:
+ case MSR_MTRRdefType:
+ case MSR_IA32_CR_PAT:
+ return true;
+ case 0x2f8:
+ return true;
+ }
+ return false;
+}
+
+static bool valid_pat_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
+}
+
+static bool valid_mtrr_type(unsigned t)
+{
+ return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
+}
+
+bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int i;
+ u64 mask;
+
+ if (!msr_mtrr_valid(msr))
+ return false;
+
+ if (msr == MSR_IA32_CR_PAT) {
+ for (i = 0; i < 8; i++)
+ if (!valid_pat_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ } else if (msr == MSR_MTRRdefType) {
+ if (data & ~0xcff)
+ return false;
+ return valid_mtrr_type(data & 0xff);
+ } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
+ for (i = 0; i < 8 ; i++)
+ if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
+ return false;
+ return true;
+ }
+
+ /* variable MTRRs */
+ WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
+
+ mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
+ if ((msr & 1) == 0) {
+ /* MTRR base */
+ if (!valid_mtrr_type(data & 0xff))
+ return false;
+ mask |= 0xf00;
+ } else
+ /* MTRR mask */
+ mask |= 0x7ff;
+ if (data & mask) {
+ kvm_inject_gp(vcpu, 0);
+ return false;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
+
+static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
+{
+ return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E);
+}
+
+static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state)
+{
+ return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE);
+}
+
+static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state)
+{
+ return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK;
+}
+
+/*
+* Three terms are used in the following code:
+* - segment, it indicates the address segments covered by fixed MTRRs.
+* - unit, it corresponds to the MSR entry in the segment.
+* - range, a range is covered in one memory cache type.
+*/
+struct fixed_mtrr_segment {
+ u64 start;
+ u64 end;
+
+ int range_shift;
+
+ /* the start position in kvm_mtrr.fixed_ranges[]. */
+ int range_start;
+};
+
+static struct fixed_mtrr_segment fixed_seg_table[] = {
+ /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */
+ {
+ .start = 0x0,
+ .end = 0x80000,
+ .range_shift = 16, /* 64K */
+ .range_start = 0,
+ },
+
+ /*
+ * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units,
+ * 16K fixed mtrr.
+ */
+ {
+ .start = 0x80000,
+ .end = 0xc0000,
+ .range_shift = 14, /* 16K */
+ .range_start = 8,
+ },
+
+ /*
+ * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units,
+ * 4K fixed mtrr.
+ */
+ {
+ .start = 0xc0000,
+ .end = 0x100000,
+ .range_shift = 12, /* 12K */
+ .range_start = 24,
+ }
+};
+
+/*
+ * The size of unit is covered in one MSR, one MSR entry contains
+ * 8 ranges so that unit size is always 8 * 2^range_shift.
+ */
+static u64 fixed_mtrr_seg_unit_size(int seg)
+{
+ return 8 << fixed_seg_table[seg].range_shift;
+}
+
+static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit)
+{
+ switch (msr) {
+ case MSR_MTRRfix64K_00000:
+ *seg = 0;
+ *unit = 0;
+ break;
+ case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
+ *seg = 1;
+ *unit = msr - MSR_MTRRfix16K_80000;
+ break;
+ case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+ *seg = 2;
+ *unit = msr - MSR_MTRRfix4K_C0000;
+ break;
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ u64 unit_size = fixed_mtrr_seg_unit_size(seg);
+
+ *start = mtrr_seg->start + unit * unit_size;
+ *end = *start + unit_size;
+ WARN_ON(*end > mtrr_seg->end);
+}
+
+static int fixed_mtrr_seg_unit_range_index(int seg, int unit)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+
+ WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg)
+ > mtrr_seg->end);
+
+ /* each unit has 8 ranges. */
+ return mtrr_seg->range_start + 8 * unit;
+}
+
+static int fixed_mtrr_seg_end_range_index(int seg)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ int n;
+
+ n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift;
+ return mtrr_seg->range_start + n - 1;
+}
+
+static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end)
+{
+ int seg, unit;
+
+ if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
+ return false;
+
+ fixed_mtrr_seg_unit_range(seg, unit, start, end);
+ return true;
+}
+
+static int fixed_msr_to_range_index(u32 msr)
+{
+ int seg, unit;
+
+ if (!fixed_msr_to_seg_unit(msr, &seg, &unit))
+ return -1;
+
+ return fixed_mtrr_seg_unit_range_index(seg, unit);
+}
+
+static int fixed_mtrr_addr_to_seg(u64 addr)
+{
+ struct fixed_mtrr_segment *mtrr_seg;
+ int seg, seg_num = ARRAY_SIZE(fixed_seg_table);
+
+ for (seg = 0; seg < seg_num; seg++) {
+ mtrr_seg = &fixed_seg_table[seg];
+ if (mtrr_seg->start >= addr && addr < mtrr_seg->end)
+ return seg;
+ }
+
+ return -1;
+}
+
+static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg)
+{
+ struct fixed_mtrr_segment *mtrr_seg;
+ int index;
+
+ mtrr_seg = &fixed_seg_table[seg];
+ index = mtrr_seg->range_start;
+ index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift;
+ return index;
+}
+
+static u64 fixed_mtrr_range_end_addr(int seg, int index)
+{
+ struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg];
+ int pos = index - mtrr_seg->range_start;
+
+ return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift);
+}
+
+static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end)
+{
+ u64 mask;
+
+ *start = range->base & PAGE_MASK;
+
+ mask = range->mask & PAGE_MASK;
+ mask |= ~0ULL << boot_cpu_data.x86_phys_bits;
+
+ /* This cannot overflow because writing to the reserved bits of
+ * variable MTRRs causes a #GP.
+ */
+ *end = (*start | ~mask) + 1;
+}
+
+static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ gfn_t start, end;
+ int index;
+
+ if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
+ !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return;
+
+ if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType)
+ return;
+
+ /* fixed MTRRs. */
+ if (fixed_msr_to_range(msr, &start, &end)) {
+ if (!fixed_mtrr_is_enabled(mtrr_state))
+ return;
+ } else if (msr == MSR_MTRRdefType) {
+ start = 0x0;
+ end = ~0ULL;
+ } else {
+ /* variable range MTRRs. */
+ index = (msr - 0x200) / 2;
+ var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end);
+ }
+
+ kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+}
+
+static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range)
+{
+ return (range->mask & (1 << 11)) != 0;
+}
+
+static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct kvm_mtrr_range *tmp, *cur;
+ int index, is_mtrr_mask;
+
+ index = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * index;
+ cur = &mtrr_state->var_ranges[index];
+
+ /* remove the entry if it's in the list. */
+ if (var_mtrr_range_is_valid(cur))
+ list_del(&mtrr_state->var_ranges[index].node);
+
+ if (!is_mtrr_mask)
+ cur->base = data;
+ else
+ cur->mask = data;
+
+ /* add it to the list if it's enabled. */
+ if (var_mtrr_range_is_valid(cur)) {
+ list_for_each_entry(tmp, &mtrr_state->head, node)
+ if (cur->base >= tmp->base)
+ break;
+ list_add_tail(&cur->node, &tmp->node);
+ }
+}
+
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+{
+ int index;
+
+ if (!kvm_mtrr_valid(vcpu, msr, data))
+ return 1;
+
+ index = fixed_msr_to_range_index(msr);
+ if (index >= 0)
+ *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data;
+ else if (msr == MSR_MTRRdefType)
+ vcpu->arch.mtrr_state.deftype = data;
+ else if (msr == MSR_IA32_CR_PAT)
+ vcpu->arch.pat = data;
+ else
+ set_var_mtrr_msr(vcpu, msr, data);
+
+ update_mtrr(vcpu, msr);
+ return 0;
+}
+
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+{
+ int index;
+
+ /* MSR_MTRRcap is a readonly MSR. */
+ if (msr == MSR_MTRRcap) {
+ /*
+ * SMRR = 0
+ * WC = 1
+ * FIX = 1
+ * VCNT = KVM_NR_VAR_MTRR
+ */
+ *pdata = 0x500 | KVM_NR_VAR_MTRR;
+ return 0;
+ }
+
+ if (!msr_mtrr_valid(msr))
+ return 1;
+
+ index = fixed_msr_to_range_index(msr);
+ if (index >= 0)
+ *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index];
+ else if (msr == MSR_MTRRdefType)
+ *pdata = vcpu->arch.mtrr_state.deftype;
+ else if (msr == MSR_IA32_CR_PAT)
+ *pdata = vcpu->arch.pat;
+ else { /* Variable MTRRs */
+ int is_mtrr_mask;
+
+ index = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * index;
+ if (!is_mtrr_mask)
+ *pdata = vcpu->arch.mtrr_state.var_ranges[index].base;
+ else
+ *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask;
+ }
+
+ return 0;
+}
+
+void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu)
+{
+ INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head);
+}
+
+struct mtrr_iter {
+ /* input fields. */
+ struct kvm_mtrr *mtrr_state;
+ u64 start;
+ u64 end;
+
+ /* output fields. */
+ int mem_type;
+ /* [start, end) is not fully covered in MTRRs? */
+ bool partial_map;
+
+ /* private fields. */
+ union {
+ /* used for fixed MTRRs. */
+ struct {
+ int index;
+ int seg;
+ };
+
+ /* used for var MTRRs. */
+ struct {
+ struct kvm_mtrr_range *range;
+ /* max address has been covered in var MTRRs. */
+ u64 start_max;
+ };
+ };
+
+ bool fixed;
+};
+
+static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter)
+{
+ int seg, index;
+
+ if (!fixed_mtrr_is_enabled(iter->mtrr_state))
+ return false;
+
+ seg = fixed_mtrr_addr_to_seg(iter->start);
+ if (seg < 0)
+ return false;
+
+ iter->fixed = true;
+ index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg);
+ iter->index = index;
+ iter->seg = seg;
+ return true;
+}
+
+static bool match_var_range(struct mtrr_iter *iter,
+ struct kvm_mtrr_range *range)
+{
+ u64 start, end;
+
+ var_mtrr_range(range, &start, &end);
+ if (!(start >= iter->end || end <= iter->start)) {
+ iter->range = range;
+
+ /*
+ * the function is called when we do kvm_mtrr.head walking.
+ * Range has the minimum base address which interleaves
+ * [looker->start_max, looker->end).
+ */
+ iter->partial_map |= iter->start_max < start;
+
+ /* update the max address has been covered. */
+ iter->start_max = max(iter->start_max, end);
+ return true;
+ }
+
+ return false;
+}
+
+static void __mtrr_lookup_var_next(struct mtrr_iter *iter)
+{
+ struct kvm_mtrr *mtrr_state = iter->mtrr_state;
+
+ list_for_each_entry_continue(iter->range, &mtrr_state->head, node)
+ if (match_var_range(iter, iter->range))
+ return;
+
+ iter->range = NULL;
+ iter->partial_map |= iter->start_max < iter->end;
+}
+
+static void mtrr_lookup_var_start(struct mtrr_iter *iter)
+{
+ struct kvm_mtrr *mtrr_state = iter->mtrr_state;
+
+ iter->fixed = false;
+ iter->start_max = iter->start;
+ iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node);
+
+ __mtrr_lookup_var_next(iter);
+}
+
+static void mtrr_lookup_fixed_next(struct mtrr_iter *iter)
+{
+ /* terminate the lookup. */
+ if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) {
+ iter->fixed = false;
+ iter->range = NULL;
+ return;
+ }
+
+ iter->index++;
+
+ /* have looked up for all fixed MTRRs. */
+ if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges))
+ return mtrr_lookup_var_start(iter);
+
+ /* switch to next segment. */
+ if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg))
+ iter->seg++;
+}
+
+static void mtrr_lookup_var_next(struct mtrr_iter *iter)
+{
+ __mtrr_lookup_var_next(iter);
+}
+
+static void mtrr_lookup_start(struct mtrr_iter *iter)
+{
+ if (!mtrr_is_enabled(iter->mtrr_state)) {
+ iter->partial_map = true;
+ return;
+ }
+
+ if (!mtrr_lookup_fixed_start(iter))
+ mtrr_lookup_var_start(iter);
+}
+
+static void mtrr_lookup_init(struct mtrr_iter *iter,
+ struct kvm_mtrr *mtrr_state, u64 start, u64 end)
+{
+ iter->mtrr_state = mtrr_state;
+ iter->start = start;
+ iter->end = end;
+ iter->partial_map = false;
+ iter->fixed = false;
+ iter->range = NULL;
+
+ mtrr_lookup_start(iter);
+}
+
+static bool mtrr_lookup_okay(struct mtrr_iter *iter)
+{
+ if (iter->fixed) {
+ iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index];
+ return true;
+ }
+
+ if (iter->range) {
+ iter->mem_type = iter->range->base & 0xff;
+ return true;
+ }
+
+ return false;
+}
+
+static void mtrr_lookup_next(struct mtrr_iter *iter)
+{
+ if (iter->fixed)
+ mtrr_lookup_fixed_next(iter);
+ else
+ mtrr_lookup_var_next(iter);
+}
+
+#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \
+ for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \
+ mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_))
+
+u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct mtrr_iter iter;
+ u64 start, end;
+ int type = -1;
+ const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK)
+ | (1 << MTRR_TYPE_WRTHROUGH);
+
+ start = gfn_to_gpa(gfn);
+ end = start + PAGE_SIZE;
+
+ mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
+ int curr_type = iter.mem_type;
+
+ /*
+ * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR
+ * Precedences.
+ */
+
+ if (type == -1) {
+ type = curr_type;
+ continue;
+ }
+
+ /*
+ * If two or more variable memory ranges match and the
+ * memory types are identical, then that memory type is
+ * used.
+ */
+ if (type == curr_type)
+ continue;
+
+ /*
+ * If two or more variable memory ranges match and one of
+ * the memory types is UC, the UC memory type used.
+ */
+ if (curr_type == MTRR_TYPE_UNCACHABLE)
+ return MTRR_TYPE_UNCACHABLE;
+
+ /*
+ * If two or more variable memory ranges match and the
+ * memory types are WT and WB, the WT memory type is used.
+ */
+ if (((1 << type) & wt_wb_mask) &&
+ ((1 << curr_type) & wt_wb_mask)) {
+ type = MTRR_TYPE_WRTHROUGH;
+ continue;
+ }
+
+ /*
+ * For overlaps not defined by the above rules, processor
+ * behavior is undefined.
+ */
+
+ /* We use WB for this undefined behavior. :( */
+ return MTRR_TYPE_WRBACK;
+ }
+
+ /* It is not covered by MTRRs. */
+ if (iter.partial_map) {
+ /*
+ * We just check one page, partially covered by MTRRs is
+ * impossible.
+ */
+ WARN_ON(type != -1);
+ type = mtrr_default_type(mtrr_state);
+ }
+ return type;
+}
+EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type);
+
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int page_num)
+{
+ struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state;
+ struct mtrr_iter iter;
+ u64 start, end;
+ int type = -1;
+
+ start = gfn_to_gpa(gfn);
+ end = gfn_to_gpa(gfn + page_num);
+ mtrr_for_each_mem_type(&iter, mtrr_state, start, end) {
+ if (type == -1) {
+ type = iter.mem_type;
+ continue;
+ }
+
+ if (type != iter.mem_type)
+ return false;
+ }
+
+ if (!iter.partial_map)
+ return true;
+
+ if (type == -1)
+ return true;
+
+ return type == mtrr_default_type(mtrr_state);
+}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6e6d115fe9b5..0f67d7e24800 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -256,7 +256,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
if (ret)
return ret;
- mark_page_dirty(vcpu->kvm, table_gfn);
+ kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
walker->ptes[level] = pte;
}
return 0;
@@ -338,7 +338,7 @@ retry_walk:
real_gfn = gpa_to_gfn(real_gfn);
- host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn,
+ host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn,
&walker->pte_writable[walker->level - 1]);
if (unlikely(kvm_is_error_hva(host_addr)))
goto error;
@@ -511,11 +511,11 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
base_gpa = pte_gpa & ~mask;
index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
- r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
+ r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
curr_pte = gw->prefetch_ptes[index];
} else
- r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
+ r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
&curr_pte, sizeof(curr_pte));
return r || curr_pte != gw->ptes[level - 1];
@@ -869,8 +869,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (!rmap_can_add(vcpu))
break;
- if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
- sizeof(pt_element_t)))
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
break;
FNAME(update_pte)(vcpu, sp, sptep, &gpte);
@@ -956,8 +956,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
- if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
- sizeof(pt_element_t)))
+ if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
+ sizeof(pt_element_t)))
return -EINVAL;
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
@@ -970,7 +970,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pte_access &= FNAME(gpte_access)(vcpu, gpte);
FNAME(protect_clean_gpte)(&pte_access, gpte);
- if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access,
+ if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
&nr_present))
continue;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 29fbf9dfdc54..31aa2c85dc97 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -1,11 +1,12 @@
/*
* Kernel-based Virtual Machine -- Performance Monitoring Unit support
*
- * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ * Copyright 2015 Red Hat, Inc. and/or its affiliates.
*
* Authors:
* Avi Kivity <avi@redhat.com>
* Gleb Natapov <gleb@redhat.com>
+ * Wei Huang <wei@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2. See
* the COPYING file in the top-level directory.
@@ -19,88 +20,39 @@
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
+#include "pmu.h"
+
+/* NOTE:
+ * - Each perf counter is defined as "struct kvm_pmc";
+ * - There are two types of perf counters: general purpose (gp) and fixed.
+ * gp counters are stored in gp_counters[] and fixed counters are stored
+ * in fixed_counters[] respectively. Both of them are part of "struct
+ * kvm_pmu";
+ * - pmu.c understands the difference between gp counters and fixed counters.
+ * However AMD doesn't support fixed-counters;
+ * - There are three types of index to access perf counters (PMC):
+ * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
+ * has MSR_K7_PERFCTRn.
+ * 2. MSR Index (named idx): This normally is used by RDPMC instruction.
+ * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access
+ * C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except
+ * that it also supports fixed counters. idx can be used to as index to
+ * gp and fixed counters.
+ * 3. Global PMC Index (named pmc): pmc is an index specific to PMU
+ * code. Each pmc, stored in kvm_pmc.idx field, is unique across
+ * all perf counters (both gp and fixed). The mapping relationship
+ * between pmc and perf counters is as the following:
+ * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters
+ * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed
+ * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
+ */
-static struct kvm_arch_event_perf_mapping {
- u8 eventsel;
- u8 unit_mask;
- unsigned event_type;
- bool inexact;
-} arch_events[] = {
- /* Index must match CPUID 0x0A.EBX bit vector */
- [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
- [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
- [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
- [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
- [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
- [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
- [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
- [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
-};
-
-/* mapping between fixed pmc index and arch_events array */
-static int fixed_pmc_events[] = {1, 0, 7};
-
-static bool pmc_is_gp(struct kvm_pmc *pmc)
-{
- return pmc->type == KVM_PMC_GP;
-}
-
-static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
-{
- struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
-
- return pmu->counter_bitmask[pmc->type];
-}
-
-static inline bool pmc_enabled(struct kvm_pmc *pmc)
-{
- struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
- return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
-}
-
-static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
- u32 base)
-{
- if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
- return &pmu->gp_counters[msr - base];
- return NULL;
-}
-
-static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
-{
- int base = MSR_CORE_PERF_FIXED_CTR0;
- if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
- return &pmu->fixed_counters[msr - base];
- return NULL;
-}
-
-static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx)
-{
- return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx);
-}
-
-static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx)
-{
- if (idx < INTEL_PMC_IDX_FIXED)
- return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0);
- else
- return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED);
-}
-
-void kvm_deliver_pmi(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.apic)
- kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
-}
-
-static void trigger_pmi(struct irq_work *irq_work)
+static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
- struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu,
- irq_work);
- struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu,
- arch.pmu);
+ struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
+ struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
- kvm_deliver_pmi(vcpu);
+ kvm_pmu_deliver_pmi(vcpu);
}
static void kvm_perf_overflow(struct perf_event *perf_event,
@@ -108,63 +60,46 @@ static void kvm_perf_overflow(struct perf_event *perf_event,
struct pt_regs *regs)
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
- struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
- if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (!test_and_set_bit(pmc->idx,
+ (unsigned long *)&pmu->reprogram_pmi)) {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}
}
static void kvm_perf_overflow_intr(struct perf_event *perf_event,
- struct perf_sample_data *data, struct pt_regs *regs)
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
{
struct kvm_pmc *pmc = perf_event->overflow_handler_context;
- struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu;
- if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) {
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ if (!test_and_set_bit(pmc->idx,
+ (unsigned long *)&pmu->reprogram_pmi)) {
__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
+
/*
* Inject PMI. If vcpu was in a guest mode during NMI PMI
* can be ejected on a guest mode re-entry. Otherwise we can't
* be sure that vcpu wasn't executing hlt instruction at the
- * time of vmexit and is not going to re-enter guest mode until,
+ * time of vmexit and is not going to re-enter guest mode until
* woken up. So we should wake it, but this is impossible from
* NMI context. Do it from irq work instead.
*/
if (!kvm_is_in_guest())
- irq_work_queue(&pmc->vcpu->arch.pmu.irq_work);
+ irq_work_queue(&pmc_to_pmu(pmc)->irq_work);
else
kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}
}
-static u64 read_pmc(struct kvm_pmc *pmc)
-{
- u64 counter, enabled, running;
-
- counter = pmc->counter;
-
- if (pmc->perf_event)
- counter += perf_event_read_value(pmc->perf_event,
- &enabled, &running);
-
- /* FIXME: Scaling needed? */
-
- return counter & pmc_bitmask(pmc);
-}
-
-static void stop_counter(struct kvm_pmc *pmc)
-{
- if (pmc->perf_event) {
- pmc->counter = read_pmc(pmc);
- perf_event_release_kernel(pmc->perf_event);
- pmc->perf_event = NULL;
- }
-}
-
-static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
- unsigned config, bool exclude_user, bool exclude_kernel,
- bool intr, bool in_tx, bool in_tx_cp)
+static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
+ unsigned config, bool exclude_user,
+ bool exclude_kernel, bool intr,
+ bool in_tx, bool in_tx_cp)
{
struct perf_event *event;
struct perf_event_attr attr = {
@@ -177,6 +112,7 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
.exclude_kernel = exclude_kernel,
.config = config,
};
+
if (in_tx)
attr.config |= HSW_IN_TX;
if (in_tx_cp)
@@ -188,33 +124,16 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type,
intr ? kvm_perf_overflow_intr :
kvm_perf_overflow, pmc);
if (IS_ERR(event)) {
- printk_once("kvm: pmu event creation failed %ld\n",
- PTR_ERR(event));
+ printk_once("kvm_pmu: event creation failed %ld\n",
+ PTR_ERR(event));
return;
}
pmc->perf_event = event;
- clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi);
-}
-
-static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select,
- u8 unit_mask)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(arch_events); i++)
- if (arch_events[i].eventsel == event_select
- && arch_events[i].unit_mask == unit_mask
- && (pmu->available_event_types & (1 << i)))
- break;
-
- if (i == ARRAY_SIZE(arch_events))
- return PERF_COUNT_HW_MAX;
-
- return arch_events[i].event_type;
+ clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi);
}
-static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
unsigned config, type = PERF_TYPE_RAW;
u8 event_select, unit_mask;
@@ -224,21 +143,22 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
pmc->eventsel = eventsel;
- stop_counter(pmc);
+ pmc_stop_counter(pmc);
- if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc))
+ if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc))
return;
event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE |
- ARCH_PERFMON_EVENTSEL_INV |
- ARCH_PERFMON_EVENTSEL_CMASK |
- HSW_IN_TX |
- HSW_IN_TX_CHECKPOINTED))) {
- config = find_arch_event(&pmc->vcpu->arch.pmu, event_select,
- unit_mask);
+ ARCH_PERFMON_EVENTSEL_INV |
+ ARCH_PERFMON_EVENTSEL_CMASK |
+ HSW_IN_TX |
+ HSW_IN_TX_CHECKPOINTED))) {
+ config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc),
+ event_select,
+ unit_mask);
if (config != PERF_COUNT_HW_MAX)
type = PERF_TYPE_HARDWARE;
}
@@ -246,56 +166,36 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
if (type == PERF_TYPE_RAW)
config = eventsel & X86_RAW_EVENT_MASK;
- reprogram_counter(pmc, type, config,
- !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
- !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
- eventsel & ARCH_PERFMON_EVENTSEL_INT,
- (eventsel & HSW_IN_TX),
- (eventsel & HSW_IN_TX_CHECKPOINTED));
+ pmc_reprogram_counter(pmc, type, config,
+ !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
+ !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
+ eventsel & ARCH_PERFMON_EVENTSEL_INT,
+ (eventsel & HSW_IN_TX),
+ (eventsel & HSW_IN_TX_CHECKPOINTED));
}
+EXPORT_SYMBOL_GPL(reprogram_gp_counter);
-static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx)
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
{
- unsigned en = en_pmi & 0x3;
- bool pmi = en_pmi & 0x8;
+ unsigned en_field = ctrl & 0x3;
+ bool pmi = ctrl & 0x8;
- stop_counter(pmc);
+ pmc_stop_counter(pmc);
- if (!en || !pmc_enabled(pmc))
+ if (!en_field || !pmc_is_enabled(pmc))
return;
- reprogram_counter(pmc, PERF_TYPE_HARDWARE,
- arch_events[fixed_pmc_events[idx]].event_type,
- !(en & 0x2), /* exclude user */
- !(en & 0x1), /* exclude kernel */
- pmi, false, false);
+ pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
+ kvm_x86_ops->pmu_ops->find_fixed_event(idx),
+ !(en_field & 0x2), /* exclude user */
+ !(en_field & 0x1), /* exclude kernel */
+ pmi, false, false);
}
+EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
-static inline u8 fixed_en_pmi(u64 ctrl, int idx)
+void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
- return (ctrl >> (idx * 4)) & 0xf;
-}
-
-static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
-{
- int i;
-
- for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
- u8 en_pmi = fixed_en_pmi(data, i);
- struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i);
-
- if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi)
- continue;
-
- reprogram_fixed_counter(pmc, en_pmi, i);
- }
-
- pmu->fixed_ctr_ctrl = data;
-}
-
-static void reprogram_idx(struct kvm_pmu *pmu, int idx)
-{
- struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx);
+ struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);
if (!pmc)
return;
@@ -303,274 +203,107 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx)
if (pmc_is_gp(pmc))
reprogram_gp_counter(pmc, pmc->eventsel);
else {
- int fidx = idx - INTEL_PMC_IDX_FIXED;
- reprogram_fixed_counter(pmc,
- fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx);
+ int idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+ u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx);
+
+ reprogram_fixed_counter(pmc, ctrl, idx);
}
}
+EXPORT_SYMBOL_GPL(reprogram_counter);
-static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ u64 bitmask;
int bit;
- u64 diff = pmu->global_ctrl ^ data;
-
- pmu->global_ctrl = data;
-
- for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
- reprogram_idx(pmu, bit);
-}
-bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- int ret;
-
- switch (msr) {
- case MSR_CORE_PERF_FIXED_CTR_CTRL:
- case MSR_CORE_PERF_GLOBAL_STATUS:
- case MSR_CORE_PERF_GLOBAL_CTRL:
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- ret = pmu->version > 1;
- break;
- default:
- ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)
- || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0)
- || get_fixed_pmc(pmu, msr);
- break;
- }
- return ret;
-}
+ bitmask = pmu->reprogram_pmi;
-int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_pmc *pmc;
+ for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
+ struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit);
- switch (index) {
- case MSR_CORE_PERF_FIXED_CTR_CTRL:
- *data = pmu->fixed_ctr_ctrl;
- return 0;
- case MSR_CORE_PERF_GLOBAL_STATUS:
- *data = pmu->global_status;
- return 0;
- case MSR_CORE_PERF_GLOBAL_CTRL:
- *data = pmu->global_ctrl;
- return 0;
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- *data = pmu->global_ovf_ctrl;
- return 0;
- default:
- if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
- (pmc = get_fixed_pmc(pmu, index))) {
- *data = read_pmc(pmc);
- return 0;
- } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
- *data = pmc->eventsel;
- return 0;
+ if (unlikely(!pmc || !pmc->perf_event)) {
+ clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
+ continue;
}
- }
- return 1;
-}
-int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_pmc *pmc;
- u32 index = msr_info->index;
- u64 data = msr_info->data;
-
- switch (index) {
- case MSR_CORE_PERF_FIXED_CTR_CTRL:
- if (pmu->fixed_ctr_ctrl == data)
- return 0;
- if (!(data & 0xfffffffffffff444ull)) {
- reprogram_fixed_counters(pmu, data);
- return 0;
- }
- break;
- case MSR_CORE_PERF_GLOBAL_STATUS:
- if (msr_info->host_initiated) {
- pmu->global_status = data;
- return 0;
- }
- break; /* RO MSR */
- case MSR_CORE_PERF_GLOBAL_CTRL:
- if (pmu->global_ctrl == data)
- return 0;
- if (!(data & pmu->global_ctrl_mask)) {
- global_ctrl_changed(pmu, data);
- return 0;
- }
- break;
- case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
- if (!msr_info->host_initiated)
- pmu->global_status &= ~data;
- pmu->global_ovf_ctrl = data;
- return 0;
- }
- break;
- default:
- if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) ||
- (pmc = get_fixed_pmc(pmu, index))) {
- if (!msr_info->host_initiated)
- data = (s64)(s32)data;
- pmc->counter += data - read_pmc(pmc);
- return 0;
- } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) {
- if (data == pmc->eventsel)
- return 0;
- if (!(data & pmu->reserved_bits)) {
- reprogram_gp_counter(pmc, data);
- return 0;
- }
- }
+ reprogram_counter(pmu, bit);
}
- return 1;
}
-int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc)
+/* check if idx is a valid index to access PMU */
+int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- bool fixed = pmc & (1u << 30);
- pmc &= ~(3u << 30);
- return (!fixed && pmc >= pmu->nr_arch_gp_counters) ||
- (fixed && pmc >= pmu->nr_arch_fixed_counters);
+ return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx);
}
-int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data)
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- bool fast_mode = pmc & (1u << 31);
- bool fixed = pmc & (1u << 30);
- struct kvm_pmc *counters;
- u64 ctr;
-
- pmc &= ~(3u << 30);
- if (!fixed && pmc >= pmu->nr_arch_gp_counters)
- return 1;
- if (fixed && pmc >= pmu->nr_arch_fixed_counters)
+ bool fast_mode = idx & (1u << 31);
+ struct kvm_pmc *pmc;
+ u64 ctr_val;
+
+ pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx);
+ if (!pmc)
return 1;
- counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
- ctr = read_pmc(&counters[pmc]);
+
+ ctr_val = pmc_read_counter(pmc);
if (fast_mode)
- ctr = (u32)ctr;
- *data = ctr;
+ ctr_val = (u32)ctr_val;
+ *data = ctr_val;
return 0;
}
-void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu)
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- struct kvm_cpuid_entry2 *entry;
- union cpuid10_eax eax;
- union cpuid10_edx edx;
-
- pmu->nr_arch_gp_counters = 0;
- pmu->nr_arch_fixed_counters = 0;
- pmu->counter_bitmask[KVM_PMC_GP] = 0;
- pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
- pmu->version = 0;
- pmu->reserved_bits = 0xffffffff00200000ull;
-
- entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
- if (!entry)
- return;
- eax.full = entry->eax;
- edx.full = entry->edx;
-
- pmu->version = eax.split.version_id;
- if (!pmu->version)
- return;
-
- pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
- INTEL_PMC_MAX_GENERIC);
- pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
- pmu->available_event_types = ~entry->ebx &
- ((1ull << eax.split.mask_length) - 1);
-
- if (pmu->version == 1) {
- pmu->nr_arch_fixed_counters = 0;
- } else {
- pmu->nr_arch_fixed_counters =
- min_t(int, edx.split.num_counters_fixed,
- INTEL_PMC_MAX_FIXED);
- pmu->counter_bitmask[KVM_PMC_FIXED] =
- ((u64)1 << edx.split.bit_width_fixed) - 1;
- }
+ if (vcpu->arch.apic)
+ kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
+}
- pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
- (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
- pmu->global_ctrl_mask = ~pmu->global_ctrl;
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr);
+}
- entry = kvm_find_cpuid_entry(vcpu, 7, 0);
- if (entry &&
- (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
- (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
- pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data);
}
-void kvm_pmu_init(struct kvm_vcpu *vcpu)
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- int i;
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
+ return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info);
+}
- memset(pmu, 0, sizeof(*pmu));
- for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
- pmu->gp_counters[i].type = KVM_PMC_GP;
- pmu->gp_counters[i].vcpu = vcpu;
- pmu->gp_counters[i].idx = i;
- }
- for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
- pmu->fixed_counters[i].type = KVM_PMC_FIXED;
- pmu->fixed_counters[i].vcpu = vcpu;
- pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
- }
- init_irq_work(&pmu->irq_work, trigger_pmi);
- kvm_pmu_cpuid_update(vcpu);
+/* refresh PMU settings. This function generally is called when underlying
+ * settings are changed (such as changes of PMU CPUID by guest VMs), which
+ * should rarely happen.
+ */
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->pmu_ops->refresh(vcpu);
}
void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- int i;
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
irq_work_sync(&pmu->irq_work);
- for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
- struct kvm_pmc *pmc = &pmu->gp_counters[i];
- stop_counter(pmc);
- pmc->counter = pmc->eventsel = 0;
- }
+ kvm_x86_ops->pmu_ops->reset(vcpu);
+}
- for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
- stop_counter(&pmu->fixed_counters[i]);
+void kvm_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
- pmu->global_ovf_ctrl = 0;
+ memset(pmu, 0, sizeof(*pmu));
+ kvm_x86_ops->pmu_ops->init(vcpu);
+ init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
+ kvm_pmu_refresh(vcpu);
}
void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
kvm_pmu_reset(vcpu);
}
-
-void kvm_handle_pmu_event(struct kvm_vcpu *vcpu)
-{
- struct kvm_pmu *pmu = &vcpu->arch.pmu;
- u64 bitmask;
- int bit;
-
- bitmask = pmu->reprogram_pmi;
-
- for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) {
- struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit);
-
- if (unlikely(!pmc || !pmc->perf_event)) {
- clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi);
- continue;
- }
-
- reprogram_idx(pmu, bit);
- }
-}
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
new file mode 100644
index 000000000000..f96e1f962587
--- /dev/null
+++ b/arch/x86/kvm/pmu.h
@@ -0,0 +1,118 @@
+#ifndef __KVM_X86_PMU_H
+#define __KVM_X86_PMU_H
+
+#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
+#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
+#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
+
+/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
+#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf)
+
+struct kvm_event_hw_type_mapping {
+ u8 eventsel;
+ u8 unit_mask;
+ unsigned event_type;
+};
+
+struct kvm_pmu_ops {
+ unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select,
+ u8 unit_mask);
+ unsigned (*find_fixed_event)(int idx);
+ bool (*pmc_is_enabled)(struct kvm_pmc *pmc);
+ struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx);
+ struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx);
+ int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx);
+ bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
+ int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+ int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+ void (*refresh)(struct kvm_vcpu *vcpu);
+ void (*init)(struct kvm_vcpu *vcpu);
+ void (*reset)(struct kvm_vcpu *vcpu);
+};
+
+static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ return pmu->counter_bitmask[pmc->type];
+}
+
+static inline u64 pmc_read_counter(struct kvm_pmc *pmc)
+{
+ u64 counter, enabled, running;
+
+ counter = pmc->counter;
+ if (pmc->perf_event)
+ counter += perf_event_read_value(pmc->perf_event,
+ &enabled, &running);
+ /* FIXME: Scaling needed? */
+ return counter & pmc_bitmask(pmc);
+}
+
+static inline void pmc_stop_counter(struct kvm_pmc *pmc)
+{
+ if (pmc->perf_event) {
+ pmc->counter = pmc_read_counter(pmc);
+ perf_event_release_kernel(pmc->perf_event);
+ pmc->perf_event = NULL;
+ }
+}
+
+static inline bool pmc_is_gp(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_GP;
+}
+
+static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
+{
+ return pmc->type == KVM_PMC_FIXED;
+}
+
+static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc);
+}
+
+/* returns general purpose PMC with the specified MSR. Note that it can be
+ * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a
+ * paramenter to tell them apart.
+ */
+static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
+ u32 base)
+{
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
+ return &pmu->gp_counters[msr - base];
+
+ return NULL;
+}
+
+/* returns fixed PMC with the specified MSR */
+static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+ int base = MSR_CORE_PERF_FIXED_CTR0;
+
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
+ return &pmu->fixed_counters[msr - base];
+
+ return NULL;
+}
+
+void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel);
+void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx);
+void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx);
+
+void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu);
+void kvm_pmu_handle_event(struct kvm_vcpu *vcpu);
+int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data);
+int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx);
+bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr);
+int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
+int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
+void kvm_pmu_refresh(struct kvm_vcpu *vcpu);
+void kvm_pmu_reset(struct kvm_vcpu *vcpu);
+void kvm_pmu_init(struct kvm_vcpu *vcpu);
+void kvm_pmu_destroy(struct kvm_vcpu *vcpu);
+
+extern struct kvm_pmu_ops intel_pmu_ops;
+extern struct kvm_pmu_ops amd_pmu_ops;
+#endif /* __KVM_X86_PMU_H */
diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c
new file mode 100644
index 000000000000..886aa25a7131
--- /dev/null
+++ b/arch/x86/kvm/pmu_amd.c
@@ -0,0 +1,207 @@
+/*
+ * KVM PMU support for AMD
+ *
+ * Copyright 2015, Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ * Wei Huang <wei@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Implementation is based on pmu_intel.c file
+ */
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+/* duplicated from amd_perfmon_event_map, K7 and above should work. */
+static struct kvm_event_hw_type_mapping amd_event_mapping[] = {
+ [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES },
+ [3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES },
+ [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
+ [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+
+static unsigned amd_find_arch_event(struct kvm_pmu *pmu,
+ u8 event_select,
+ u8 unit_mask)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
+ if (amd_event_mapping[i].eventsel == event_select
+ && amd_event_mapping[i].unit_mask == unit_mask)
+ break;
+
+ if (i == ARRAY_SIZE(amd_event_mapping))
+ return PERF_COUNT_HW_MAX;
+
+ return amd_event_mapping[i].event_type;
+}
+
+/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
+static unsigned amd_find_fixed_event(int idx)
+{
+ return PERF_COUNT_HW_MAX;
+}
+
+/* check if a PMC is enabled by comparing it against global_ctrl bits. Because
+ * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE).
+ */
+static bool amd_pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ return true;
+}
+
+static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ return get_gp_pmc(pmu, MSR_K7_EVNTSEL0 + pmc_idx, MSR_K7_EVNTSEL0);
+}
+
+/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
+static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ idx &= ~(3u << 30);
+
+ return (idx >= pmu->nr_arch_gp_counters);
+}
+
+/* idx is the ECX register of RDPMC instruction */
+static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *counters;
+
+ idx &= ~(3u << 30);
+ if (idx >= pmu->nr_arch_gp_counters)
+ return NULL;
+ counters = pmu->gp_counters;
+
+ return &counters[idx];
+}
+
+static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int ret = false;
+
+ ret = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0) ||
+ get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+
+ return ret;
+}
+
+static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ /* MSR_K7_PERFCTRn */
+ pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
+ if (pmc) {
+ *data = pmc_read_counter(pmc);
+ return 0;
+ }
+ /* MSR_K7_EVNTSELn */
+ pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+ if (pmc) {
+ *data = pmc->eventsel;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ /* MSR_K7_PERFCTRn */
+ pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0);
+ if (pmc) {
+ if (!msr_info->host_initiated)
+ data = (s64)data;
+ pmc->counter += data - pmc_read_counter(pmc);
+ return 0;
+ }
+ /* MSR_K7_EVNTSELn */
+ pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0);
+ if (pmc) {
+ if (data == pmc->eventsel)
+ return 0;
+ if (!(data & pmu->reserved_bits)) {
+ reprogram_gp_counter(pmc, data);
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
+ pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+ /* not applicable to AMD; but clean them to prevent any fall out */
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->version = 0;
+ pmu->global_status = 0;
+}
+
+static void amd_pmu_init(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ for (i = 0; i < AMD64_NUM_COUNTERS ; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ }
+}
+
+static void amd_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ for (i = 0; i < AMD64_NUM_COUNTERS; i++) {
+ struct kvm_pmc *pmc = &pmu->gp_counters[i];
+
+ pmc_stop_counter(pmc);
+ pmc->counter = pmc->eventsel = 0;
+ }
+}
+
+struct kvm_pmu_ops amd_pmu_ops = {
+ .find_arch_event = amd_find_arch_event,
+ .find_fixed_event = amd_find_fixed_event,
+ .pmc_is_enabled = amd_pmc_is_enabled,
+ .pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
+ .msr_idx_to_pmc = amd_msr_idx_to_pmc,
+ .is_valid_msr_idx = amd_is_valid_msr_idx,
+ .is_valid_msr = amd_is_valid_msr,
+ .get_msr = amd_pmu_get_msr,
+ .set_msr = amd_pmu_set_msr,
+ .refresh = amd_pmu_refresh,
+ .init = amd_pmu_init,
+ .reset = amd_pmu_reset,
+};
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
new file mode 100644
index 000000000000..ab38af4f4947
--- /dev/null
+++ b/arch/x86/kvm/pmu_intel.c
@@ -0,0 +1,358 @@
+/*
+ * KVM PMU support for Intel CPUs
+ *
+ * Copyright 2011 Red Hat, Inc. and/or its affiliates.
+ *
+ * Authors:
+ * Avi Kivity <avi@redhat.com>
+ * Gleb Natapov <gleb@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+#include <linux/perf_event.h>
+#include <asm/perf_event.h>
+#include "x86.h"
+#include "cpuid.h"
+#include "lapic.h"
+#include "pmu.h"
+
+static struct kvm_event_hw_type_mapping intel_arch_events[] = {
+ /* Index must match CPUID 0x0A.EBX bit vector */
+ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
+ [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES },
+ [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
+ [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
+};
+
+/* mapping between fixed pmc index and intel_arch_events array */
+static int fixed_pmc_events[] = {1, 0, 7};
+
+static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data)
+{
+ int i;
+
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++) {
+ u8 new_ctrl = fixed_ctrl_field(data, i);
+ u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i);
+ struct kvm_pmc *pmc;
+
+ pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i);
+
+ if (old_ctrl == new_ctrl)
+ continue;
+
+ reprogram_fixed_counter(pmc, new_ctrl, i);
+ }
+
+ pmu->fixed_ctr_ctrl = data;
+}
+
+/* function is called when global control register has been updated. */
+static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data)
+{
+ int bit;
+ u64 diff = pmu->global_ctrl ^ data;
+
+ pmu->global_ctrl = data;
+
+ for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX)
+ reprogram_counter(pmu, bit);
+}
+
+static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
+ u8 event_select,
+ u8 unit_mask)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
+ if (intel_arch_events[i].eventsel == event_select
+ && intel_arch_events[i].unit_mask == unit_mask
+ && (pmu->available_event_types & (1 << i)))
+ break;
+
+ if (i == ARRAY_SIZE(intel_arch_events))
+ return PERF_COUNT_HW_MAX;
+
+ return intel_arch_events[i].event_type;
+}
+
+static unsigned intel_find_fixed_event(int idx)
+{
+ if (idx >= ARRAY_SIZE(fixed_pmc_events))
+ return PERF_COUNT_HW_MAX;
+
+ return intel_arch_events[fixed_pmc_events[idx]].event_type;
+}
+
+/* check if a PMC is enabled by comparising it with globl_ctrl bits. */
+static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+
+ return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
+}
+
+static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx)
+{
+ if (pmc_idx < INTEL_PMC_IDX_FIXED)
+ return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx,
+ MSR_P6_EVNTSEL0);
+ else {
+ u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED;
+
+ return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0);
+ }
+}
+
+/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
+static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ bool fixed = idx & (1u << 30);
+
+ idx &= ~(3u << 30);
+
+ return (!fixed && idx >= pmu->nr_arch_gp_counters) ||
+ (fixed && idx >= pmu->nr_arch_fixed_counters);
+}
+
+static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned idx)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ bool fixed = idx & (1u << 30);
+ struct kvm_pmc *counters;
+
+ idx &= ~(3u << 30);
+ if (!fixed && idx >= pmu->nr_arch_gp_counters)
+ return NULL;
+ if (fixed && idx >= pmu->nr_arch_fixed_counters)
+ return NULL;
+ counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
+
+ return &counters[idx];
+}
+
+static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int ret;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ ret = pmu->version > 1;
+ break;
+ default:
+ ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
+ get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
+ get_fixed_pmc(pmu, msr);
+ break;
+ }
+
+ return ret;
+}
+
+static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ *data = pmu->fixed_ctr_ctrl;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ *data = pmu->global_status;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ *data = pmu->global_ctrl;
+ return 0;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ *data = pmu->global_ovf_ctrl;
+ return 0;
+ default:
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_fixed_pmc(pmu, msr))) {
+ *data = pmc_read_counter(pmc);
+ return 0;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+ *data = pmc->eventsel;
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
+
+ switch (msr) {
+ case MSR_CORE_PERF_FIXED_CTR_CTRL:
+ if (pmu->fixed_ctr_ctrl == data)
+ return 0;
+ if (!(data & 0xfffffffffffff444ull)) {
+ reprogram_fixed_counters(pmu, data);
+ return 0;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_STATUS:
+ if (msr_info->host_initiated) {
+ pmu->global_status = data;
+ return 0;
+ }
+ break; /* RO MSR */
+ case MSR_CORE_PERF_GLOBAL_CTRL:
+ if (pmu->global_ctrl == data)
+ return 0;
+ if (!(data & pmu->global_ctrl_mask)) {
+ global_ctrl_changed(pmu, data);
+ return 0;
+ }
+ break;
+ case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
+ if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) {
+ if (!msr_info->host_initiated)
+ pmu->global_status &= ~data;
+ pmu->global_ovf_ctrl = data;
+ return 0;
+ }
+ break;
+ default:
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_fixed_pmc(pmu, msr))) {
+ if (!msr_info->host_initiated)
+ data = (s64)(s32)data;
+ pmc->counter += data - pmc_read_counter(pmc);
+ return 0;
+ } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
+ if (data == pmc->eventsel)
+ return 0;
+ if (!(data & pmu->reserved_bits)) {
+ reprogram_gp_counter(pmc, data);
+ return 0;
+ }
+ }
+ }
+
+ return 1;
+}
+
+static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+ union cpuid10_eax eax;
+ union cpuid10_edx edx;
+
+ pmu->nr_arch_gp_counters = 0;
+ pmu->nr_arch_fixed_counters = 0;
+ pmu->counter_bitmask[KVM_PMC_GP] = 0;
+ pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
+ pmu->version = 0;
+ pmu->reserved_bits = 0xffffffff00200000ull;
+
+ entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
+ if (!entry)
+ return;
+ eax.full = entry->eax;
+ edx.full = entry->edx;
+
+ pmu->version = eax.split.version_id;
+ if (!pmu->version)
+ return;
+
+ pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
+ INTEL_PMC_MAX_GENERIC);
+ pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
+ pmu->available_event_types = ~entry->ebx &
+ ((1ull << eax.split.mask_length) - 1);
+
+ if (pmu->version == 1) {
+ pmu->nr_arch_fixed_counters = 0;
+ } else {
+ pmu->nr_arch_fixed_counters =
+ min_t(int, edx.split.num_counters_fixed,
+ INTEL_PMC_MAX_FIXED);
+ pmu->counter_bitmask[KVM_PMC_FIXED] =
+ ((u64)1 << edx.split.bit_width_fixed) - 1;
+ }
+
+ pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) |
+ (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
+ pmu->global_ctrl_mask = ~pmu->global_ctrl;
+
+ entry = kvm_find_cpuid_entry(vcpu, 7, 0);
+ if (entry &&
+ (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
+ (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
+ pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+}
+
+static void intel_pmu_init(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
+ for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
+ pmu->gp_counters[i].type = KVM_PMC_GP;
+ pmu->gp_counters[i].vcpu = vcpu;
+ pmu->gp_counters[i].idx = i;
+ }
+
+ for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
+ pmu->fixed_counters[i].type = KVM_PMC_FIXED;
+ pmu->fixed_counters[i].vcpu = vcpu;
+ pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
+ }
+}
+
+static void intel_pmu_reset(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ int i;
+
+ for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) {
+ struct kvm_pmc *pmc = &pmu->gp_counters[i];
+
+ pmc_stop_counter(pmc);
+ pmc->counter = pmc->eventsel = 0;
+ }
+
+ for (i = 0; i < INTEL_PMC_MAX_FIXED; i++)
+ pmc_stop_counter(&pmu->fixed_counters[i]);
+
+ pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status =
+ pmu->global_ovf_ctrl = 0;
+}
+
+struct kvm_pmu_ops intel_pmu_ops = {
+ .find_arch_event = intel_find_arch_event,
+ .find_fixed_event = intel_find_fixed_event,
+ .pmc_is_enabled = intel_pmc_is_enabled,
+ .pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
+ .msr_idx_to_pmc = intel_msr_idx_to_pmc,
+ .is_valid_msr_idx = intel_is_valid_msr_idx,
+ .is_valid_msr = intel_is_valid_msr,
+ .get_msr = intel_pmu_get_msr,
+ .set_msr = intel_pmu_set_msr,
+ .refresh = intel_pmu_refresh,
+ .init = intel_pmu_init,
+ .reset = intel_pmu_reset,
+};
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9afa233b5482..602b974a60a6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -21,6 +21,7 @@
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
+#include "pmu.h"
#include <linux/module.h>
#include <linux/mod_devicetable.h>
@@ -28,7 +29,7 @@
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/sched.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/slab.h>
#include <asm/perf_event.h>
@@ -511,8 +512,10 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm->vmcb->control.next_rip != 0)
+ if (svm->vmcb->control.next_rip != 0) {
+ WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS));
svm->next_rip = svm->vmcb->control.next_rip;
+ }
if (!svm->next_rip) {
if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
@@ -1082,7 +1085,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
return target_tsc - tsc;
}
-static void init_vmcb(struct vcpu_svm *svm)
+static void init_vmcb(struct vcpu_svm *svm, bool init_event)
{
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1153,17 +1156,17 @@ static void init_vmcb(struct vcpu_svm *svm)
init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
- svm_set_efer(&svm->vcpu, 0);
+ if (!init_event)
+ svm_set_efer(&svm->vcpu, 0);
save->dr6 = 0xffff0ff0;
kvm_set_rflags(&svm->vcpu, 2);
save->rip = 0x0000fff0;
svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
/*
- * This is the guest-visible cr0 value.
* svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
+ * It also updates the guest-visible cr0 value.
*/
- svm->vcpu.arch.cr0 = 0;
(void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
save->cr4 = X86_CR4_PAE;
@@ -1176,7 +1179,7 @@ static void init_vmcb(struct vcpu_svm *svm)
clr_exception_intercept(svm, PF_VECTOR);
clr_cr_intercept(svm, INTERCEPT_CR3_READ);
clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
- save->g_pat = 0x0007040600070406ULL;
+ save->g_pat = svm->vcpu.arch.pat;
save->cr3 = 0;
save->cr4 = 0;
}
@@ -1195,13 +1198,19 @@ static void init_vmcb(struct vcpu_svm *svm)
enable_gif(svm);
}
-static void svm_vcpu_reset(struct kvm_vcpu *vcpu)
+static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 dummy;
u32 eax = 1;
- init_vmcb(svm);
+ if (!init_event) {
+ svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
+ svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ }
+ init_vmcb(svm, init_event);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1257,12 +1266,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
clear_page(svm->vmcb);
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
- init_vmcb(svm);
-
- svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
- MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
- svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
+ init_vmcb(svm, false);
svm_init_osvw(&svm->vcpu);
@@ -1575,7 +1579,8 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
* does not do it - this results in some delay at
* reboot
*/
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
+ if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_CD_NW_CLEARED))
+ cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
mark_dirty(svm->vmcb, VMCB_CR);
update_cr0_intercept(svm);
@@ -1883,7 +1888,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
* so reinitialize it.
*/
clear_page(svm->vmcb);
- init_vmcb(svm);
+ init_vmcb(svm, false);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@@ -1953,8 +1958,8 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
u64 pdpte;
int ret;
- ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte,
- offset_in_page(cr3) + index * 8, 8);
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+ offset_in_page(cr3) + index * 8, 8);
if (ret)
return 0;
return pdpte;
@@ -2112,7 +2117,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
might_sleep();
- page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
+ page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
if (is_error_page(page))
goto error;
@@ -2151,7 +2156,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
mask = (0xf >> (4 - size)) << start_bit;
val = 0;
- if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len))
+ if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
return NESTED_EXIT_DONE;
return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
@@ -2176,7 +2181,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
/* Offset is in 32 bit units but need in 8 bit units */
offset *= 4;
- if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4))
+ if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
return NESTED_EXIT_DONE;
return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
@@ -2447,7 +2452,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
p = msrpm_offsets[i];
offset = svm->nested.vmcb_msrpm + (p * 4);
- if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4))
+ if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
return false;
svm->nested.msrpm[p] = svm->msrpm[p] | value;
@@ -3067,42 +3072,42 @@ static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
svm_scale_tsc(vcpu, host_tsc);
}
-static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
+static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_svm *svm = to_svm(vcpu);
- switch (ecx) {
+ switch (msr_info->index) {
case MSR_IA32_TSC: {
- *data = svm->vmcb->control.tsc_offset +
+ msr_info->data = svm->vmcb->control.tsc_offset +
svm_scale_tsc(vcpu, native_read_tsc());
break;
}
case MSR_STAR:
- *data = svm->vmcb->save.star;
+ msr_info->data = svm->vmcb->save.star;
break;
#ifdef CONFIG_X86_64
case MSR_LSTAR:
- *data = svm->vmcb->save.lstar;
+ msr_info->data = svm->vmcb->save.lstar;
break;
case MSR_CSTAR:
- *data = svm->vmcb->save.cstar;
+ msr_info->data = svm->vmcb->save.cstar;
break;
case MSR_KERNEL_GS_BASE:
- *data = svm->vmcb->save.kernel_gs_base;
+ msr_info->data = svm->vmcb->save.kernel_gs_base;
break;
case MSR_SYSCALL_MASK:
- *data = svm->vmcb->save.sfmask;
+ msr_info->data = svm->vmcb->save.sfmask;
break;
#endif
case MSR_IA32_SYSENTER_CS:
- *data = svm->vmcb->save.sysenter_cs;
+ msr_info->data = svm->vmcb->save.sysenter_cs;
break;
case MSR_IA32_SYSENTER_EIP:
- *data = svm->sysenter_eip;
+ msr_info->data = svm->sysenter_eip;
break;
case MSR_IA32_SYSENTER_ESP:
- *data = svm->sysenter_esp;
+ msr_info->data = svm->sysenter_esp;
break;
/*
* Nobody will change the following 5 values in the VMCB so we can
@@ -3110,31 +3115,31 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
* implemented.
*/
case MSR_IA32_DEBUGCTLMSR:
- *data = svm->vmcb->save.dbgctl;
+ msr_info->data = svm->vmcb->save.dbgctl;
break;
case MSR_IA32_LASTBRANCHFROMIP:
- *data = svm->vmcb->save.br_from;
+ msr_info->data = svm->vmcb->save.br_from;
break;
case MSR_IA32_LASTBRANCHTOIP:
- *data = svm->vmcb->save.br_to;
+ msr_info->data = svm->vmcb->save.br_to;
break;
case MSR_IA32_LASTINTFROMIP:
- *data = svm->vmcb->save.last_excp_from;
+ msr_info->data = svm->vmcb->save.last_excp_from;
break;
case MSR_IA32_LASTINTTOIP:
- *data = svm->vmcb->save.last_excp_to;
+ msr_info->data = svm->vmcb->save.last_excp_to;
break;
case MSR_VM_HSAVE_PA:
- *data = svm->nested.hsave_msr;
+ msr_info->data = svm->nested.hsave_msr;
break;
case MSR_VM_CR:
- *data = svm->nested.vm_cr_msr;
+ msr_info->data = svm->nested.vm_cr_msr;
break;
case MSR_IA32_UCODE_REV:
- *data = 0x01000065;
+ msr_info->data = 0x01000065;
break;
default:
- return kvm_get_msr_common(vcpu, ecx, data);
+ return kvm_get_msr_common(vcpu, msr_info);
}
return 0;
}
@@ -3142,16 +3147,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
static int rdmsr_interception(struct vcpu_svm *svm)
{
u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
- u64 data;
+ struct msr_data msr_info;
- if (svm_get_msr(&svm->vcpu, ecx, &data)) {
+ msr_info.index = ecx;
+ msr_info.host_initiated = false;
+ if (svm_get_msr(&svm->vcpu, &msr_info)) {
trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(&svm->vcpu, 0);
} else {
- trace_kvm_msr_read(ecx, data);
+ trace_kvm_msr_read(ecx, msr_info.data);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
+ msr_info.data & 0xffffffff);
+ kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
+ msr_info.data >> 32);
svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
skip_emulated_instruction(&svm->vcpu);
}
@@ -3388,6 +3397,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_NPF] = pf_interception,
+ [SVM_EXIT_RSM] = emulate_on_interception,
};
static void dump_vmcb(struct kvm_vcpu *vcpu)
@@ -4073,6 +4083,11 @@ static bool svm_cpu_has_accelerated_tpr(void)
return false;
}
+static bool svm_has_high_real_mode_segbase(void)
+{
+ return true;
+}
+
static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
return 0;
@@ -4317,7 +4332,9 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu,
break;
}
- vmcb->control.next_rip = info->next_rip;
+ /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
+ if (static_cpu_has(X86_FEATURE_NRIPS))
+ vmcb->control.next_rip = info->next_rip;
vmcb->control.exit_code = icpt_info.exit_code;
vmexit = nested_svm_exit_handled(svm);
@@ -4346,6 +4363,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
+ .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase,
.vcpu_create = svm_create_vcpu,
.vcpu_free = svm_free_vcpu,
@@ -4440,6 +4458,8 @@ static struct kvm_x86_ops svm_x86_ops = {
.handle_external_intr = svm_handle_external_intr,
.sched_in = svm_sched_in,
+
+ .pmu_ops = &amd_pmu_ops,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 7c7bc8bef21f..4eae7c35ddf5 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -952,6 +952,28 @@ TRACE_EVENT(kvm_wait_lapic_expire,
__entry->delta < 0 ? "early" : "late")
);
+TRACE_EVENT(kvm_enter_smm,
+ TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering),
+ TP_ARGS(vcpu_id, smbase, entering),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( u64, smbase )
+ __field( bool, entering )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->smbase = smbase;
+ __entry->entering = entering;
+ ),
+
+ TP_printk("vcpu %u: %s SMM, smbase 0x%llx",
+ __entry->vcpu_id,
+ __entry->entering ? "entering" : "leaving",
+ __entry->smbase)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2d73807f0d31..e856dd566f4c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -28,7 +28,7 @@
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/hrtimer.h>
@@ -40,14 +40,14 @@
#include <asm/vmx.h>
#include <asm/virtext.h>
#include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/xcr.h>
+#include <asm/fpu/internal.h>
#include <asm/perf_event.h>
#include <asm/debugreg.h>
#include <asm/kexec.h>
#include <asm/apic.h>
#include "trace.h"
+#include "pmu.h"
#define __ex(x) __kvm_handle_fault_on_reboot(x)
#define __ex_clear(x, reg) \
@@ -786,7 +786,7 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
{
- struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT);
+ struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT);
if (is_error_page(page))
return NULL;
@@ -1883,7 +1883,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
* If the FPU is not active (through the host task or
* the guest vcpu), then restore the cr0.TS bit.
*/
- if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded)
+ if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded)
stts();
load_gdt(this_cpu_ptr(&host_gdt));
}
@@ -2170,8 +2170,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
if (is_guest_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_nested;
- else if (irqchip_in_kernel(vcpu->kvm) &&
- apic_x2apic_mode(vcpu->arch.apic)) {
+ else if (vcpu->arch.apic_base & X2APIC_ENABLE) {
if (is_long_mode(vcpu))
msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
else
@@ -2623,76 +2622,69 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- u64 data;
struct shared_msr_entry *msr;
- if (!pdata) {
- printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
- return -EINVAL;
- }
-
- switch (msr_index) {
+ switch (msr_info->index) {
#ifdef CONFIG_X86_64
case MSR_FS_BASE:
- data = vmcs_readl(GUEST_FS_BASE);
+ msr_info->data = vmcs_readl(GUEST_FS_BASE);
break;
case MSR_GS_BASE:
- data = vmcs_readl(GUEST_GS_BASE);
+ msr_info->data = vmcs_readl(GUEST_GS_BASE);
break;
case MSR_KERNEL_GS_BASE:
vmx_load_host_state(to_vmx(vcpu));
- data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
+ msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
break;
#endif
case MSR_EFER:
- return kvm_get_msr_common(vcpu, msr_index, pdata);
+ return kvm_get_msr_common(vcpu, msr_info);
case MSR_IA32_TSC:
- data = guest_read_tsc();
+ msr_info->data = guest_read_tsc();
break;
case MSR_IA32_SYSENTER_CS:
- data = vmcs_read32(GUEST_SYSENTER_CS);
+ msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
break;
case MSR_IA32_SYSENTER_EIP:
- data = vmcs_readl(GUEST_SYSENTER_EIP);
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP);
break;
case MSR_IA32_SYSENTER_ESP:
- data = vmcs_readl(GUEST_SYSENTER_ESP);
+ msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
break;
case MSR_IA32_BNDCFGS:
if (!vmx_mpx_supported())
return 1;
- data = vmcs_read64(GUEST_BNDCFGS);
+ msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
case MSR_IA32_FEATURE_CONTROL:
if (!nested_vmx_allowed(vcpu))
return 1;
- data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+ msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
- return vmx_get_vmx_msr(vcpu, msr_index, pdata);
+ return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data);
case MSR_IA32_XSS:
if (!vmx_xsaves_supported())
return 1;
- data = vcpu->arch.ia32_xss;
+ msr_info->data = vcpu->arch.ia32_xss;
break;
case MSR_TSC_AUX:
if (!to_vmx(vcpu)->rdtscp_enabled)
return 1;
/* Otherwise falls through */
default:
- msr = find_msr_entry(to_vmx(vcpu), msr_index);
+ msr = find_msr_entry(to_vmx(vcpu), msr_info->index);
if (msr) {
- data = msr->data;
+ msr_info->data = msr->data;
break;
}
- return kvm_get_msr_common(vcpu, msr_index, pdata);
+ return kvm_get_msr_common(vcpu, msr_info);
}
- *pdata = data;
return 0;
}
@@ -4123,7 +4115,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
kvm_userspace_mem.flags = 0;
kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE;
kvm_userspace_mem.memory_size = PAGE_SIZE;
- r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
+ r = __x86_set_memory_region(kvm, &kvm_userspace_mem);
if (r)
goto out;
@@ -4158,7 +4150,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
kvm_userspace_mem.guest_phys_addr =
kvm->arch.ept_identity_map_addr;
kvm_userspace_mem.memory_size = PAGE_SIZE;
- r = __kvm_set_memory_region(kvm, &kvm_userspace_mem);
+ r = __x86_set_memory_region(kvm, &kvm_userspace_mem);
return r;
}
@@ -4667,16 +4659,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest));
- if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
- u32 msr_low, msr_high;
- u64 host_pat;
- rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
- host_pat = msr_low | ((u64) msr_high << 32);
- /* Write the default value follow host pat */
- vmcs_write64(GUEST_IA32_PAT, host_pat);
- /* Keep arch.pat sync with GUEST_IA32_PAT */
- vmx->vcpu.arch.pat = host_pat;
- }
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
u32 index = vmx_msr_index[i];
@@ -4708,22 +4692,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
return 0;
}
-static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct msr_data apic_base_msr;
+ u64 cr0;
vmx->rmode.vm86_active = 0;
vmx->soft_vnmi_blocked = 0;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
- kvm_set_cr8(&vmx->vcpu, 0);
- apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
- if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
- apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
- apic_base_msr.host_initiated = true;
- kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
+ kvm_set_cr8(vcpu, 0);
+
+ if (!init_event) {
+ apic_base_msr.data = APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
+ apic_base_msr.host_initiated = true;
+ kvm_set_apic_base(vcpu, &apic_base_msr);
+ }
vmx_segment_cache_clear(vmx);
@@ -4747,9 +4736,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
- vmcs_write32(GUEST_SYSENTER_CS, 0);
- vmcs_writel(GUEST_SYSENTER_ESP, 0);
- vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ if (!init_event) {
+ vmcs_write32(GUEST_SYSENTER_CS, 0);
+ vmcs_writel(GUEST_SYSENTER_ESP, 0);
+ vmcs_writel(GUEST_SYSENTER_EIP, 0);
+ vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
+ }
vmcs_writel(GUEST_RFLAGS, 0x02);
kvm_rip_write(vcpu, 0xfff0);
@@ -4764,18 +4756,15 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
- /* Special registers */
- vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
-
setup_msrs(vmx);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */
- if (cpu_has_vmx_tpr_shadow()) {
+ if (cpu_has_vmx_tpr_shadow() && !init_event) {
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
- if (vm_need_tpr_shadow(vmx->vcpu.kvm))
+ if (vm_need_tpr_shadow(vcpu->kvm))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- __pa(vmx->vcpu.arch.apic->regs));
+ __pa(vcpu->arch.apic->regs));
vmcs_write32(TPR_THRESHOLD, 0);
}
@@ -4787,12 +4776,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
if (vmx->vpid != 0)
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
- vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
- vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
- vmx_set_cr4(&vmx->vcpu, 0);
- vmx_set_efer(&vmx->vcpu, 0);
- vmx_fpu_activate(&vmx->vcpu);
- update_exception_bitmap(&vmx->vcpu);
+ cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
+ vmx_set_cr0(vcpu, cr0); /* enter rmode */
+ vmx->vcpu.arch.cr0 = cr0;
+ vmx_set_cr4(vcpu, 0);
+ if (!init_event)
+ vmx_set_efer(vcpu, 0);
+ vmx_fpu_activate(vcpu);
+ update_exception_bitmap(vcpu);
vpid_sync_context(vmx);
}
@@ -4965,7 +4956,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
.flags = 0,
};
- ret = kvm_set_memory_region(kvm, &tss_mem);
+ ret = x86_set_memory_region(kvm, &tss_mem);
if (ret)
return ret;
kvm->arch.tss_addr = addr;
@@ -5475,19 +5466,21 @@ static int handle_cpuid(struct kvm_vcpu *vcpu)
static int handle_rdmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
- u64 data;
+ struct msr_data msr_info;
- if (vmx_get_msr(vcpu, ecx, &data)) {
+ msr_info.index = ecx;
+ msr_info.host_initiated = false;
+ if (vmx_get_msr(vcpu, &msr_info)) {
trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(vcpu, 0);
return 1;
}
- trace_kvm_msr_read(ecx, data);
+ trace_kvm_msr_read(ecx, msr_info.data);
/* FIXME: handling of bits 32:63 of rax, rdx */
- vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
- vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
+ vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u;
+ vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u;
skip_emulated_instruction(vcpu);
return 1;
}
@@ -5710,9 +5703,6 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
return 0;
}
- /* clear all local breakpoint enable flags */
- vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155);
-
/*
* TODO: What about debug traps on tss switch?
* Are we supposed to inject them and update dr6?
@@ -7333,7 +7323,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
bitmap += (port & 0x7fff) / 8;
if (last_bitmap != bitmap)
- if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
+ if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
return true;
if (b & (1 << (port & 7)))
return true;
@@ -7377,7 +7367,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
/* Then read the msr_index'th bit from this bitmap: */
if (msr_index < 1024*8) {
unsigned char b;
- if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
+ if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
return true;
return 1 & (b >> (msr_index & 7));
} else
@@ -7642,9 +7632,9 @@ static void vmx_disable_pml(struct vcpu_vmx *vmx)
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
-static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
+static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
{
- struct kvm *kvm = vmx->vcpu.kvm;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 *pml_buf;
u16 pml_idx;
@@ -7666,7 +7656,7 @@ static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx)
gpa = pml_buf[pml_idx];
WARN_ON(gpa & (PAGE_SIZE - 1));
- mark_page_dirty(kvm, gpa >> PAGE_SHIFT);
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
}
/* reset PML index */
@@ -7691,6 +7681,158 @@ static void kvm_flush_pml_buffers(struct kvm *kvm)
kvm_vcpu_kick(vcpu);
}
+static void vmx_dump_sel(char *name, uint32_t sel)
+{
+ pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read32(sel),
+ vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
+ vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
+ vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
+}
+
+static void vmx_dump_dtsel(char *name, uint32_t limit)
+{
+ pr_err("%s limit=0x%08x, base=0x%016lx\n",
+ name, vmcs_read32(limit),
+ vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+}
+
+static void dump_vmcs(void)
+{
+ u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS);
+ u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS);
+ u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
+ u32 secondary_exec_control = 0;
+ unsigned long cr4 = vmcs_readl(GUEST_CR4);
+ u64 efer = vmcs_readl(GUEST_IA32_EFER);
+ int i, n;
+
+ if (cpu_has_secondary_exec_ctrls())
+ secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+ pr_err("*** Guest State ***\n");
+ pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
+ vmcs_readl(CR0_GUEST_HOST_MASK));
+ pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
+ cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
+ pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
+ (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
+ {
+ pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n",
+ vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1));
+ pr_err("PDPTR2 = 0x%016lx PDPTR3 = 0x%016lx\n",
+ vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3));
+ }
+ pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
+ vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
+ pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
+ vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(GUEST_SYSENTER_ESP),
+ vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
+ vmx_dump_sel("CS: ", GUEST_CS_SELECTOR);
+ vmx_dump_sel("DS: ", GUEST_DS_SELECTOR);
+ vmx_dump_sel("SS: ", GUEST_SS_SELECTOR);
+ vmx_dump_sel("ES: ", GUEST_ES_SELECTOR);
+ vmx_dump_sel("FS: ", GUEST_FS_SELECTOR);
+ vmx_dump_sel("GS: ", GUEST_GS_SELECTOR);
+ vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT);
+ vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR);
+ vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT);
+ vmx_dump_sel("TR: ", GUEST_TR_SELECTOR);
+ if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
+ (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
+ pr_err("EFER = 0x%016llx PAT = 0x%016lx\n",
+ efer, vmcs_readl(GUEST_IA32_PAT));
+ pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n",
+ vmcs_readl(GUEST_IA32_DEBUGCTL),
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
+ if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016lx\n",
+ vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL));
+ if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
+ pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS));
+ pr_err("Interruptibility = %08x ActivityState = %08x\n",
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
+ vmcs_read32(GUEST_ACTIVITY_STATE));
+ if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
+ pr_err("InterruptStatus = %04x\n",
+ vmcs_read16(GUEST_INTR_STATUS));
+
+ pr_err("*** Host State ***\n");
+ pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
+ vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
+ pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
+ vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
+ vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
+ vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
+ vmcs_read16(HOST_TR_SELECTOR));
+ pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
+ vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
+ vmcs_readl(HOST_TR_BASE));
+ pr_err("GDTBase=%016lx IDTBase=%016lx\n",
+ vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
+ pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
+ vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
+ vmcs_readl(HOST_CR4));
+ pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
+ vmcs_readl(HOST_IA32_SYSENTER_ESP),
+ vmcs_read32(HOST_IA32_SYSENTER_CS),
+ vmcs_readl(HOST_IA32_SYSENTER_EIP));
+ if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER))
+ pr_err("EFER = 0x%016lx PAT = 0x%016lx\n",
+ vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT));
+ if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+ pr_err("PerfGlobCtl = 0x%016lx\n",
+ vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL));
+
+ pr_err("*** Control State ***\n");
+ pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
+ pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control);
+ pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
+ pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
+ vmcs_read32(EXCEPTION_BITMAP),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
+ vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
+ pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
+ vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
+ vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
+ pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
+ vmcs_read32(VM_EXIT_INTR_INFO),
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
+ vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
+ pr_err(" reason=%08x qualification=%016lx\n",
+ vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
+ pr_err("IDTVectoring: info=%08x errcode=%08x\n",
+ vmcs_read32(IDT_VECTORING_INFO_FIELD),
+ vmcs_read32(IDT_VECTORING_ERROR_CODE));
+ pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
+ if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
+ pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
+ if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
+ pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
+ if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
+ pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER));
+ n = vmcs_read32(CR3_TARGET_COUNT);
+ for (i = 0; i + 1 < n; i += 4)
+ pr_err("CR3 target%u=%016lx target%u=%016lx\n",
+ i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
+ i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
+ if (i < n)
+ pr_err("CR3 target%u=%016lx\n",
+ i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
+ if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
+ pr_err("PLE Gap=%08x Window=%08x\n",
+ vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
+ if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
+ pr_err("Virtual processor ID = 0x%04x\n",
+ vmcs_read16(VIRTUAL_PROCESSOR_ID));
+}
+
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
@@ -7709,7 +7851,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
* flushed already.
*/
if (enable_pml)
- vmx_flush_pml_buffer(vmx);
+ vmx_flush_pml_buffer(vcpu);
/* If guest state is invalid, start emulating */
if (vmx->emulation_required)
@@ -7723,6 +7865,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
}
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ dump_vmcs();
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= exit_reason;
@@ -7996,6 +8139,11 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
local_irq_enable();
}
+static bool vmx_has_high_real_mode_segbase(void)
+{
+ return enable_unrestricted_guest || emulate_invalid_guest_state;
+}
+
static bool vmx_mpx_supported(void)
{
return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
@@ -8480,7 +8628,8 @@ static int get_ept_level(void)
static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
{
- u64 ret;
+ u8 cache;
+ u64 ipat = 0;
/* For VT-d and EPT combination
* 1. MMIO: always map as UC
@@ -8493,16 +8642,27 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
* 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
* consistent with host MTRR
*/
- if (is_mmio)
- ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
- else if (kvm_arch_has_noncoherent_dma(vcpu->kvm))
- ret = kvm_get_guest_memory_type(vcpu, gfn) <<
- VMX_EPT_MT_EPTE_SHIFT;
- else
- ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
- | VMX_EPT_IPAT_BIT;
+ if (is_mmio) {
+ cache = MTRR_TYPE_UNCACHABLE;
+ goto exit;
+ }
- return ret;
+ if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) {
+ ipat = VMX_EPT_IPAT_BIT;
+ cache = MTRR_TYPE_WRBACK;
+ goto exit;
+ }
+
+ if (kvm_read_cr0(vcpu) & X86_CR0_CD) {
+ ipat = VMX_EPT_IPAT_BIT;
+ cache = MTRR_TYPE_UNCACHABLE;
+ goto exit;
+ }
+
+ cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn);
+
+exit:
+ return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
}
static int vmx_get_lpage_level(void)
@@ -8924,7 +9084,7 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
struct vmx_msr_entry *e)
{
/* x2APIC MSR accesses are not allowed */
- if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8)
+ if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
return -EINVAL;
if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
e->index == MSR_IA32_UCODE_REV)
@@ -8966,8 +9126,8 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
msr.host_initiated = false;
for (i = 0; i < count; i++) {
- if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e),
- &e, sizeof(e))) {
+ if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
+ &e, sizeof(e))) {
pr_warn_ratelimited(
"%s cannot read MSR entry (%u, 0x%08llx)\n",
__func__, i, gpa + i * sizeof(e));
@@ -8999,9 +9159,10 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
struct vmx_msr_entry e;
for (i = 0; i < count; i++) {
- if (kvm_read_guest(vcpu->kvm,
- gpa + i * sizeof(e),
- &e, 2 * sizeof(u32))) {
+ struct msr_data msr_info;
+ if (kvm_vcpu_read_guest(vcpu,
+ gpa + i * sizeof(e),
+ &e, 2 * sizeof(u32))) {
pr_warn_ratelimited(
"%s cannot read MSR entry (%u, 0x%08llx)\n",
__func__, i, gpa + i * sizeof(e));
@@ -9013,19 +9174,21 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
__func__, i, e.index, e.reserved);
return -EINVAL;
}
- if (kvm_get_msr(vcpu, e.index, &e.value)) {
+ msr_info.host_initiated = false;
+ msr_info.index = e.index;
+ if (kvm_get_msr(vcpu, &msr_info)) {
pr_warn_ratelimited(
"%s cannot read MSR (%u, 0x%x)\n",
__func__, i, e.index);
return -EINVAL;
}
- if (kvm_write_guest(vcpu->kvm,
- gpa + i * sizeof(e) +
- offsetof(struct vmx_msr_entry, value),
- &e.value, sizeof(e.value))) {
+ if (kvm_vcpu_write_guest(vcpu,
+ gpa + i * sizeof(e) +
+ offsetof(struct vmx_msr_entry, value),
+ &msr_info.data, sizeof(msr_info.data))) {
pr_warn_ratelimited(
"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
- __func__, i, e.index, e.value);
+ __func__, i, e.index, msr_info.data);
return -EINVAL;
}
}
@@ -10150,6 +10313,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.hardware_enable = hardware_enable,
.hardware_disable = hardware_disable,
.cpu_has_accelerated_tpr = report_flexpriority,
+ .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase,
.vcpu_create = vmx_create_vcpu,
.vcpu_free = vmx_free_vcpu,
@@ -10255,6 +10419,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
.slot_disable_log_dirty = vmx_slot_disable_log_dirty,
.flush_log_dirty = vmx_flush_log_dirty,
.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+
+ .pmu_ops = &intel_pmu_ops,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ea306adbbc13..ac165c2fb8e5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -28,6 +28,7 @@
#include "x86.h"
#include "cpuid.h"
#include "assigned-dev.h"
+#include "pmu.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -57,11 +58,9 @@
#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
-#include <asm/mtrr.h>
#include <asm/mce.h>
-#include <asm/i387.h>
-#include <asm/fpu-internal.h> /* Ugh! */
-#include <asm/xcr.h>
+#include <linux/kernel_stat.h>
+#include <asm/fpu/internal.h> /* Ugh! */
#include <asm/pvclock.h>
#include <asm/div64.h>
@@ -99,6 +98,9 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
unsigned int min_timer_period_us = 500;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+static bool __read_mostly kvmclock_periodic_sync = true;
+module_param(kvmclock_periodic_sync, bool, S_IRUGO);
+
bool kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 kvm_max_guest_tsc_khz;
@@ -475,7 +477,7 @@ EXPORT_SYMBOL_GPL(kvm_require_dr);
/*
* This function will be used to read from the physical memory of the currently
- * running guest. The difference to kvm_read_guest_page is that this function
+ * running guest. The difference to kvm_vcpu_read_guest_page is that this function
* can read from guest physical or from the guest's guest physical memory.
*/
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
@@ -493,7 +495,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
real_gfn = gpa_to_gfn(real_gfn);
- return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
+ return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
@@ -572,8 +574,7 @@ out:
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
- X86_CR0_CD | X86_CR0_NW;
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
cr0 |= X86_CR0_ET;
@@ -619,6 +620,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
+
+ if ((cr0 ^ old_cr0) & X86_CR0_CD)
+ kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
+
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);
@@ -908,7 +913,7 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu)
u64 data;
int err;
- err = kvm_pmu_read_pmc(vcpu, ecx, &data);
+ err = kvm_pmu_rdpmc(vcpu, ecx, &data);
if (err)
return err;
kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
@@ -923,17 +928,11 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc);
*
* This list is modified at module load time to reflect the
* capabilities of the host cpu. This capabilities test skips MSRs that are
- * kvm-specific. Those are put in the beginning of the list.
+ * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs
+ * may depend on host virtualization features rather than host cpu features.
*/
-#define KVM_SAVE_MSRS_BEGIN 12
static u32 msrs_to_save[] = {
- MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
- HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
- HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
- MSR_KVM_PV_EOI_EN,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -945,14 +944,24 @@ static u32 msrs_to_save[] = {
static unsigned num_msrs_to_save;
-static const u32 emulated_msrs[] = {
+static u32 emulated_msrs[] = {
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
+ HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
+ HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+ HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
+ MSR_KVM_PV_EOI_EN,
+
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
+ MSR_IA32_SMBASE,
};
+static unsigned num_emulated_msrs;
+
bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
{
if (efer & efer_reserved_bits)
@@ -1046,6 +1055,21 @@ EXPORT_SYMBOL_GPL(kvm_set_msr);
/*
* Adapt set_msr() to msr_io()'s calling convention
*/
+static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
+{
+ struct msr_data msr;
+ int r;
+
+ msr.index = index;
+ msr.host_initiated = true;
+ r = kvm_get_msr(vcpu, &msr);
+ if (r)
+ return r;
+
+ *data = msr.data;
+ return 0;
+}
+
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
struct msr_data msr;
@@ -1698,6 +1722,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->pvclock_set_guest_stopped_request = false;
}
+ pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO;
+
/* If the host uses TSC clocksource, then it is stable */
if (use_master_clock)
pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
@@ -1768,127 +1794,14 @@ static void kvmclock_sync_fn(struct work_struct *work)
kvmclock_sync_work);
struct kvm *kvm = container_of(ka, struct kvm, arch);
+ if (!kvmclock_periodic_sync)
+ return;
+
schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
}
-static bool msr_mtrr_valid(unsigned msr)
-{
- switch (msr) {
- case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
- case MSR_MTRRfix64K_00000:
- case MSR_MTRRfix16K_80000:
- case MSR_MTRRfix16K_A0000:
- case MSR_MTRRfix4K_C0000:
- case MSR_MTRRfix4K_C8000:
- case MSR_MTRRfix4K_D0000:
- case MSR_MTRRfix4K_D8000:
- case MSR_MTRRfix4K_E0000:
- case MSR_MTRRfix4K_E8000:
- case MSR_MTRRfix4K_F0000:
- case MSR_MTRRfix4K_F8000:
- case MSR_MTRRdefType:
- case MSR_IA32_CR_PAT:
- return true;
- case 0x2f8:
- return true;
- }
- return false;
-}
-
-static bool valid_pat_type(unsigned t)
-{
- return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
-}
-
-static bool valid_mtrr_type(unsigned t)
-{
- return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
-}
-
-bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- int i;
- u64 mask;
-
- if (!msr_mtrr_valid(msr))
- return false;
-
- if (msr == MSR_IA32_CR_PAT) {
- for (i = 0; i < 8; i++)
- if (!valid_pat_type((data >> (i * 8)) & 0xff))
- return false;
- return true;
- } else if (msr == MSR_MTRRdefType) {
- if (data & ~0xcff)
- return false;
- return valid_mtrr_type(data & 0xff);
- } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
- for (i = 0; i < 8 ; i++)
- if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
- return false;
- return true;
- }
-
- /* variable MTRRs */
- WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
-
- mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
- if ((msr & 1) == 0) {
- /* MTRR base */
- if (!valid_mtrr_type(data & 0xff))
- return false;
- mask |= 0xf00;
- } else
- /* MTRR mask */
- mask |= 0x7ff;
- if (data & mask) {
- kvm_inject_gp(vcpu, 0);
- return false;
- }
-
- return true;
-}
-EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
-
-static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
-{
- u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
-
- if (!kvm_mtrr_valid(vcpu, msr, data))
- return 1;
-
- if (msr == MSR_MTRRdefType) {
- vcpu->arch.mtrr_state.def_type = data;
- vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
- } else if (msr == MSR_MTRRfix64K_00000)
- p[0] = data;
- else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
- p[1 + msr - MSR_MTRRfix16K_80000] = data;
- else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
- p[3 + msr - MSR_MTRRfix4K_C0000] = data;
- else if (msr == MSR_IA32_CR_PAT)
- vcpu->arch.pat = data;
- else { /* Variable MTRRs */
- int idx, is_mtrr_mask;
- u64 *pt;
-
- idx = (msr - 0x200) / 2;
- is_mtrr_mask = msr - 0x200 - 2 * idx;
- if (!is_mtrr_mask)
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
- else
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
- *pt = data;
- }
-
- kvm_mmu_reset_context(vcpu);
- return 0;
-}
-
static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
u64 mcg_cap = vcpu->arch.mcg_cap;
@@ -1947,7 +1860,7 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
r = PTR_ERR(page);
goto out;
}
- if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
goto out_free;
r = 0;
out_free:
@@ -2047,13 +1960,13 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
break;
}
gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
- addr = gfn_to_hva(vcpu->kvm, gfn);
+ addr = kvm_vcpu_gfn_to_hva(vcpu, gfn);
if (kvm_is_error_hva(addr))
return 1;
if (__clear_user((void __user *)addr, PAGE_SIZE))
return 1;
vcpu->arch.hv_vapic = data;
- mark_page_dirty(vcpu->kvm, gfn);
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
return 1;
break;
@@ -2180,7 +2093,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
__func__, data);
break;
case 0x200 ... 0x2ff:
- return set_msr_mtrr(vcpu, msr, data);
+ return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
return kvm_set_apic_base(vcpu, msr_info);
case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
@@ -2200,6 +2113,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
+ case MSR_IA32_SMBASE:
+ if (!msr_info->host_initiated)
+ return 1;
+ vcpu->arch.smbase = data;
+ break;
case MSR_KVM_WALL_CLOCK_NEW:
case MSR_KVM_WALL_CLOCK:
vcpu->kvm->arch.wall_clock = data;
@@ -2220,6 +2138,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
&vcpu->requests);
ka->boot_vcpu_runs_old_kvmclock = tmp;
+
+ ka->kvmclock_offset = -get_kernel_ns();
}
vcpu->arch.time = data;
@@ -2281,37 +2201,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
return set_msr_mce(vcpu, msr, data);
- /* Performance counters are not protected by a CPUID bit,
- * so we should check all of them in the generic path for the sake of
- * cross vendor migration.
- * Writing a zero into the event select MSRs disables them,
- * which we perfectly emulate ;-). Any other value should be at least
- * reported, some guests depend on them.
- */
- case MSR_K7_EVNTSEL0:
- case MSR_K7_EVNTSEL1:
- case MSR_K7_EVNTSEL2:
- case MSR_K7_EVNTSEL3:
- if (data != 0)
- vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
- break;
- /* at least RHEL 4 unconditionally writes to the perfctr registers,
- * so we ignore writes to make it happy.
- */
- case MSR_K7_PERFCTR0:
- case MSR_K7_PERFCTR1:
- case MSR_K7_PERFCTR2:
- case MSR_K7_PERFCTR3:
- vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
- "0x%x data 0x%llx\n", msr, data);
- break;
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
- pr = true;
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
- if (kvm_pmu_msr(vcpu, msr))
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ pr = true; /* fall through */
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
if (pr || data != 0)
@@ -2357,7 +2252,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
- if (kvm_pmu_msr(vcpu, msr))
+ if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
if (!ignore_msrs) {
vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
@@ -2379,48 +2274,12 @@ EXPORT_SYMBOL_GPL(kvm_set_msr_common);
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
+int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
- return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
+ return kvm_x86_ops->get_msr(vcpu, msr);
}
EXPORT_SYMBOL_GPL(kvm_get_msr);
-static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
-{
- u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
-
- if (!msr_mtrr_valid(msr))
- return 1;
-
- if (msr == MSR_MTRRdefType)
- *pdata = vcpu->arch.mtrr_state.def_type +
- (vcpu->arch.mtrr_state.enabled << 10);
- else if (msr == MSR_MTRRfix64K_00000)
- *pdata = p[0];
- else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
- *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
- else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
- *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
- else if (msr == MSR_IA32_CR_PAT)
- *pdata = vcpu->arch.pat;
- else { /* Variable MTRRs */
- int idx, is_mtrr_mask;
- u64 *pt;
-
- idx = (msr - 0x200) / 2;
- is_mtrr_mask = msr - 0x200 - 2 * idx;
- if (!is_mtrr_mask)
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
- else
- pt =
- (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
- *pdata = *pt;
- }
-
- return 0;
-}
-
static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
u64 data;
@@ -2518,11 +2377,11 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
return 0;
}
-int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 data;
- switch (msr) {
+ switch (msr_info->index) {
case MSR_IA32_PLATFORM_ID:
case MSR_IA32_EBL_CR_POWERON:
case MSR_IA32_DEBUGCTLMSR:
@@ -2533,38 +2392,28 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_K8_SYSCFG:
case MSR_K7_HWCR:
case MSR_VM_HSAVE_PA:
- case MSR_K7_EVNTSEL0:
- case MSR_K7_EVNTSEL1:
- case MSR_K7_EVNTSEL2:
- case MSR_K7_EVNTSEL3:
- case MSR_K7_PERFCTR0:
- case MSR_K7_PERFCTR1:
- case MSR_K7_PERFCTR2:
- case MSR_K7_PERFCTR3:
case MSR_K8_INT_PENDING_MSG:
case MSR_AMD64_NB_CFG:
case MSR_FAM10H_MMIO_CONF_BASE:
case MSR_AMD64_BU_CFG2:
- data = 0;
+ msr_info->data = 0;
break;
- case MSR_P6_PERFCTR0:
- case MSR_P6_PERFCTR1:
- case MSR_P6_EVNTSEL0:
- case MSR_P6_EVNTSEL1:
- if (kvm_pmu_msr(vcpu, msr))
- return kvm_pmu_get_msr(vcpu, msr, pdata);
- data = 0;
+ case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
+ case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
+ case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
+ case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+ msr_info->data = 0;
break;
case MSR_IA32_UCODE_REV:
- data = 0x100000000ULL;
+ msr_info->data = 0x100000000ULL;
break;
case MSR_MTRRcap:
- data = 0x500 | KVM_NR_VAR_MTRR;
- break;
case 0x200 ... 0x2ff:
- return get_msr_mtrr(vcpu, msr, pdata);
+ return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
case 0xcd: /* fsb frequency */
- data = 3;
+ msr_info->data = 3;
break;
/*
* MSR_EBC_FREQUENCY_ID
@@ -2578,48 +2427,53 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
* multiplying by zero otherwise.
*/
case MSR_EBC_FREQUENCY_ID:
- data = 1 << 24;
+ msr_info->data = 1 << 24;
break;
case MSR_IA32_APICBASE:
- data = kvm_get_apic_base(vcpu);
+ msr_info->data = kvm_get_apic_base(vcpu);
break;
case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
- return kvm_x2apic_msr_read(vcpu, msr, pdata);
+ return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
break;
case MSR_IA32_TSCDEADLINE:
- data = kvm_get_lapic_tscdeadline_msr(vcpu);
+ msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
break;
case MSR_IA32_TSC_ADJUST:
- data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+ msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
break;
case MSR_IA32_MISC_ENABLE:
- data = vcpu->arch.ia32_misc_enable_msr;
+ msr_info->data = vcpu->arch.ia32_misc_enable_msr;
+ break;
+ case MSR_IA32_SMBASE:
+ if (!msr_info->host_initiated)
+ return 1;
+ msr_info->data = vcpu->arch.smbase;
break;
case MSR_IA32_PERF_STATUS:
/* TSC increment by tick */
- data = 1000ULL;
+ msr_info->data = 1000ULL;
/* CPU multiplier */
data |= (((uint64_t)4ULL) << 40);
break;
case MSR_EFER:
- data = vcpu->arch.efer;
+ msr_info->data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
case MSR_KVM_WALL_CLOCK_NEW:
- data = vcpu->kvm->arch.wall_clock;
+ msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
case MSR_KVM_SYSTEM_TIME_NEW:
- data = vcpu->arch.time;
+ msr_info->data = vcpu->arch.time;
break;
case MSR_KVM_ASYNC_PF_EN:
- data = vcpu->arch.apf.msr_val;
+ msr_info->data = vcpu->arch.apf.msr_val;
break;
case MSR_KVM_STEAL_TIME:
- data = vcpu->arch.st.msr_val;
+ msr_info->data = vcpu->arch.st.msr_val;
break;
case MSR_KVM_PV_EOI_EN:
- data = vcpu->arch.pv_eoi.msr_val;
+ msr_info->data = vcpu->arch.pv_eoi.msr_val;
break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
@@ -2627,7 +2481,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- return get_msr_mce(vcpu, msr, pdata);
+ return get_msr_mce(vcpu, msr_info->index, &msr_info->data);
case MSR_K7_CLK_CTL:
/*
* Provide expected ramp-up count for K7. All other
@@ -2638,17 +2492,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
* type 6, model 8 and higher from exploding due to
* the rdmsr failing.
*/
- data = 0x20000000;
+ msr_info->data = 0x20000000;
break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
- if (kvm_hv_msr_partition_wide(msr)) {
+ if (kvm_hv_msr_partition_wide(msr_info->index)) {
int r;
mutex_lock(&vcpu->kvm->lock);
- r = get_msr_hyperv_pw(vcpu, msr, pdata);
+ r = get_msr_hyperv_pw(vcpu, msr_info->index, &msr_info->data);
mutex_unlock(&vcpu->kvm->lock);
return r;
} else
- return get_msr_hyperv(vcpu, msr, pdata);
+ return get_msr_hyperv(vcpu, msr_info->index, &msr_info->data);
break;
case MSR_IA32_BBL_CR_CTL3:
/* This legacy MSR exists but isn't fully documented in current
@@ -2661,31 +2515,30 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
* L2 cache control register 3: 64GB range, 256KB size,
* enabled, latency 0x1, configured
*/
- data = 0xbe702111;
+ msr_info->data = 0xbe702111;
break;
case MSR_AMD64_OSVW_ID_LENGTH:
if (!guest_cpuid_has_osvw(vcpu))
return 1;
- data = vcpu->arch.osvw.length;
+ msr_info->data = vcpu->arch.osvw.length;
break;
case MSR_AMD64_OSVW_STATUS:
if (!guest_cpuid_has_osvw(vcpu))
return 1;
- data = vcpu->arch.osvw.status;
+ msr_info->data = vcpu->arch.osvw.status;
break;
default:
- if (kvm_pmu_msr(vcpu, msr))
- return kvm_pmu_get_msr(vcpu, msr, pdata);
+ if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
+ return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
if (!ignore_msrs) {
- vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
+ vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index);
return 1;
} else {
- vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
- data = 0;
+ vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index);
+ msr_info->data = 0;
}
break;
}
- *pdata = data;
return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_msr_common);
@@ -2798,12 +2651,25 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
+ case KVM_CAP_ENABLE_CAP_VM:
+ case KVM_CAP_DISABLE_QUIRKS:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
#endif
r = 1;
break;
+ case KVM_CAP_X86_SMM:
+ /* SMBASE is usually relocated above 1M on modern chipsets,
+ * and SMM handlers might indeed rely on 4G segment limits,
+ * so do not report SMM to be available if real mode is
+ * emulated via vm86 mode. Still, do not go to great lengths
+ * to avoid userspace's usage of the feature, because it is a
+ * fringe case that is not enabled except via specific settings
+ * of the module parameters.
+ */
+ r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
+ break;
case KVM_CAP_COALESCED_MMIO:
r = KVM_COALESCED_MMIO_PAGE_OFFSET;
break;
@@ -2860,7 +2726,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
goto out;
n = msr_list.nmsrs;
- msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
+ msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
goto out;
r = -E2BIG;
@@ -2872,7 +2738,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
goto out;
if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
&emulated_msrs,
- ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+ num_emulated_msrs * sizeof(u32)))
goto out;
r = 0;
break;
@@ -3016,6 +2882,13 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
return 0;
}
+static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
+{
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+
+ return 0;
+}
+
static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
struct kvm_tpr_access_ctl *tac)
{
@@ -3121,8 +2994,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
events->sipi_vector = 0; /* never valid when reporting to user space */
+ events->smi.smm = is_smm(vcpu);
+ events->smi.pending = vcpu->arch.smi_pending;
+ events->smi.smm_inside_nmi =
+ !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
+ events->smi.latched_init = kvm_lapic_latched_init(vcpu);
+
events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
- | KVM_VCPUEVENT_VALID_SHADOW);
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM);
memset(&events->reserved, 0, sizeof(events->reserved));
}
@@ -3131,7 +3011,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
{
if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
| KVM_VCPUEVENT_VALID_SIPI_VECTOR
- | KVM_VCPUEVENT_VALID_SHADOW))
+ | KVM_VCPUEVENT_VALID_SHADOW
+ | KVM_VCPUEVENT_VALID_SMM))
return -EINVAL;
process_nmi(vcpu);
@@ -3156,6 +3037,24 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
kvm_vcpu_has_lapic(vcpu))
vcpu->arch.apic->sipi_vector = events->sipi_vector;
+ if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
+ if (events->smi.smm)
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ else
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ vcpu->arch.smi_pending = events->smi.pending;
+ if (events->smi.smm_inside_nmi)
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
+ if (kvm_vcpu_has_lapic(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ }
+ }
+
kvm_make_request(KVM_REQ_EVENT, vcpu);
return 0;
@@ -3194,8 +3093,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
{
- struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
- u64 xstate_bv = xsave->xsave_hdr.xstate_bv;
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
+ u64 xstate_bv = xsave->header.xfeatures;
u64 valid;
/*
@@ -3230,7 +3129,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
{
- struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
u64 valid;
@@ -3241,9 +3140,9 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
memcpy(xsave, src, XSAVE_HDR_OFFSET);
/* Set XSTATE_BV and possibly XCOMP_BV. */
- xsave->xsave_hdr.xstate_bv = xstate_bv;
+ xsave->header.xfeatures = xstate_bv;
if (cpu_has_xsaves)
- xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
+ xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
/*
* Copy each region from the non-compacted offset to the
@@ -3275,8 +3174,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
fill_xsave((u8 *) guest_xsave->region, vcpu);
} else {
memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu.state->fxsave,
- sizeof(struct i387_fxsave_struct));
+ &vcpu->arch.guest_fpu.state.fxsave,
+ sizeof(struct fxregs_state));
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
XSTATE_FPSSE;
}
@@ -3300,8 +3199,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
} else {
if (xstate_bv & ~XSTATE_FPSSE)
return -EINVAL;
- memcpy(&vcpu->arch.guest_fpu.state->fxsave,
- guest_xsave->region, sizeof(struct i387_fxsave_struct));
+ memcpy(&vcpu->arch.guest_fpu.state.fxsave,
+ guest_xsave->region, sizeof(struct fxregs_state));
}
return 0;
}
@@ -3415,6 +3314,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = kvm_vcpu_ioctl_nmi(vcpu);
break;
}
+ case KVM_SMI: {
+ r = kvm_vcpu_ioctl_smi(vcpu);
+ break;
+ }
case KVM_SET_CPUID: {
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
@@ -3454,7 +3357,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
break;
}
case KVM_GET_MSRS:
- r = msr_io(vcpu, argp, kvm_get_msr, 1);
+ r = msr_io(vcpu, argp, do_get_msr, 1);
break;
case KVM_SET_MSRS:
r = msr_io(vcpu, argp, do_set_msr, 0);
@@ -3845,6 +3748,26 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
return 0;
}
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_DISABLE_QUIRKS:
+ kvm->arch.disabled_quirks = cap->args[0];
+ r = 0;
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -4097,7 +4020,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+ break;
+ }
default:
r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
}
@@ -4110,8 +4041,7 @@ static void kvm_init_msr_list(void)
u32 dummy[2];
unsigned i, j;
- /* skip the first msrs in the list. KVM-specific */
- for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
+ for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
continue;
@@ -4136,6 +4066,22 @@ static void kvm_init_msr_list(void)
j++;
}
num_msrs_to_save = j;
+
+ for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) {
+ switch (emulated_msrs[i]) {
+ case MSR_IA32_SMBASE:
+ if (!kvm_x86_ops->cpu_has_high_real_mode_segbase())
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ if (j < i)
+ emulated_msrs[j] = emulated_msrs[i];
+ j++;
+ }
+ num_emulated_msrs = j;
}
static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
@@ -4253,8 +4199,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data,
- offset, toread);
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
+ offset, toread);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
goto out;
@@ -4287,8 +4233,8 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
offset = addr & (PAGE_SIZE-1);
if (WARN_ON(offset + bytes > PAGE_SIZE))
bytes = (unsigned)PAGE_SIZE - offset;
- ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val,
- offset, bytes);
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
+ offset, bytes);
if (unlikely(ret < 0))
return X86EMUL_IO_NEEDED;
@@ -4334,7 +4280,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
- ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
+ ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
goto out;
@@ -4387,7 +4333,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
{
int ret;
- ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
+ ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
if (ret < 0)
return 0;
kvm_mmu_pte_write(vcpu, gpa, val, bytes);
@@ -4421,7 +4367,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
void *val, int bytes)
{
- return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
+ return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
}
static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -4619,7 +4565,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
goto emul_write;
- page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+ page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
if (is_error_page(page))
goto emul_write;
@@ -4647,7 +4593,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (!exchanged)
return X86EMUL_CMPXCHG_FAILED;
- mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
+ kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
kvm_mmu_pte_write(vcpu, gpa, new, bytes);
return X86EMUL_CONTINUE;
@@ -4946,7 +4892,17 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 *pdata)
{
- return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+ struct msr_data msr;
+ int r;
+
+ msr.index = msr_index;
+ msr.host_initiated = false;
+ r = kvm_get_msr(emul_to_vcpu(ctxt), &msr);
+ if (r)
+ return r;
+
+ *pdata = msr.data;
+ return 0;
}
static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
@@ -4960,16 +4916,30 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
}
+static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ return vcpu->arch.smbase;
+}
+
+static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ vcpu->arch.smbase = smbase;
+}
+
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
- return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc);
+ return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc, u64 *pdata)
{
- return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
+ return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
}
static void emulator_halt(struct x86_emulate_ctxt *ctxt)
@@ -5045,6 +5015,8 @@ static const struct x86_emulate_ops emulate_ops = {
.cpl = emulator_get_cpl,
.get_dr = emulator_get_dr,
.set_dr = emulator_set_dr,
+ .get_smbase = emulator_get_smbase,
+ .set_smbase = emulator_set_smbase,
.set_msr = emulator_set_msr,
.get_msr = emulator_get_msr,
.check_pmc = emulator_check_pmc,
@@ -5106,7 +5078,10 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
(cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
- ctxt->guest_mode = is_guest_mode(vcpu);
+ BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
+ BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
+ BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
+ ctxt->emul_flags = vcpu->arch.hflags;
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
@@ -5275,6 +5250,34 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
static int complete_emulated_pio(struct kvm_vcpu *vcpu);
+static void kvm_smm_changed(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
+ /* This is a good place to trace that we are exiting SMM. */
+ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
+
+ if (unlikely(vcpu->arch.smi_pending)) {
+ kvm_make_request(KVM_REQ_SMI, vcpu);
+ vcpu->arch.smi_pending = 0;
+ } else {
+ /* Process a latched INIT, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ }
+ }
+
+ kvm_mmu_reset_context(vcpu);
+}
+
+static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags)
+{
+ unsigned changed = vcpu->arch.hflags ^ emul_flags;
+
+ vcpu->arch.hflags = emul_flags;
+
+ if (changed & HF_SMM_MASK)
+ kvm_smm_changed(vcpu);
+}
+
static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
unsigned long *db)
{
@@ -5474,6 +5477,8 @@ restart:
unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+ if (vcpu->arch.hflags != ctxt->emul_flags)
+ kvm_set_hflags(vcpu, ctxt->emul_flags);
kvm_rip_write(vcpu, ctxt->eip);
if (r == EMULATE_DONE)
kvm_vcpu_check_singlestep(vcpu, rflags, &r);
@@ -5952,6 +5957,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
lapic_irq.shorthand = 0;
lapic_irq.dest_mode = 0;
lapic_irq.dest_id = apicid;
+ lapic_irq.msi_redir_hint = false;
lapic_irq.delivery_mode = APIC_DM_REMRD;
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
@@ -6039,6 +6045,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
struct kvm_run *kvm_run = vcpu->run;
kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
+ kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
@@ -6162,6 +6169,233 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+#define put_smstate(type, buf, offset, val) \
+ *(type *)((buf) + (offset) - 0x7e00) = val
+
+static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+{
+ u32 flags = 0;
+ flags |= seg->g << 23;
+ flags |= seg->db << 22;
+ flags |= seg->l << 21;
+ flags |= seg->avl << 20;
+ flags |= seg->present << 15;
+ flags |= seg->dpl << 13;
+ flags |= seg->s << 12;
+ flags |= seg->type << 8;
+ return flags;
+}
+
+static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+{
+ struct kvm_segment seg;
+ int offset;
+
+ kvm_get_segment(vcpu, &seg, n);
+ put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
+
+ if (n < 3)
+ offset = 0x7f84 + n * 12;
+ else
+ offset = 0x7f2c + (n - 3) * 12;
+
+ put_smstate(u32, buf, offset + 8, seg.base);
+ put_smstate(u32, buf, offset + 4, seg.limit);
+ put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+}
+
+static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+{
+ struct kvm_segment seg;
+ int offset;
+ u16 flags;
+
+ kvm_get_segment(vcpu, &seg, n);
+ offset = 0x7e00 + n * 16;
+
+ flags = process_smi_get_segment_flags(&seg) >> 8;
+ put_smstate(u16, buf, offset, seg.selector);
+ put_smstate(u16, buf, offset + 2, flags);
+ put_smstate(u32, buf, offset + 4, seg.limit);
+ put_smstate(u64, buf, offset + 8, seg.base);
+}
+
+static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+{
+ struct desc_ptr dt;
+ struct kvm_segment seg;
+ unsigned long val;
+ int i;
+
+ put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
+ put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
+ put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
+ put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
+
+ for (i = 0; i < 8; i++)
+ put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
+
+ kvm_get_dr(vcpu, 6, &val);
+ put_smstate(u32, buf, 0x7fcc, (u32)val);
+ kvm_get_dr(vcpu, 7, &val);
+ put_smstate(u32, buf, 0x7fc8, (u32)val);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
+ put_smstate(u32, buf, 0x7fc4, seg.selector);
+ put_smstate(u32, buf, 0x7f64, seg.base);
+ put_smstate(u32, buf, 0x7f60, seg.limit);
+ put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
+ put_smstate(u32, buf, 0x7fc0, seg.selector);
+ put_smstate(u32, buf, 0x7f80, seg.base);
+ put_smstate(u32, buf, 0x7f7c, seg.limit);
+ put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7f74, dt.address);
+ put_smstate(u32, buf, 0x7f70, dt.size);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7f58, dt.address);
+ put_smstate(u32, buf, 0x7f54, dt.size);
+
+ for (i = 0; i < 6; i++)
+ process_smi_save_seg_32(vcpu, buf, i);
+
+ put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
+
+ /* revision id */
+ put_smstate(u32, buf, 0x7efc, 0x00020000);
+ put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
+}
+
+static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+{
+#ifdef CONFIG_X86_64
+ struct desc_ptr dt;
+ struct kvm_segment seg;
+ unsigned long val;
+ int i;
+
+ for (i = 0; i < 16; i++)
+ put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));
+
+ put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
+ put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));
+
+ kvm_get_dr(vcpu, 6, &val);
+ put_smstate(u64, buf, 0x7f68, val);
+ kvm_get_dr(vcpu, 7, &val);
+ put_smstate(u64, buf, 0x7f60, val);
+
+ put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
+ put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
+ put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));
+
+ put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);
+
+ /* revision id */
+ put_smstate(u32, buf, 0x7efc, 0x00020064);
+
+ put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
+ put_smstate(u16, buf, 0x7e90, seg.selector);
+ put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u32, buf, 0x7e94, seg.limit);
+ put_smstate(u64, buf, 0x7e98, seg.base);
+
+ kvm_x86_ops->get_idt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7e84, dt.size);
+ put_smstate(u64, buf, 0x7e88, dt.address);
+
+ kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
+ put_smstate(u16, buf, 0x7e70, seg.selector);
+ put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u32, buf, 0x7e74, seg.limit);
+ put_smstate(u64, buf, 0x7e78, seg.base);
+
+ kvm_x86_ops->get_gdt(vcpu, &dt);
+ put_smstate(u32, buf, 0x7e64, dt.size);
+ put_smstate(u64, buf, 0x7e68, dt.address);
+
+ for (i = 0; i < 6; i++)
+ process_smi_save_seg_64(vcpu, buf, i);
+#else
+ WARN_ON_ONCE(1);
+#endif
+}
+
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment cs, ds;
+ char buf[512];
+ u32 cr0;
+
+ if (is_smm(vcpu)) {
+ vcpu->arch.smi_pending = true;
+ return;
+ }
+
+ trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ memset(buf, 0, 512);
+ if (guest_cpuid_has_longmode(vcpu))
+ process_smi_save_state_64(vcpu, buf);
+ else
+ process_smi_save_state_32(vcpu, buf);
+
+ kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
+
+ if (kvm_x86_ops->get_nmi_mask(vcpu))
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
+ else
+ kvm_x86_ops->set_nmi_mask(vcpu, true);
+
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
+ kvm_rip_write(vcpu, 0x8000);
+
+ cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
+ kvm_x86_ops->set_cr0(vcpu, cr0);
+ vcpu->arch.cr0 = cr0;
+
+ kvm_x86_ops->set_cr4(vcpu, 0);
+
+ __kvm_set_dr(vcpu, 7, DR7_FIXED_1);
+
+ cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
+ cs.base = vcpu->arch.smbase;
+
+ ds.selector = 0;
+ ds.base = 0;
+
+ cs.limit = ds.limit = 0xffffffff;
+ cs.type = ds.type = 0x3;
+ cs.dpl = ds.dpl = 0;
+ cs.db = ds.db = 0;
+ cs.s = ds.s = 1;
+ cs.l = ds.l = 0;
+ cs.g = ds.g = 1;
+ cs.avl = ds.avl = 0;
+ cs.present = ds.present = 1;
+ cs.unusable = ds.unusable = 0;
+ cs.padding = ds.padding = 0;
+
+ kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
+ kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
+
+ if (guest_cpuid_has_longmode(vcpu))
+ kvm_x86_ops->set_efer(vcpu, 0);
+
+ kvm_update_cpuid(vcpu);
+ kvm_mmu_reset_context(vcpu);
+}
+
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
u64 eoi_exit_bitmap[4];
@@ -6270,12 +6504,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
record_steal_time(vcpu);
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
if (kvm_check_request(KVM_REQ_NMI, vcpu))
process_nmi(vcpu);
if (kvm_check_request(KVM_REQ_PMU, vcpu))
- kvm_handle_pmu_event(vcpu);
+ kvm_pmu_handle_event(vcpu);
if (kvm_check_request(KVM_REQ_PMI, vcpu))
- kvm_deliver_pmi(vcpu);
+ kvm_pmu_deliver_pmi(vcpu);
if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
vcpu_scan_ioapic(vcpu);
if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
@@ -6347,7 +6583,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
- kvm_guest_enter();
+ __kvm_guest_enter();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
@@ -6597,11 +6833,11 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
+ struct fpu *fpu = &current->thread.fpu;
int r;
sigset_t sigsaved;
- if (!tsk_used_math(current) && init_fpu(current))
- return -ENOMEM;
+ fpu__activate_curr(fpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -6971,8 +7207,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct i387_fxsave_struct *fxsave =
- &vcpu->arch.guest_fpu.state->fxsave;
+ struct fxregs_state *fxsave =
+ &vcpu->arch.guest_fpu.state.fxsave;
memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
@@ -6988,8 +7224,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
- struct i387_fxsave_struct *fxsave =
- &vcpu->arch.guest_fpu.state->fxsave;
+ struct fxregs_state *fxsave =
+ &vcpu->arch.guest_fpu.state.fxsave;
memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -7003,17 +7239,11 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
return 0;
}
-int fx_init(struct kvm_vcpu *vcpu)
+static void fx_init(struct kvm_vcpu *vcpu)
{
- int err;
-
- err = fpu_alloc(&vcpu->arch.guest_fpu);
- if (err)
- return err;
-
- fpu_finit(&vcpu->arch.guest_fpu);
+ fpstate_init(&vcpu->arch.guest_fpu.state);
if (cpu_has_xsaves)
- vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
+ vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
host_xcr0 | XSTATE_COMPACTION_ENABLED;
/*
@@ -7022,14 +7252,6 @@ int fx_init(struct kvm_vcpu *vcpu)
vcpu->arch.xcr0 = XSTATE_FP;
vcpu->arch.cr0 |= X86_CR0_ET;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(fx_init);
-
-static void fx_free(struct kvm_vcpu *vcpu)
-{
- fpu_free(&vcpu->arch.guest_fpu);
}
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
@@ -7045,7 +7267,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
kvm_put_guest_xcr0(vcpu);
vcpu->guest_fpu_loaded = 1;
__kernel_fpu_begin();
- fpu_restore_checking(&vcpu->arch.guest_fpu);
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
trace_kvm_fpu(1);
}
@@ -7053,16 +7275,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
kvm_put_guest_xcr0(vcpu);
- if (!vcpu->guest_fpu_loaded)
+ if (!vcpu->guest_fpu_loaded) {
+ vcpu->fpu_counter = 0;
return;
+ }
vcpu->guest_fpu_loaded = 0;
- fpu_save_init(&vcpu->arch.guest_fpu);
+ copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
__kernel_fpu_end();
++vcpu->stat.fpu_reload;
- if (!vcpu->arch.eager_fpu)
- kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
-
+ /*
+ * If using eager FPU mode, or if the guest is a frequent user
+ * of the FPU, just leave the FPU active for next time.
+ * Every 255 times fpu_counter rolls over to 0; a guest that uses
+ * the FPU in bursts will revert to loading it on demand.
+ */
+ if (!vcpu->arch.eager_fpu) {
+ if (++vcpu->fpu_counter < 5)
+ kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+ }
trace_kvm_fpu(0);
}
@@ -7071,7 +7302,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
kvmclock_reset(vcpu);
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
- fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
@@ -7099,14 +7329,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
{
int r;
- vcpu->arch.mtrr_state.have_fixed = 1;
+ kvm_vcpu_mtrr_init(vcpu);
r = vcpu_load(vcpu);
if (r)
return r;
- kvm_vcpu_reset(vcpu);
+ kvm_vcpu_reset(vcpu, false);
kvm_mmu_setup(vcpu);
vcpu_put(vcpu);
-
return r;
}
@@ -7123,6 +7352,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
kvm_write_tsc(vcpu, &msr);
vcpu_put(vcpu);
+ if (!kvmclock_periodic_sync)
+ return;
+
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
}
@@ -7137,12 +7369,13 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
- fx_free(vcpu);
kvm_x86_ops->vcpu_free(vcpu);
}
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
+ vcpu->arch.hflags = 0;
+
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
@@ -7168,13 +7401,16 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
- kvm_pmu_reset(vcpu);
+ if (!init_event) {
+ kvm_pmu_reset(vcpu);
+ vcpu->arch.smbase = 0x30000;
+ }
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- kvm_x86_ops->vcpu_reset(vcpu);
+ kvm_x86_ops->vcpu_reset(vcpu, init_event);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -7363,9 +7599,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
goto fail_free_mce_banks;
}
- r = fx_init(vcpu);
- if (r)
- goto fail_free_wbinvd_dirty_mask;
+ fx_init(vcpu);
vcpu->arch.ia32_tsc_adjust_msr = 0x0;
vcpu->arch.pv_time_enabled = false;
@@ -7375,12 +7609,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
return 0;
-fail_free_wbinvd_dirty_mask:
- free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+
fail_free_mce_banks:
kfree(vcpu->arch.mce_banks);
fail_free_lapic:
@@ -7482,6 +7717,40 @@ void kvm_arch_sync_events(struct kvm *kvm)
kvm_free_pit(kvm);
}
+int __x86_set_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region *mem)
+{
+ int i, r;
+
+ /* Called with kvm->slots_lock held. */
+ BUG_ON(mem->slot >= KVM_MEM_SLOTS_NUM);
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ struct kvm_userspace_memory_region m = *mem;
+
+ m.slot |= i << 16;
+ r = __kvm_set_memory_region(kvm, &m);
+ if (r < 0)
+ return r;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__x86_set_memory_region);
+
+int x86_set_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region *mem)
+{
+ int r;
+
+ mutex_lock(&kvm->slots_lock);
+ r = __x86_set_memory_region(kvm, mem);
+ mutex_unlock(&kvm->slots_lock);
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(x86_set_memory_region);
+
void kvm_arch_destroy_vm(struct kvm *kvm)
{
if (current->mm == kvm->mm) {
@@ -7493,13 +7762,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
struct kvm_userspace_memory_region mem;
memset(&mem, 0, sizeof(mem));
mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
- kvm_set_memory_region(kvm, &mem);
+ x86_set_memory_region(kvm, &mem);
mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
- kvm_set_memory_region(kvm, &mem);
+ x86_set_memory_region(kvm, &mem);
mem.slot = TSS_PRIVATE_MEMSLOT;
- kvm_set_memory_region(kvm, &mem);
+ x86_set_memory_region(kvm, &mem);
}
kvm_iommu_unmap_guest(kvm);
kfree(kvm->arch.vpic);
@@ -7588,18 +7857,18 @@ out_free:
return -ENOMEM;
}
-void kvm_arch_memslots_updated(struct kvm *kvm)
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
{
/*
* memslots->generation has been incremented.
* mmio generation may have reached its maximum value.
*/
- kvm_mmu_invalidate_mmio_sptes(kvm);
+ kvm_mmu_invalidate_mmio_sptes(kvm, slots);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
- struct kvm_userspace_memory_region *mem,
+ const struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
{
/*
@@ -7677,14 +7946,14 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
+ const struct kvm_userspace_memory_region *mem,
const struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- struct kvm_memory_slot *new;
int nr_mmu_pages = 0;
- if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
+ if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) {
int ret;
ret = vm_munmap(old->userspace_addr,
@@ -7701,9 +7970,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
- /* It's OK to get 'new' slot here as it has already been installed */
- new = id_to_memslot(kvm->memslots, mem->slot);
-
/*
* Dirty logging tracks sptes in 4k granularity, meaning that large
* sptes have to be split. If live migration is successful, the guest
@@ -7728,9 +7994,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
* been zapped so no dirty logging staff is needed for old slot. For
* KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
* new and it's also covered when dealing with the new slot.
+ *
+ * FIXME: const-ify all uses of struct kvm_memory_slot.
*/
if (change != KVM_MR_DELETE)
- kvm_mmu_slot_apply_flags(kvm, new);
+ kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f5fef1868096..edc8cdcd786b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -4,6 +4,8 @@
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
+#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
+
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.pending = false;
@@ -160,7 +162,13 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception);
+void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
+u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
+ int page_num);
#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
| XSTATE_BNDREGS | XSTATE_BNDCSR \