diff options
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r-- | arch/x86/kvm/Kconfig | 9 | ||||
-rw-r--r-- | arch/x86/kvm/Makefile | 6 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.c | 17 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 303 | ||||
-rw-r--r-- | arch/x86/kvm/ioapic.c | 9 | ||||
-rw-r--r-- | arch/x86/kvm/irq_comm.c | 14 | ||||
-rw-r--r-- | arch/x86/kvm/kvm_cache_regs.h | 5 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 59 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 15 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.c | 678 | ||||
-rw-r--r-- | arch/x86/kvm/mmu.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu_audit.c | 20 | ||||
-rw-r--r-- | arch/x86/kvm/mmutrace.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mtrr.c | 699 | ||||
-rw-r--r-- | arch/x86/kvm/paging_tmpl.h | 18 | ||||
-rw-r--r-- | arch/x86/kvm/pmu.c | 553 | ||||
-rw-r--r-- | arch/x86/kvm/pmu.h | 118 | ||||
-rw-r--r-- | arch/x86/kvm/pmu_amd.c | 207 | ||||
-rw-r--r-- | arch/x86/kvm/pmu_intel.c | 358 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 118 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 22 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 370 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 958 | ||||
-rw-r--r-- | arch/x86/kvm/x86.h | 8 |
25 files changed, 3242 insertions, 1334 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 413a7bf9efbb..d8a1d56276e1 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -86,15 +86,16 @@ config KVM_MMU_AUDIT auditing of KVM MMU events at runtime. config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support" + bool "KVM legacy PCI device assignment support (DEPRECATED)" depends on KVM && PCI && IOMMU_API - default y + default n ---help--- Provide support for legacy PCI device assignment through KVM. The kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes much of this support. + framework through VFIO, which supersedes this support and provides + better security. - If unsure, say Y. + If unsure, say N. # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 16e8f962eaad..67d215cb8953 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,10 +12,10 @@ kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o ioapic.o irq_comm.o cpuid.o pmu.o + i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o -kvm-intel-y += vmx.o -kvm-amd-y += svm.o +kvm-intel-y += vmx.o pmu_intel.o +kvm-amd-y += svm.o pmu_amd.o obj-$(CONFIG_KVM) += kvm.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 1d08ad3582d0..64dd46793099 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,14 +16,14 @@ #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> -#include <asm/i387.h> /* For use_eager_fpu. Ugh! */ -#include <asm/fpu-internal.h> /* For use_eager_fpu. Ugh! */ +#include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */ #include <asm/user.h> -#include <asm/xsave.h> +#include <asm/fpu/xstate.h> #include "cpuid.h" #include "lapic.h" #include "mmu.h" #include "trace.h" +#include "pmu.h" static u32 xstate_required_size(u64 xstate_bv, bool compacted) { @@ -97,7 +97,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - vcpu->arch.eager_fpu = guest_cpuid_has_mpx(vcpu); + vcpu->arch.eager_fpu = use_eager_fpu() || guest_cpuid_has_mpx(vcpu); /* * The existing code assumes virtual address is 48-bit in the canonical @@ -111,7 +111,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) /* Update physical-address width */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - kvm_pmu_cpuid_update(vcpu); + kvm_pmu_refresh(vcpu); return 0; } @@ -415,6 +415,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } break; } + case 6: /* Thermal management */ + entry->eax = 0x4; /* allow ARAT */ + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; case 7: { entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* Mask ebx against host capability word 9 */ @@ -591,7 +597,6 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ - case 6: /* Thermal management */ case 0xC0000002: case 0xC0000003: case 0xC0000004: diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 496b3695d3d3..dd05b9cef6ae 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -70,6 +70,14 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_longmode(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->edx & bit(X86_FEATURE_LM)); +} + static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 630bcb0d7a04..e7a4fde5d631 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -25,6 +25,7 @@ #include <linux/module.h> #include <asm/kvm_emulate.h> #include <linux/stringify.h> +#include <asm/debugreg.h> #include "x86.h" #include "tss.h" @@ -523,13 +524,9 @@ static void masked_increment(ulong *reg, ulong mask, int inc) static inline void register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc) { - ulong mask; + ulong *preg = reg_rmw(ctxt, reg); - if (ctxt->ad_bytes == sizeof(unsigned long)) - mask = ~0UL; - else - mask = ad_mask(ctxt); - masked_increment(reg_rmw(ctxt, reg), mask, inc); + assign_register(preg, *preg + inc, ctxt->ad_bytes); } static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) @@ -2262,6 +2259,260 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) return rc; } +static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ebx, ecx, edx; + + eax = 0x80000001; + ecx = 0; + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + return edx & bit(X86_FEATURE_LM); +} + +#define GET_SMSTATE(type, smbase, offset) \ + ({ \ + type __val; \ + int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val, \ + sizeof(__val), NULL); \ + if (r != X86EMUL_CONTINUE) \ + return X86EMUL_UNHANDLEABLE; \ + __val; \ + }) + +static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) +{ + desc->g = (flags >> 23) & 1; + desc->d = (flags >> 22) & 1; + desc->l = (flags >> 21) & 1; + desc->avl = (flags >> 20) & 1; + desc->p = (flags >> 15) & 1; + desc->dpl = (flags >> 13) & 3; + desc->s = (flags >> 12) & 1; + desc->type = (flags >> 8) & 15; +} + +static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +{ + struct desc_struct desc; + int offset; + u16 selector; + + selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4); + + if (n < 3) + offset = 0x7f84 + n * 12; + else + offset = 0x7f2c + (n - 3) * 12; + + set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); + return X86EMUL_CONTINUE; +} + +static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +{ + struct desc_struct desc; + int offset; + u16 selector; + u32 base3; + + offset = 0x7e00 + n * 16; + + selector = GET_SMSTATE(u16, smbase, offset); + rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); + base3 = GET_SMSTATE(u32, smbase, offset + 12); + + ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); + return X86EMUL_CONTINUE; +} + +static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, + u64 cr0, u64 cr4) +{ + int bad; + + /* + * First enable PAE, long mode needs it before CR0.PG = 1 is set. + * Then enable protected mode. However, PCID cannot be enabled + * if EFER.LMA=0, so set it separately. + */ + bad = ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); + if (bad) + return X86EMUL_UNHANDLEABLE; + + bad = ctxt->ops->set_cr(ctxt, 0, cr0); + if (bad) + return X86EMUL_UNHANDLEABLE; + + if (cr4 & X86_CR4_PCIDE) { + bad = ctxt->ops->set_cr(ctxt, 4, cr4); + if (bad) + return X86EMUL_UNHANDLEABLE; + } + + return X86EMUL_CONTINUE; +} + +static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + struct desc_struct desc; + struct desc_ptr dt; + u16 selector; + u32 val, cr0, cr4; + int i; + + cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); + ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); + ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; + ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); + + for (i = 0; i < 8; i++) + *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4); + + val = GET_SMSTATE(u32, smbase, 0x7fcc); + ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + val = GET_SMSTATE(u32, smbase, 0x7fc8); + ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + selector = GET_SMSTATE(u32, smbase, 0x7fc4); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64)); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); + + selector = GET_SMSTATE(u32, smbase, 0x7fc0); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80)); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78)); + ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); + + dt.address = GET_SMSTATE(u32, smbase, 0x7f74); + dt.size = GET_SMSTATE(u32, smbase, 0x7f70); + ctxt->ops->set_gdt(ctxt, &dt); + + dt.address = GET_SMSTATE(u32, smbase, 0x7f58); + dt.size = GET_SMSTATE(u32, smbase, 0x7f54); + ctxt->ops->set_idt(ctxt, &dt); + + for (i = 0; i < 6; i++) { + int r = rsm_load_seg_32(ctxt, smbase, i); + if (r != X86EMUL_CONTINUE) + return r; + } + + cr4 = GET_SMSTATE(u32, smbase, 0x7f14); + + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); + + return rsm_enter_protected_mode(ctxt, cr0, cr4); +} + +static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + struct desc_struct desc; + struct desc_ptr dt; + u64 val, cr0, cr4; + u32 base3; + u16 selector; + int i; + + for (i = 0; i < 16; i++) + *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8); + + ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78); + ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED; + + val = GET_SMSTATE(u32, smbase, 0x7f68); + ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); + val = GET_SMSTATE(u32, smbase, 0x7f60); + ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); + + cr0 = GET_SMSTATE(u64, smbase, 0x7f58); + ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); + cr4 = GET_SMSTATE(u64, smbase, 0x7f48); + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); + val = GET_SMSTATE(u64, smbase, 0x7ed0); + ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA); + + selector = GET_SMSTATE(u32, smbase, 0x7e90); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94)); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98)); + base3 = GET_SMSTATE(u32, smbase, 0x7e9c); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); + + dt.size = GET_SMSTATE(u32, smbase, 0x7e84); + dt.address = GET_SMSTATE(u64, smbase, 0x7e88); + ctxt->ops->set_idt(ctxt, &dt); + + selector = GET_SMSTATE(u32, smbase, 0x7e70); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74)); + set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78)); + base3 = GET_SMSTATE(u32, smbase, 0x7e7c); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); + + dt.size = GET_SMSTATE(u32, smbase, 0x7e64); + dt.address = GET_SMSTATE(u64, smbase, 0x7e68); + ctxt->ops->set_gdt(ctxt, &dt); + + for (i = 0; i < 6; i++) { + int r = rsm_load_seg_64(ctxt, smbase, i); + if (r != X86EMUL_CONTINUE) + return r; + } + + return rsm_enter_protected_mode(ctxt, cr0, cr4); +} + +static int em_rsm(struct x86_emulate_ctxt *ctxt) +{ + unsigned long cr0, cr4, efer; + u64 smbase; + int ret; + + if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0) + return emulate_ud(ctxt); + + /* + * Get back to real mode, to prepare a safe state in which to load + * CR0/CR3/CR4/EFER. Also this will ensure that addresses passed + * to read_std/write_std are not virtual. + * + * CR4.PCIDE must be zero, because it is a 64-bit mode only feature. + */ + cr0 = ctxt->ops->get_cr(ctxt, 0); + if (cr0 & X86_CR0_PE) + ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PAE) + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + efer = 0; + ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + + smbase = ctxt->ops->get_smbase(ctxt); + if (emulator_has_longmode(ctxt)) + ret = rsm_load_state_64(ctxt, smbase + 0x8000); + else + ret = rsm_load_state_32(ctxt, smbase + 0x8000); + + if (ret != X86EMUL_CONTINUE) { + /* FIXME: should triple fault */ + return X86EMUL_UNHANDLEABLE; + } + + if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + ctxt->ops->set_nmi_mask(ctxt, false); + + ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK; + ctxt->emul_flags &= ~X86EMUL_SMM_MASK; + return X86EMUL_CONTINUE; +} + static void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct desc_struct *cs, struct desc_struct *ss) @@ -2573,6 +2824,30 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, return true; } +static void string_registers_quirk(struct x86_emulate_ctxt *ctxt) +{ + /* + * Intel CPUs mask the counter and pointers in quite strange + * manner when ECX is zero due to REP-string optimizations. + */ +#ifdef CONFIG_X86_64 + if (ctxt->ad_bytes != 4 || !vendor_intel(ctxt)) + return; + + *reg_write(ctxt, VCPU_REGS_RCX) = 0; + + switch (ctxt->b) { + case 0xa4: /* movsb */ + case 0xa5: /* movsd/w */ + *reg_rmw(ctxt, VCPU_REGS_RSI) &= (u32)-1; + /* fall through */ + case 0xaa: /* stosb */ + case 0xab: /* stosd/w */ + *reg_rmw(ctxt, VCPU_REGS_RDI) &= (u32)-1; + } +#endif +} + static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, struct tss_segment_16 *tss) { @@ -2849,7 +3124,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ulong old_tss_base = ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; - ulong desc_addr; + ulong desc_addr, dr7; /* FIXME: old_tss_base == ~0 ? */ @@ -2934,6 +3209,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ret = em_push(ctxt); } + ops->get_dr(ctxt, 7, &dr7); + ops->set_dr(ctxt, 7, dr7 & ~(DR_LOCAL_ENABLE_MASK | DR_LOCAL_SLOWDOWN)); + return ret; } @@ -3840,7 +4118,7 @@ static const struct opcode group5[] = { F(DstMem | SrcNone | Lock, em_inc), F(DstMem | SrcNone | Lock, em_dec), I(SrcMem | NearBranch, em_call_near_abs), - I(SrcMemFAddr | ImplicitOps | Stack, em_call_far), + I(SrcMemFAddr | ImplicitOps, em_call_far), I(SrcMem | NearBranch, em_jmp_abs), I(SrcMemFAddr | ImplicitOps, em_jmp_far), I(SrcMem | Stack, em_push), D(Undefined), @@ -4173,7 +4451,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N, /* 0xA8 - 0xAF */ I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), - DI(ImplicitOps, rsm), + II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm), F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), @@ -4871,7 +5149,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(ctxt, &ctxt->dst); } - if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -4900,7 +5178,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -4910,6 +5188,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if (ctxt->rep_prefix && (ctxt->d & String)) { /* All REP prefixes have the same first termination condition */ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { + string_registers_quirk(ctxt); ctxt->eip = ctxt->_eip; ctxt->eflags &= ~X86_EFLAGS_RF; goto done; @@ -4953,7 +5232,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 28146f03c514..856f79105bb5 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -349,6 +349,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) irqe.delivery_mode = entry->fields.delivery_mode << 8; irqe.level = 1; irqe.shorthand = 0; + irqe.msi_redir_hint = false; if (irqe.trig_mode == IOAPIC_EDGE_TRIG) ioapic->irr_delivered |= 1 << irq; @@ -637,11 +638,9 @@ void kvm_ioapic_destroy(struct kvm *kvm) struct kvm_ioapic *ioapic = kvm->arch.vioapic; cancel_delayed_work_sync(&ioapic->eoi_inject); - if (ioapic) { - kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); - kvm->arch.vioapic = NULL; - kfree(ioapic); - } + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + kvm->arch.vioapic = NULL; + kfree(ioapic); } int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 72298b3ac025..9efff9e5b58c 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -31,6 +31,8 @@ #include "ioapic.h" +#include "lapic.h" + static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) @@ -48,11 +50,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e, line_status); } -inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq) -{ - return irq->delivery_mode == APIC_DM_LOWEST; -} - int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, unsigned long *dest_map) { @@ -60,7 +57,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_vcpu *vcpu, *lowest = NULL; if (irq->dest_mode == 0 && irq->dest_id == 0xff && - kvm_is_dm_lowest_prio(irq)) { + kvm_lowest_prio_delivery(irq)) { printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n"); irq->delivery_mode = APIC_DM_FIXED; } @@ -76,7 +73,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, irq->dest_id, irq->dest_mode)) continue; - if (!kvm_is_dm_lowest_prio(irq)) { + if (!kvm_lowest_prio_delivery(irq)) { if (r < 0) r = 0; r += kvm_apic_set_irq(vcpu, irq, dest_map); @@ -106,9 +103,10 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; irq->delivery_mode = e->msi.data & 0x700; + irq->msi_redir_hint = ((e->msi.address_lo + & MSI_ADDR_REDIRECTION_LOWPRI) > 0); irq->level = 1; irq->shorthand = 0; - /* TODO Deal with RH bit of MSI message address */ } int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 544076c4f44b..e1e89ee4af75 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -99,4 +99,9 @@ static inline bool is_guest_mode(struct kvm_vcpu *vcpu) return vcpu->arch.hflags & HF_GUEST_MASK; } +static inline bool is_smm(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.hflags & HF_SMM_MASK; +} + #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4c7deb4f78a1..36e9de1b4127 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -240,6 +240,15 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) recalculate_apic_map(apic->vcpu->kvm); } +static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id) +{ + u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); + + apic_set_reg(apic, APIC_ID, id << 24); + apic_set_reg(apic, APIC_LDR, ldr); + recalculate_apic_map(apic->vcpu->kvm); +} + static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) { return !(kvm_apic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); @@ -728,7 +737,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, dst = map->logical_map[cid]; - if (irq->delivery_mode == APIC_DM_LOWEST) { + if (kvm_lowest_prio_delivery(irq)) { int l = -1; for_each_set_bit(i, &bitmap, 16) { if (!dst[i]) @@ -799,7 +808,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_SMI: - apic_debug("Ignoring guest SMI\n"); + result = 1; + kvm_make_request(KVM_REQ_SMI, vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_NMI: @@ -914,9 +925,10 @@ static void apic_send_ipi(struct kvm_lapic *apic) irq.vector = icr_low & APIC_VECTOR_MASK; irq.delivery_mode = icr_low & APIC_MODE_MASK; irq.dest_mode = icr_low & APIC_DEST_MASK; - irq.level = icr_low & APIC_INT_ASSERT; + irq.level = (icr_low & APIC_INT_ASSERT) != 0; irq.trig_mode = icr_low & APIC_INT_LEVELTRIG; irq.shorthand = icr_low & APIC_SHORT_MASK; + irq.msi_redir_hint = false; if (apic_x2apic_mode(apic)) irq.dest_id = icr_high; else @@ -926,10 +938,11 @@ static void apic_send_ipi(struct kvm_lapic *apic) apic_debug("icr_high 0x%x, icr_low 0x%x, " "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " - "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n", + "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, " + "msi_redir_hint 0x%x\n", icr_high, icr_low, irq.shorthand, irq.dest_id, irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, - irq.vector); + irq.vector, irq.msi_redir_hint); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } @@ -1541,9 +1554,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((old_value ^ value) & X2APIC_ENABLE) { if (value & X2APIC_ENABLE) { - u32 id = kvm_apic_id(apic); - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); - kvm_apic_set_ldr(apic, ldr); + kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); } else kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); @@ -1562,7 +1573,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) } -void kvm_lapic_reset(struct kvm_vcpu *vcpu) +void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic; int i; @@ -1576,19 +1587,22 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); - kvm_apic_set_id(apic, vcpu->vcpu_id); + if (!init_event) + kvm_apic_set_id(apic, vcpu->vcpu_id); kvm_apic_set_version(apic->vcpu); for (i = 0; i < APIC_LVT_NUM; i++) apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED); apic_update_lvtt(apic); - apic_set_reg(apic, APIC_LVT0, - SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); + if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_LINT0_REENABLED)) + apic_set_reg(apic, APIC_LVT0, + SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT)); apic_set_reg(apic, APIC_DFR, 0xffffffffU); apic_set_spiv(apic, 0xff); apic_set_reg(apic, APIC_TASKPRI, 0); - kvm_apic_set_ldr(apic, 0); + if (!apic_x2apic_mode(apic)) + kvm_apic_set_ldr(apic, 0); apic_set_reg(apic, APIC_ESR, 0); apic_set_reg(apic, APIC_ICR, 0); apic_set_reg(apic, APIC_ICR2, 0); @@ -1717,7 +1731,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE); static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ - kvm_lapic_reset(vcpu); + kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); return 0; @@ -2049,11 +2063,22 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events) return; - pe = xchg(&apic->pending_events, 0); + /* + * INITs are latched while in SMM. Because an SMM CPU cannot + * be in KVM_MP_STATE_INIT_RECEIVED state, just eat SIPIs + * and delay processing of INIT until the next RSM. + */ + if (is_smm(vcpu)) { + WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED); + if (test_bit(KVM_APIC_SIPI, &apic->pending_events)) + clear_bit(KVM_APIC_SIPI, &apic->pending_events); + return; + } + pe = xchg(&apic->pending_events, 0); if (test_bit(KVM_APIC_INIT, &pe)) { - kvm_lapic_reset(vcpu); - kvm_vcpu_reset(vcpu); + kvm_lapic_reset(vcpu, true); + kvm_vcpu_reset(vcpu, true); if (kvm_vcpu_is_bsp(apic->vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 9d28383fc1e7..f2f4e10ab772 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -48,7 +48,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu); int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); void kvm_apic_accept_events(struct kvm_vcpu *vcpu); -void kvm_lapic_reset(struct kvm_vcpu *vcpu); +void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); @@ -150,7 +150,18 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm) static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu) { - return vcpu->arch.apic->pending_events; + return kvm_vcpu_has_lapic(vcpu) && vcpu->arch.apic->pending_events; +} + +static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq) +{ + return (irq->delivery_mode == APIC_DM_LOWEST || + irq->msi_redir_hint); +} + +static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu) +{ + return kvm_vcpu_has_lapic(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); } bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b73337634214..f807496b62c2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -223,15 +223,15 @@ static unsigned int get_mmio_spte_generation(u64 spte) return gen; } -static unsigned int kvm_current_mmio_generation(struct kvm *kvm) +static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) { - return kvm_memslots(kvm)->generation & MMIO_GEN_MASK; + return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; } -static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, +static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned access) { - unsigned int gen = kvm_current_mmio_generation(kvm); + unsigned int gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; @@ -258,22 +258,22 @@ static unsigned get_mmio_spte_access(u64 spte) return (spte & ~mask) & ~PAGE_MASK; } -static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, +static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) { if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(kvm, sptep, gfn, access); + mark_mmio_spte(vcpu, sptep, gfn, access); return true; } return false; } -static bool check_mmio_spte(struct kvm *kvm, u64 spte) +static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { unsigned int kvm_gen, spte_gen; - kvm_gen = kvm_current_mmio_generation(kvm); + kvm_gen = kvm_current_mmio_generation(vcpu); spte_gen = get_mmio_spte_generation(spte); trace_check_mmio_spte(spte, kvm_gen, spte_gen); @@ -804,30 +804,36 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } -static void account_shadowed(struct kvm *kvm, gfn_t gfn) +static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; + gfn_t gfn; int i; - slot = gfn_to_memslot(kvm, gfn); - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + gfn = sp->gfn; + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count += 1; } kvm->arch.indirect_shadow_pages++; } -static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) +static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; + gfn_t gfn; int i; - slot = gfn_to_memslot(kvm, gfn); - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + gfn = sp->gfn; + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count -= 1; WARN_ON(linfo->write_count < 0); @@ -835,14 +841,14 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) kvm->arch.indirect_shadow_pages--; } -static int has_wrprotected_page(struct kvm *kvm, +static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) { struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = gfn_to_memslot(kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (slot) { linfo = lpage_info_slot(gfn, slot, level); return linfo->write_count; @@ -858,8 +864,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) page_size = kvm_host_page_size(kvm, gfn); - for (i = PT_PAGE_TABLE_LEVEL; - i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { + for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { if (page_size >= KVM_HPAGE_SIZE(i)) ret = i; else @@ -875,7 +880,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, { struct kvm_memory_slot *slot; - slot = gfn_to_memslot(vcpu->kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot || slot->flags & KVM_MEMSLOT_INVALID || (no_dirty_log && slot->dirty_bitmap)) slot = NULL; @@ -900,7 +905,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) + if (has_wrprotected_page(vcpu, large_gfn, level)) break; return level - 1; @@ -1042,12 +1047,14 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, /* * Take gfn and return the reverse mapping to it. */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; - slot = gfn_to_memslot(kvm, gfn); - return __gfn_to_rmap(gfn, level, slot); + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); + return __gfn_to_rmap(gfn, sp->role.level, slot); } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1065,7 +1072,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); return pte_list_add(vcpu, spte, rmapp); } @@ -1077,7 +1084,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) sp = page_header(__pa(spte)); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(kvm, gfn, sp); pte_list_remove(spte, rmapp); } @@ -1142,6 +1149,11 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) return NULL; } +#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \ + for (_spte_ = rmap_get_first(*_rmap_, _iter_); \ + _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \ + _spte_ = rmap_get_next(_iter_)) + static void drop_spte(struct kvm *kvm, u64 *sptep) { if (mmu_spte_clear_track_bits(sptep)) @@ -1205,12 +1217,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_write_protect(kvm, sptep, pt_protect); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1232,12 +1240,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_clear_dirty(kvm, sptep); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1259,12 +1263,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_set_dirty(kvm, sptep); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1342,42 +1342,45 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static bool rmap_write_protect(struct kvm *kvm, u64 gfn) +static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; unsigned long *rmapp; int i; bool write_protected = false; - slot = gfn_to_memslot(kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, true); + write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true); } return write_protected; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_zap_rmapp(struct kvm *kvm, unsigned long *rmapp) { u64 *sptep; struct rmap_iterator iter; - int need_tlb_flush = 0; + bool flush = false; while ((sptep = rmap_get_first(*rmapp, &iter))) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); drop_spte(kvm, sptep); - need_tlb_flush = 1; + flush = true; } - return need_tlb_flush; + return flush; +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) +{ + return kvm_zap_rmapp(kvm, rmapp); } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, @@ -1394,8 +1397,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!is_shadow_present_pte(*sptep)); +restart: + for_each_rmap_spte(rmapp, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); @@ -1403,7 +1406,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, if (pte_write(*ptep)) { drop_spte(kvm, sptep); - sptep = rmap_get_first(*rmapp, &iter); + goto restart; } else { new_spte = *sptep & ~PT64_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; @@ -1414,7 +1417,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); - sptep = rmap_get_next(&iter); } } @@ -1424,6 +1426,74 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, return 0; } +struct slot_rmap_walk_iterator { + /* input fields. */ + struct kvm_memory_slot *slot; + gfn_t start_gfn; + gfn_t end_gfn; + int start_level; + int end_level; + + /* output fields. */ + gfn_t gfn; + unsigned long *rmap; + int level; + + /* private field. */ + unsigned long *end_rmap; +}; + +static void +rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) +{ + iterator->level = level; + iterator->gfn = iterator->start_gfn; + iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); + iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, + iterator->slot); +} + +static void +slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, + struct kvm_memory_slot *slot, int start_level, + int end_level, gfn_t start_gfn, gfn_t end_gfn) +{ + iterator->slot = slot; + iterator->start_level = start_level; + iterator->end_level = end_level; + iterator->start_gfn = start_gfn; + iterator->end_gfn = end_gfn; + + rmap_walk_init_level(iterator, iterator->start_level); +} + +static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) +{ + return !!iterator->rmap; +} + +static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) +{ + if (++iterator->rmap <= iterator->end_rmap) { + iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); + return; + } + + if (++iterator->level > iterator->end_level) { + iterator->rmap = NULL; + return; + } + + rmap_walk_init_level(iterator, iterator->level); +} + +#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \ + _start_gfn, _end_gfn, _iter_) \ + for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \ + _end_level_, _start_gfn, _end_gfn); \ + slot_rmap_walk_okay(_iter_); \ + slot_rmap_walk_next(_iter_)) + static int kvm_handle_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, @@ -1435,48 +1505,36 @@ static int kvm_handle_hva_range(struct kvm *kvm, int level, unsigned long data)) { - int j; - int ret = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + struct slot_rmap_walk_iterator iterator; + int ret = 0; + int i; - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn_start, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn_start = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for (j = PT_PAGE_TABLE_LEVEL; - j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { - unsigned long idx, idx_end; - unsigned long *rmapp; - gfn_t gfn = gfn_start; + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn_start, gfn_end; + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; /* - * {idx(page_j) | page_j intersects with - * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}. + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. */ - idx = gfn_to_index(gfn_start, memslot->base_gfn, j); - idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j); - - rmapp = __gfn_to_rmap(gfn_start, j, memslot); - - for (; idx <= idx_end; - ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j))) - ret |= handler(kvm, rmapp++, memslot, - gfn, j, data); + gfn_start = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, + gfn_start, gfn_end - 1, + &iterator) + ret |= handler(kvm, iterator.rmap, memslot, + iterator.gfn, iterator.level, data); } } @@ -1518,16 +1576,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, BUG_ON(!shadow_accessed_mask); - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { - BUG_ON(!is_shadow_present_pte(*sptep)); - + for_each_rmap_spte(rmapp, &iter, sptep) if (*sptep & shadow_accessed_mask) { young = 1; clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } - } + trace_kvm_age_page(gfn, level, slot, young); return young; } @@ -1548,15 +1603,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { - BUG_ON(!is_shadow_present_pte(*sptep)); - + for_each_rmap_spte(rmapp, &iter, sptep) if (*sptep & shadow_accessed_mask) { young = 1; break; } - } out: return young; } @@ -1570,7 +1621,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); @@ -1990,7 +2041,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, bool protected = false; for_each_sp(pages, sp, parents, i) - protected |= rmap_write_protect(vcpu->kvm, sp->gfn); + protected |= rmap_write_protect(vcpu, sp->gfn); if (protected) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2088,12 +2139,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_add_head(&sp->hash_link, &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { - if (rmap_write_protect(vcpu->kvm, gfn)) + if (rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); if (level > PT_PAGE_TABLE_LEVEL && need_sync) kvm_sync_pages(vcpu, gfn); - account_shadowed(vcpu->kvm, gfn); + account_shadowed(vcpu->kvm, sp); } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; init_shadow_page_table(sp); @@ -2274,7 +2325,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_mmu_unlink_parents(kvm, sp); if (!sp->role.invalid && !sp->role.direct) - unaccount_shadowed(kvm, sp->gfn); + unaccount_shadowed(kvm, sp); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); @@ -2386,111 +2437,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); -/* - * The function is based on mtrr_type_lookup() in - * arch/x86/kernel/cpu/mtrr/generic.c - */ -static int get_mtrr_type(struct mtrr_state_type *mtrr_state, - u64 start, u64 end) -{ - int i; - u64 base, mask; - u8 prev_match, curr_match; - int num_var_ranges = KVM_NR_VAR_MTRR; - - if (!mtrr_state->enabled) - return 0xFF; - - /* Make end inclusive end, instead of exclusive */ - end--; - - /* Look in fixed ranges. Just return the type as per start */ - if (mtrr_state->have_fixed && (start < 0x100000)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state->fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state->fixed_ranges[idx]; - } else if (start < 0x1000000) { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state->fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ - if (!(mtrr_state->enabled & 2)) - return mtrr_state->def_type; - - prev_match = 0xFF; - for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; - - if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) - continue; - - base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + - (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); - mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + - (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); - - start_state = ((start & mask) == (base & mask)); - end_state = ((end & mask) == (base & mask)); - if (start_state != end_state) - return 0xFE; - - if ((start & mask) != (base & mask)) - continue; - - curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; - if (prev_match == 0xFF) { - prev_match = curr_match; - continue; - } - - if (prev_match == MTRR_TYPE_UNCACHABLE || - curr_match == MTRR_TYPE_UNCACHABLE) - return MTRR_TYPE_UNCACHABLE; - - if ((prev_match == MTRR_TYPE_WRBACK && - curr_match == MTRR_TYPE_WRTHROUGH) || - (prev_match == MTRR_TYPE_WRTHROUGH && - curr_match == MTRR_TYPE_WRBACK)) { - prev_match = MTRR_TYPE_WRTHROUGH; - curr_match = MTRR_TYPE_WRTHROUGH; - } - - if (prev_match != curr_match) - return MTRR_TYPE_UNCACHABLE; - } - - if (prev_match != 0xFF) - return prev_match; - - return mtrr_state->def_type; -} - -u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u8 mtrr; - - mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, - (gfn << PAGE_SHIFT) + PAGE_SIZE); - if (mtrr == 0xfe || mtrr == 0xff) - mtrr = MTRR_TYPE_WRBACK; - return mtrr; -} -EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); - static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); @@ -2541,7 +2487,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte; int ret = 0; - if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) + if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; spte = PT_PRESENT_MASK; @@ -2578,7 +2524,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * be fixed if guest refault. */ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu->kvm, gfn, level)) + has_wrprotected_page(vcpu, gfn, level)) goto done; spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; @@ -2602,7 +2548,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } if (pte_access & ACC_WRITE_MASK) { - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); spte |= shadow_dirty_mask; } @@ -2692,15 +2638,17 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, u64 *start, u64 *end) { struct page *pages[PTE_PREFETCH_NUM]; + struct kvm_memory_slot *slot; unsigned access = sp->role.access; int i, ret; gfn_t gfn; gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); - if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); + if (!slot) return -1; - ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); + ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); if (ret <= 0) return -1; @@ -2818,7 +2766,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) return 1; if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); + kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); return 0; } @@ -2841,7 +2789,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && - !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { + !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; /* * mmu_notifier_retry was successful and we hold the @@ -2933,7 +2881,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * Compare with set_spte where instead shadow_dirty_mask is set. */ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); return true; } @@ -3388,7 +3336,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); - if (!check_mmio_spte(vcpu->kvm, spte)) + if (!check_mmio_spte(vcpu, spte)) return RET_MMIO_PF_INVALID; if (direct) @@ -3460,7 +3408,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu.direct_map; arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); + return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) @@ -3475,10 +3423,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu) static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable) { + struct kvm_memory_slot *slot; bool async; - *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); - + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + async = false; + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); if (!async) return false; /* *pfn has correct page already */ @@ -3492,11 +3442,20 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return true; } - *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); - + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); return false; } +static bool +check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +{ + int page_num = KVM_PAGES_PER_HPAGE(level); + + gfn &= ~(page_num - 1); + + return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); +} + static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, bool prefault) { @@ -3522,9 +3481,17 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + if (mapping_level_dirty_bitmap(vcpu, gfn) || + !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL)) + force_pt_level = 1; + else + force_pt_level = 0; + if (likely(!force_pt_level)) { level = mapping_level(vcpu, gfn); + if (level > PT_DIRECTORY_LEVEL && + !check_hugepage_cache_consistency(vcpu, gfn, level)) + level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); } else level = PT_PAGE_TABLE_LEVEL; @@ -3590,7 +3557,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, vcpu->arch.mmu.inject_page_fault(vcpu, fault); } -static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, +static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { if (unlikely(is_mmio_spte(*sptep))) { @@ -3600,7 +3567,7 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, } (*nr_present)++; - mark_mmio_spte(kvm, sptep, gfn, access); + mark_mmio_spte(vcpu, sptep, gfn, access); return true; } @@ -3878,6 +3845,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) struct kvm_mmu *context = &vcpu->arch.mmu; context->base_role.word = 0; + context->base_role.smm = is_smm(vcpu); context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -3939,6 +3907,7 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) = smep && !is_write_protection(vcpu); context->base_role.smap_andnot_wp = smap && !is_write_protection(vcpu); + context->base_role.smm = is_smm(vcpu); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); @@ -4110,7 +4079,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; - r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8); + r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8); if (r) gentry = 0; new = (const u8 *)&gentry; @@ -4222,6 +4191,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.nxe = 1; mask.smep_andnot_wp = 1; mask.smap_andnot_wp = 1; + mask.smm = 1; /* * If we don't have indirect shadow pages, it means no page is @@ -4420,36 +4390,115 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot) +/* The return value indicates if tlb flush on all vcpus is needed. */ +typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap); + +/* The caller should hold mmu-lock before calling this function. */ +static bool +slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) { - gfn_t last_gfn; - int i; + struct slot_rmap_walk_iterator iterator; bool flush = false; - last_gfn = memslot->base_gfn + memslot->npages - 1; + for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap); - spin_lock(&kvm->mmu_lock); + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + cond_resched_lock(&kvm->mmu_lock); + } + } - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); + return flush; +} - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_write_protect(kvm, rmapp, - false); +static bool +slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + bool lock_flush_tlb) +{ + return slot_handle_level_range(kvm, memslot, fn, start_level, + end_level, memslot->base_gfn, + memslot->base_gfn + memslot->npages - 1, + lock_flush_tlb); +} + +static bool +slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static bool +slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static bool +slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_PAGE_TABLE_LEVEL, lock_flush_tlb); +} - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int i; + + spin_lock(&kvm->mmu_lock); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + gfn_t start, end; + + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; + + slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + start, end - 1, true); } } spin_unlock(&kvm->mmu_lock); +} + +static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp) +{ + return __rmap_write_protect(kvm, rmapp, false); +} + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + bool flush; + + spin_lock(&kvm->mmu_lock); + flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect, + false); + spin_unlock(&kvm->mmu_lock); /* * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log() @@ -4482,9 +4531,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, pfn_t pfn; struct kvm_mmu_page *sp; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - +restart: + for_each_rmap_spte(rmapp, &iter, sptep) { sp = page_header(__pa(sptep)); pfn = spte_to_pfn(*sptep); @@ -4499,71 +4547,31 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, !kvm_is_reserved_pfn(pfn) && PageTransCompound(pfn_to_page(pfn))) { drop_spte(kvm, sptep); - sptep = rmap_get_first(*rmapp, &iter); need_tlb_flush = 1; - } else - sptep = rmap_get_next(&iter); + goto restart; + } } return need_tlb_flush; } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { - bool flush = false; - unsigned long *rmapp; - unsigned long last_index, index; - + /* FIXME: const-ify all uses of struct kvm_memory_slot. */ spin_lock(&kvm->mmu_lock); - - rmapp = memslot->arch.rmap[0]; - last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1, - memslot->base_gfn, PT_PAGE_TABLE_LEVEL); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - if (flush) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } - cond_resched_lock(&kvm->mmu_lock); - } - } - - if (flush) - kvm_flush_remote_tlbs(kvm); - + slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, + kvm_mmu_zap_collapsible_spte, true); spin_unlock(&kvm->mmu_lock); } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - unsigned long *rmapp; - unsigned long last_index, index; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, - PT_PAGE_TABLE_LEVEL); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_clear_dirty(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - + flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); spin_unlock(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); @@ -4582,31 +4590,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - int i; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */ - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; - - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_write_protect(kvm, rmapp, - false); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - } + flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, + false); spin_unlock(&kvm->mmu_lock); /* see kvm_mmu_slot_remove_write_access */ @@ -4620,31 +4608,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); void kvm_mmu_slot_set_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - int i; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; - - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_set_dirty(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - } - + flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); spin_unlock(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); @@ -4741,13 +4708,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) { /* * The very rare case: if the generation-number is round, * zap all shadow pages. */ - if (unlikely(kvm_current_mmio_generation(kvm) == 0)) { + if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } @@ -4869,15 +4836,18 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) unsigned int nr_pages = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + int i; - slots = kvm_memslots(kvm); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) - nr_pages += memslot->npages; + kvm_for_each_memslot(memslot, slots) + nr_pages += memslot->npages; + } nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); return nr_mmu_pages; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 0ada65ecddcf..398d21c0f6dd 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -43,6 +43,7 @@ #define PT_PDPE_LEVEL 3 #define PT_DIRECTORY_LEVEL 2 #define PT_PAGE_TABLE_LEVEL 1 +#define PT_MAX_HUGEPAGE_LEVEL (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES - 1) static inline u64 rsvd_bits(int s, int e) { @@ -170,4 +171,5 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, } void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); #endif diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 9ade5cfb5a4c..a4f62e6f2db2 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -114,7 +114,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) return; gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); - pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + pfn = kvm_vcpu_gfn_to_pfn_atomic(vcpu, gfn); if (is_error_pfn(pfn)) return; @@ -131,12 +131,16 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); unsigned long *rmapp; struct kvm_mmu_page *rev_sp; + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; gfn_t gfn; rev_sp = page_header(__pa(sptep)); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - if (!gfn_to_memslot(kvm, gfn)) { + slots = kvm_memslots_for_spte_role(kvm, rev_sp->role); + slot = __gfn_to_memslot(slots, gfn); + if (!slot) { if (!__ratelimit(&ratelimit_state)) return; audit_printk(kvm, "no memslot for gfn %llx\n", gfn); @@ -146,7 +150,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); + rmapp = __gfn_to_rmap(gfn, rev_sp->role.level, slot); if (!*rmapp) { if (!__ratelimit(&ratelimit_state)) return; @@ -191,19 +195,21 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) unsigned long *rmapp; u64 *sptep; struct rmap_iterator iter; + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; if (sp->role.direct || sp->unsync || sp->role.invalid) return; - rmapp = gfn_to_rmap(kvm, sp->gfn, PT_PAGE_TABLE_LEVEL); + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, sp->gfn); + rmapp = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot); - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { + for_each_rmap_spte(rmapp, &iter, sptep) if (is_writable_pte(*sptep)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); - } } static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index ce463a9cc8fb..5a24b846a1cb 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -2,7 +2,7 @@ #define _TRACE_KVMMMU_H #include <linux/tracepoint.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvmmmu diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c new file mode 100644 index 000000000000..de1d2d8062e2 --- /dev/null +++ b/arch/x86/kvm/mtrr.c @@ -0,0 +1,699 @@ +/* + * vMTRR implementation + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * Copyright(C) 2015 Intel Corporation. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * Marcelo Tosatti <mtosatti@redhat.com> + * Paolo Bonzini <pbonzini@redhat.com> + * Xiao Guangrong <guangrong.xiao@linux.intel.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/kvm_host.h> +#include <asm/mtrr.h> + +#include "cpuid.h" +#include "mmu.h" + +#define IA32_MTRR_DEF_TYPE_E (1ULL << 11) +#define IA32_MTRR_DEF_TYPE_FE (1ULL << 10) +#define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff) + +static bool msr_mtrr_valid(unsigned msr) +{ + switch (msr) { + case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MSR_MTRRfix64K_00000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_CR_PAT: + return true; + case 0x2f8: + return true; + } + return false; +} + +static bool valid_pat_type(unsigned t) +{ + return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ +} + +static bool valid_mtrr_type(unsigned t) +{ + return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ +} + +bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int i; + u64 mask; + + if (!msr_mtrr_valid(msr)) + return false; + + if (msr == MSR_IA32_CR_PAT) { + for (i = 0; i < 8; i++) + if (!valid_pat_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } else if (msr == MSR_MTRRdefType) { + if (data & ~0xcff) + return false; + return valid_mtrr_type(data & 0xff); + } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { + for (i = 0; i < 8 ; i++) + if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) + return false; + return true; + } + + /* variable MTRRs */ + WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); + + mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + if ((msr & 1) == 0) { + /* MTRR base */ + if (!valid_mtrr_type(data & 0xff)) + return false; + mask |= 0xf00; + } else + /* MTRR mask */ + mask |= 0x7ff; + if (data & mask) { + kvm_inject_gp(vcpu, 0); + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(kvm_mtrr_valid); + +static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state) +{ + return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_E); +} + +static bool fixed_mtrr_is_enabled(struct kvm_mtrr *mtrr_state) +{ + return !!(mtrr_state->deftype & IA32_MTRR_DEF_TYPE_FE); +} + +static u8 mtrr_default_type(struct kvm_mtrr *mtrr_state) +{ + return mtrr_state->deftype & IA32_MTRR_DEF_TYPE_TYPE_MASK; +} + +/* +* Three terms are used in the following code: +* - segment, it indicates the address segments covered by fixed MTRRs. +* - unit, it corresponds to the MSR entry in the segment. +* - range, a range is covered in one memory cache type. +*/ +struct fixed_mtrr_segment { + u64 start; + u64 end; + + int range_shift; + + /* the start position in kvm_mtrr.fixed_ranges[]. */ + int range_start; +}; + +static struct fixed_mtrr_segment fixed_seg_table[] = { + /* MSR_MTRRfix64K_00000, 1 unit. 64K fixed mtrr. */ + { + .start = 0x0, + .end = 0x80000, + .range_shift = 16, /* 64K */ + .range_start = 0, + }, + + /* + * MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000, 2 units, + * 16K fixed mtrr. + */ + { + .start = 0x80000, + .end = 0xc0000, + .range_shift = 14, /* 16K */ + .range_start = 8, + }, + + /* + * MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000, 8 units, + * 4K fixed mtrr. + */ + { + .start = 0xc0000, + .end = 0x100000, + .range_shift = 12, /* 12K */ + .range_start = 24, + } +}; + +/* + * The size of unit is covered in one MSR, one MSR entry contains + * 8 ranges so that unit size is always 8 * 2^range_shift. + */ +static u64 fixed_mtrr_seg_unit_size(int seg) +{ + return 8 << fixed_seg_table[seg].range_shift; +} + +static bool fixed_msr_to_seg_unit(u32 msr, int *seg, int *unit) +{ + switch (msr) { + case MSR_MTRRfix64K_00000: + *seg = 0; + *unit = 0; + break; + case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000: + *seg = 1; + *unit = msr - MSR_MTRRfix16K_80000; + break; + case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000: + *seg = 2; + *unit = msr - MSR_MTRRfix4K_C0000; + break; + default: + return false; + } + + return true; +} + +static void fixed_mtrr_seg_unit_range(int seg, int unit, u64 *start, u64 *end) +{ + struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; + u64 unit_size = fixed_mtrr_seg_unit_size(seg); + + *start = mtrr_seg->start + unit * unit_size; + *end = *start + unit_size; + WARN_ON(*end > mtrr_seg->end); +} + +static int fixed_mtrr_seg_unit_range_index(int seg, int unit) +{ + struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; + + WARN_ON(mtrr_seg->start + unit * fixed_mtrr_seg_unit_size(seg) + > mtrr_seg->end); + + /* each unit has 8 ranges. */ + return mtrr_seg->range_start + 8 * unit; +} + +static int fixed_mtrr_seg_end_range_index(int seg) +{ + struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; + int n; + + n = (mtrr_seg->end - mtrr_seg->start) >> mtrr_seg->range_shift; + return mtrr_seg->range_start + n - 1; +} + +static bool fixed_msr_to_range(u32 msr, u64 *start, u64 *end) +{ + int seg, unit; + + if (!fixed_msr_to_seg_unit(msr, &seg, &unit)) + return false; + + fixed_mtrr_seg_unit_range(seg, unit, start, end); + return true; +} + +static int fixed_msr_to_range_index(u32 msr) +{ + int seg, unit; + + if (!fixed_msr_to_seg_unit(msr, &seg, &unit)) + return -1; + + return fixed_mtrr_seg_unit_range_index(seg, unit); +} + +static int fixed_mtrr_addr_to_seg(u64 addr) +{ + struct fixed_mtrr_segment *mtrr_seg; + int seg, seg_num = ARRAY_SIZE(fixed_seg_table); + + for (seg = 0; seg < seg_num; seg++) { + mtrr_seg = &fixed_seg_table[seg]; + if (mtrr_seg->start >= addr && addr < mtrr_seg->end) + return seg; + } + + return -1; +} + +static int fixed_mtrr_addr_seg_to_range_index(u64 addr, int seg) +{ + struct fixed_mtrr_segment *mtrr_seg; + int index; + + mtrr_seg = &fixed_seg_table[seg]; + index = mtrr_seg->range_start; + index += (addr - mtrr_seg->start) >> mtrr_seg->range_shift; + return index; +} + +static u64 fixed_mtrr_range_end_addr(int seg, int index) +{ + struct fixed_mtrr_segment *mtrr_seg = &fixed_seg_table[seg]; + int pos = index - mtrr_seg->range_start; + + return mtrr_seg->start + ((pos + 1) << mtrr_seg->range_shift); +} + +static void var_mtrr_range(struct kvm_mtrr_range *range, u64 *start, u64 *end) +{ + u64 mask; + + *start = range->base & PAGE_MASK; + + mask = range->mask & PAGE_MASK; + mask |= ~0ULL << boot_cpu_data.x86_phys_bits; + + /* This cannot overflow because writing to the reserved bits of + * variable MTRRs causes a #GP. + */ + *end = (*start | ~mask) + 1; +} + +static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; + gfn_t start, end; + int index; + + if (msr == MSR_IA32_CR_PAT || !tdp_enabled || + !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + return; + + if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) + return; + + /* fixed MTRRs. */ + if (fixed_msr_to_range(msr, &start, &end)) { + if (!fixed_mtrr_is_enabled(mtrr_state)) + return; + } else if (msr == MSR_MTRRdefType) { + start = 0x0; + end = ~0ULL; + } else { + /* variable range MTRRs. */ + index = (msr - 0x200) / 2; + var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end); + } + + kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); +} + +static bool var_mtrr_range_is_valid(struct kvm_mtrr_range *range) +{ + return (range->mask & (1 << 11)) != 0; +} + +static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; + struct kvm_mtrr_range *tmp, *cur; + int index, is_mtrr_mask; + + index = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * index; + cur = &mtrr_state->var_ranges[index]; + + /* remove the entry if it's in the list. */ + if (var_mtrr_range_is_valid(cur)) + list_del(&mtrr_state->var_ranges[index].node); + + if (!is_mtrr_mask) + cur->base = data; + else + cur->mask = data; + + /* add it to the list if it's enabled. */ + if (var_mtrr_range_is_valid(cur)) { + list_for_each_entry(tmp, &mtrr_state->head, node) + if (cur->base >= tmp->base) + break; + list_add_tail(&cur->node, &tmp->node); + } +} + +int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + int index; + + if (!kvm_mtrr_valid(vcpu, msr, data)) + return 1; + + index = fixed_msr_to_range_index(msr); + if (index >= 0) + *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data; + else if (msr == MSR_MTRRdefType) + vcpu->arch.mtrr_state.deftype = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat = data; + else + set_var_mtrr_msr(vcpu, msr, data); + + update_mtrr(vcpu, msr); + return 0; +} + +int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + int index; + + /* MSR_MTRRcap is a readonly MSR. */ + if (msr == MSR_MTRRcap) { + /* + * SMRR = 0 + * WC = 1 + * FIX = 1 + * VCNT = KVM_NR_VAR_MTRR + */ + *pdata = 0x500 | KVM_NR_VAR_MTRR; + return 0; + } + + if (!msr_mtrr_valid(msr)) + return 1; + + index = fixed_msr_to_range_index(msr); + if (index >= 0) + *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index]; + else if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.deftype; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int is_mtrr_mask; + + index = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * index; + if (!is_mtrr_mask) + *pdata = vcpu->arch.mtrr_state.var_ranges[index].base; + else + *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; + } + + return 0; +} + +void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu) +{ + INIT_LIST_HEAD(&vcpu->arch.mtrr_state.head); +} + +struct mtrr_iter { + /* input fields. */ + struct kvm_mtrr *mtrr_state; + u64 start; + u64 end; + + /* output fields. */ + int mem_type; + /* [start, end) is not fully covered in MTRRs? */ + bool partial_map; + + /* private fields. */ + union { + /* used for fixed MTRRs. */ + struct { + int index; + int seg; + }; + + /* used for var MTRRs. */ + struct { + struct kvm_mtrr_range *range; + /* max address has been covered in var MTRRs. */ + u64 start_max; + }; + }; + + bool fixed; +}; + +static bool mtrr_lookup_fixed_start(struct mtrr_iter *iter) +{ + int seg, index; + + if (!fixed_mtrr_is_enabled(iter->mtrr_state)) + return false; + + seg = fixed_mtrr_addr_to_seg(iter->start); + if (seg < 0) + return false; + + iter->fixed = true; + index = fixed_mtrr_addr_seg_to_range_index(iter->start, seg); + iter->index = index; + iter->seg = seg; + return true; +} + +static bool match_var_range(struct mtrr_iter *iter, + struct kvm_mtrr_range *range) +{ + u64 start, end; + + var_mtrr_range(range, &start, &end); + if (!(start >= iter->end || end <= iter->start)) { + iter->range = range; + + /* + * the function is called when we do kvm_mtrr.head walking. + * Range has the minimum base address which interleaves + * [looker->start_max, looker->end). + */ + iter->partial_map |= iter->start_max < start; + + /* update the max address has been covered. */ + iter->start_max = max(iter->start_max, end); + return true; + } + + return false; +} + +static void __mtrr_lookup_var_next(struct mtrr_iter *iter) +{ + struct kvm_mtrr *mtrr_state = iter->mtrr_state; + + list_for_each_entry_continue(iter->range, &mtrr_state->head, node) + if (match_var_range(iter, iter->range)) + return; + + iter->range = NULL; + iter->partial_map |= iter->start_max < iter->end; +} + +static void mtrr_lookup_var_start(struct mtrr_iter *iter) +{ + struct kvm_mtrr *mtrr_state = iter->mtrr_state; + + iter->fixed = false; + iter->start_max = iter->start; + iter->range = list_prepare_entry(iter->range, &mtrr_state->head, node); + + __mtrr_lookup_var_next(iter); +} + +static void mtrr_lookup_fixed_next(struct mtrr_iter *iter) +{ + /* terminate the lookup. */ + if (fixed_mtrr_range_end_addr(iter->seg, iter->index) >= iter->end) { + iter->fixed = false; + iter->range = NULL; + return; + } + + iter->index++; + + /* have looked up for all fixed MTRRs. */ + if (iter->index >= ARRAY_SIZE(iter->mtrr_state->fixed_ranges)) + return mtrr_lookup_var_start(iter); + + /* switch to next segment. */ + if (iter->index > fixed_mtrr_seg_end_range_index(iter->seg)) + iter->seg++; +} + +static void mtrr_lookup_var_next(struct mtrr_iter *iter) +{ + __mtrr_lookup_var_next(iter); +} + +static void mtrr_lookup_start(struct mtrr_iter *iter) +{ + if (!mtrr_is_enabled(iter->mtrr_state)) { + iter->partial_map = true; + return; + } + + if (!mtrr_lookup_fixed_start(iter)) + mtrr_lookup_var_start(iter); +} + +static void mtrr_lookup_init(struct mtrr_iter *iter, + struct kvm_mtrr *mtrr_state, u64 start, u64 end) +{ + iter->mtrr_state = mtrr_state; + iter->start = start; + iter->end = end; + iter->partial_map = false; + iter->fixed = false; + iter->range = NULL; + + mtrr_lookup_start(iter); +} + +static bool mtrr_lookup_okay(struct mtrr_iter *iter) +{ + if (iter->fixed) { + iter->mem_type = iter->mtrr_state->fixed_ranges[iter->index]; + return true; + } + + if (iter->range) { + iter->mem_type = iter->range->base & 0xff; + return true; + } + + return false; +} + +static void mtrr_lookup_next(struct mtrr_iter *iter) +{ + if (iter->fixed) + mtrr_lookup_fixed_next(iter); + else + mtrr_lookup_var_next(iter); +} + +#define mtrr_for_each_mem_type(_iter_, _mtrr_, _gpa_start_, _gpa_end_) \ + for (mtrr_lookup_init(_iter_, _mtrr_, _gpa_start_, _gpa_end_); \ + mtrr_lookup_okay(_iter_); mtrr_lookup_next(_iter_)) + +u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; + struct mtrr_iter iter; + u64 start, end; + int type = -1; + const int wt_wb_mask = (1 << MTRR_TYPE_WRBACK) + | (1 << MTRR_TYPE_WRTHROUGH); + + start = gfn_to_gpa(gfn); + end = start + PAGE_SIZE; + + mtrr_for_each_mem_type(&iter, mtrr_state, start, end) { + int curr_type = iter.mem_type; + + /* + * Please refer to Intel SDM Volume 3: 11.11.4.1 MTRR + * Precedences. + */ + + if (type == -1) { + type = curr_type; + continue; + } + + /* + * If two or more variable memory ranges match and the + * memory types are identical, then that memory type is + * used. + */ + if (type == curr_type) + continue; + + /* + * If two or more variable memory ranges match and one of + * the memory types is UC, the UC memory type used. + */ + if (curr_type == MTRR_TYPE_UNCACHABLE) + return MTRR_TYPE_UNCACHABLE; + + /* + * If two or more variable memory ranges match and the + * memory types are WT and WB, the WT memory type is used. + */ + if (((1 << type) & wt_wb_mask) && + ((1 << curr_type) & wt_wb_mask)) { + type = MTRR_TYPE_WRTHROUGH; + continue; + } + + /* + * For overlaps not defined by the above rules, processor + * behavior is undefined. + */ + + /* We use WB for this undefined behavior. :( */ + return MTRR_TYPE_WRBACK; + } + + /* It is not covered by MTRRs. */ + if (iter.partial_map) { + /* + * We just check one page, partially covered by MTRRs is + * impossible. + */ + WARN_ON(type != -1); + type = mtrr_default_type(mtrr_state); + } + return type; +} +EXPORT_SYMBOL_GPL(kvm_mtrr_get_guest_memory_type); + +bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, + int page_num) +{ + struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; + struct mtrr_iter iter; + u64 start, end; + int type = -1; + + start = gfn_to_gpa(gfn); + end = gfn_to_gpa(gfn + page_num); + mtrr_for_each_mem_type(&iter, mtrr_state, start, end) { + if (type == -1) { + type = iter.mem_type; + continue; + } + + if (type != iter.mem_type) + return false; + } + + if (!iter.partial_map) + return true; + + if (type == -1) + return true; + + return type == mtrr_default_type(mtrr_state); +} diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6e6d115fe9b5..0f67d7e24800 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -256,7 +256,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, if (ret) return ret; - mark_page_dirty(vcpu->kvm, table_gfn); + kvm_vcpu_mark_page_dirty(vcpu, table_gfn); walker->ptes[level] = pte; } return 0; @@ -338,7 +338,7 @@ retry_walk: real_gfn = gpa_to_gfn(real_gfn); - host_addr = gfn_to_hva_prot(vcpu->kvm, real_gfn, + host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn, &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; @@ -511,11 +511,11 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, base_gpa = pte_gpa & ~mask; index = (pte_gpa - base_gpa) / sizeof(pt_element_t); - r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, + r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa, gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); curr_pte = gw->prefetch_ptes[index]; } else - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, + r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &curr_pte, sizeof(curr_pte)); return r || curr_pte != gw->ptes[level - 1]; @@ -869,8 +869,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) if (!rmap_can_add(vcpu)) break; - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) break; FNAME(update_pte)(vcpu, sp, sptep, &gpte); @@ -956,8 +956,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) + if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, + sizeof(pt_element_t))) return -EINVAL; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { @@ -970,7 +970,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_access &= FNAME(gpte_access)(vcpu, gpte); FNAME(protect_clean_gpte)(&pte_access, gpte); - if (sync_mmio_spte(vcpu->kvm, &sp->spt[i], gfn, pte_access, + if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, &nr_present)) continue; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 29fbf9dfdc54..31aa2c85dc97 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -1,11 +1,12 @@ /* * Kernel-based Virtual Machine -- Performance Monitoring Unit support * - * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * Copyright 2015 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@redhat.com> * Gleb Natapov <gleb@redhat.com> + * Wei Huang <wei@redhat.com> * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -19,88 +20,39 @@ #include "x86.h" #include "cpuid.h" #include "lapic.h" +#include "pmu.h" + +/* NOTE: + * - Each perf counter is defined as "struct kvm_pmc"; + * - There are two types of perf counters: general purpose (gp) and fixed. + * gp counters are stored in gp_counters[] and fixed counters are stored + * in fixed_counters[] respectively. Both of them are part of "struct + * kvm_pmu"; + * - pmu.c understands the difference between gp counters and fixed counters. + * However AMD doesn't support fixed-counters; + * - There are three types of index to access perf counters (PMC): + * 1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD + * has MSR_K7_PERFCTRn. + * 2. MSR Index (named idx): This normally is used by RDPMC instruction. + * For instance AMD RDPMC instruction uses 0000_0003h in ECX to access + * C001_0007h (MSR_K7_PERCTR3). Intel has a similar mechanism, except + * that it also supports fixed counters. idx can be used to as index to + * gp and fixed counters. + * 3. Global PMC Index (named pmc): pmc is an index specific to PMU + * code. Each pmc, stored in kvm_pmc.idx field, is unique across + * all perf counters (both gp and fixed). The mapping relationship + * between pmc and perf counters is as the following: + * * Intel: [0 .. INTEL_PMC_MAX_GENERIC-1] <=> gp counters + * [INTEL_PMC_IDX_FIXED .. INTEL_PMC_IDX_FIXED + 2] <=> fixed + * * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters + */ -static struct kvm_arch_event_perf_mapping { - u8 eventsel; - u8 unit_mask; - unsigned event_type; - bool inexact; -} arch_events[] = { - /* Index must match CPUID 0x0A.EBX bit vector */ - [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, - [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, - [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, - [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, - [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, - [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, - [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, -}; - -/* mapping between fixed pmc index and arch_events array */ -static int fixed_pmc_events[] = {1, 0, 7}; - -static bool pmc_is_gp(struct kvm_pmc *pmc) -{ - return pmc->type == KVM_PMC_GP; -} - -static inline u64 pmc_bitmask(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - - return pmu->counter_bitmask[pmc->type]; -} - -static inline bool pmc_enabled(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - -static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, - u32 base) -{ - if (msr >= base && msr < base + pmu->nr_arch_gp_counters) - return &pmu->gp_counters[msr - base]; - return NULL; -} - -static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) -{ - int base = MSR_CORE_PERF_FIXED_CTR0; - if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) - return &pmu->fixed_counters[msr - base]; - return NULL; -} - -static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) -{ - return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx); -} - -static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) -{ - if (idx < INTEL_PMC_IDX_FIXED) - return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); - else - return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED); -} - -void kvm_deliver_pmi(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.apic) - kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); -} - -static void trigger_pmi(struct irq_work *irq_work) +static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { - struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, - irq_work); - struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu, - arch.pmu); + struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); + struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); - kvm_deliver_pmi(vcpu); + kvm_pmu_deliver_pmi(vcpu); } static void kvm_perf_overflow(struct perf_event *perf_event, @@ -108,63 +60,46 @@ static void kvm_perf_overflow(struct perf_event *perf_event, struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (!test_and_set_bit(pmc->idx, + (unsigned long *)&pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } } static void kvm_perf_overflow_intr(struct perf_event *perf_event, - struct perf_sample_data *data, struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { struct kvm_pmc *pmc = perf_event->overflow_handler_context; - struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; - if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (!test_and_set_bit(pmc->idx, + (unsigned long *)&pmu->reprogram_pmi)) { __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); kvm_make_request(KVM_REQ_PMU, pmc->vcpu); + /* * Inject PMI. If vcpu was in a guest mode during NMI PMI * can be ejected on a guest mode re-entry. Otherwise we can't * be sure that vcpu wasn't executing hlt instruction at the - * time of vmexit and is not going to re-enter guest mode until, + * time of vmexit and is not going to re-enter guest mode until * woken up. So we should wake it, but this is impossible from * NMI context. Do it from irq work instead. */ if (!kvm_is_in_guest()) - irq_work_queue(&pmc->vcpu->arch.pmu.irq_work); + irq_work_queue(&pmc_to_pmu(pmc)->irq_work); else kvm_make_request(KVM_REQ_PMI, pmc->vcpu); } } -static u64 read_pmc(struct kvm_pmc *pmc) -{ - u64 counter, enabled, running; - - counter = pmc->counter; - - if (pmc->perf_event) - counter += perf_event_read_value(pmc->perf_event, - &enabled, &running); - - /* FIXME: Scaling needed? */ - - return counter & pmc_bitmask(pmc); -} - -static void stop_counter(struct kvm_pmc *pmc) -{ - if (pmc->perf_event) { - pmc->counter = read_pmc(pmc); - perf_event_release_kernel(pmc->perf_event); - pmc->perf_event = NULL; - } -} - -static void reprogram_counter(struct kvm_pmc *pmc, u32 type, - unsigned config, bool exclude_user, bool exclude_kernel, - bool intr, bool in_tx, bool in_tx_cp) +static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, + unsigned config, bool exclude_user, + bool exclude_kernel, bool intr, + bool in_tx, bool in_tx_cp) { struct perf_event *event; struct perf_event_attr attr = { @@ -177,6 +112,7 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type, .exclude_kernel = exclude_kernel, .config = config, }; + if (in_tx) attr.config |= HSW_IN_TX; if (in_tx_cp) @@ -188,33 +124,16 @@ static void reprogram_counter(struct kvm_pmc *pmc, u32 type, intr ? kvm_perf_overflow_intr : kvm_perf_overflow, pmc); if (IS_ERR(event)) { - printk_once("kvm: pmu event creation failed %ld\n", - PTR_ERR(event)); + printk_once("kvm_pmu: event creation failed %ld\n", + PTR_ERR(event)); return; } pmc->perf_event = event; - clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi); -} - -static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select, - u8 unit_mask) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(arch_events); i++) - if (arch_events[i].eventsel == event_select - && arch_events[i].unit_mask == unit_mask - && (pmu->available_event_types & (1 << i))) - break; - - if (i == ARRAY_SIZE(arch_events)) - return PERF_COUNT_HW_MAX; - - return arch_events[i].event_type; + clear_bit(pmc->idx, (unsigned long*)&pmc_to_pmu(pmc)->reprogram_pmi); } -static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { unsigned config, type = PERF_TYPE_RAW; u8 event_select, unit_mask; @@ -224,21 +143,22 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) pmc->eventsel = eventsel; - stop_counter(pmc); + pmc_stop_counter(pmc); - if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc)) + if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_is_enabled(pmc)) return; event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | - ARCH_PERFMON_EVENTSEL_INV | - ARCH_PERFMON_EVENTSEL_CMASK | - HSW_IN_TX | - HSW_IN_TX_CHECKPOINTED))) { - config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, - unit_mask); + ARCH_PERFMON_EVENTSEL_INV | + ARCH_PERFMON_EVENTSEL_CMASK | + HSW_IN_TX | + HSW_IN_TX_CHECKPOINTED))) { + config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc), + event_select, + unit_mask); if (config != PERF_COUNT_HW_MAX) type = PERF_TYPE_HARDWARE; } @@ -246,56 +166,36 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) if (type == PERF_TYPE_RAW) config = eventsel & X86_RAW_EVENT_MASK; - reprogram_counter(pmc, type, config, - !(eventsel & ARCH_PERFMON_EVENTSEL_USR), - !(eventsel & ARCH_PERFMON_EVENTSEL_OS), - eventsel & ARCH_PERFMON_EVENTSEL_INT, - (eventsel & HSW_IN_TX), - (eventsel & HSW_IN_TX_CHECKPOINTED)); + pmc_reprogram_counter(pmc, type, config, + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT, + (eventsel & HSW_IN_TX), + (eventsel & HSW_IN_TX_CHECKPOINTED)); } +EXPORT_SYMBOL_GPL(reprogram_gp_counter); -static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) +void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) { - unsigned en = en_pmi & 0x3; - bool pmi = en_pmi & 0x8; + unsigned en_field = ctrl & 0x3; + bool pmi = ctrl & 0x8; - stop_counter(pmc); + pmc_stop_counter(pmc); - if (!en || !pmc_enabled(pmc)) + if (!en_field || !pmc_is_enabled(pmc)) return; - reprogram_counter(pmc, PERF_TYPE_HARDWARE, - arch_events[fixed_pmc_events[idx]].event_type, - !(en & 0x2), /* exclude user */ - !(en & 0x1), /* exclude kernel */ - pmi, false, false); + pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, + kvm_x86_ops->pmu_ops->find_fixed_event(idx), + !(en_field & 0x2), /* exclude user */ + !(en_field & 0x1), /* exclude kernel */ + pmi, false, false); } +EXPORT_SYMBOL_GPL(reprogram_fixed_counter); -static inline u8 fixed_en_pmi(u64 ctrl, int idx) +void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) { - return (ctrl >> (idx * 4)) & 0xf; -} - -static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) -{ - int i; - - for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { - u8 en_pmi = fixed_en_pmi(data, i); - struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i); - - if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi) - continue; - - reprogram_fixed_counter(pmc, en_pmi, i); - } - - pmu->fixed_ctr_ctrl = data; -} - -static void reprogram_idx(struct kvm_pmu *pmu, int idx) -{ - struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx); + struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx); if (!pmc) return; @@ -303,274 +203,107 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx) if (pmc_is_gp(pmc)) reprogram_gp_counter(pmc, pmc->eventsel); else { - int fidx = idx - INTEL_PMC_IDX_FIXED; - reprogram_fixed_counter(pmc, - fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); + int idx = pmc_idx - INTEL_PMC_IDX_FIXED; + u8 ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, idx); + + reprogram_fixed_counter(pmc, ctrl, idx); } } +EXPORT_SYMBOL_GPL(reprogram_counter); -static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) +void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) { + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u64 bitmask; int bit; - u64 diff = pmu->global_ctrl ^ data; - - pmu->global_ctrl = data; - - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - reprogram_idx(pmu, bit); -} -bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - int ret; - - switch (msr) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - case MSR_CORE_PERF_GLOBAL_STATUS: - case MSR_CORE_PERF_GLOBAL_CTRL: - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - ret = pmu->version > 1; - break; - default: - ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) - || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) - || get_fixed_pmc(pmu, msr); - break; - } - return ret; -} + bitmask = pmu->reprogram_pmi; -int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; + for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { + struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit); - switch (index) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - *data = pmu->fixed_ctr_ctrl; - return 0; - case MSR_CORE_PERF_GLOBAL_STATUS: - *data = pmu->global_status; - return 0; - case MSR_CORE_PERF_GLOBAL_CTRL: - *data = pmu->global_ctrl; - return 0; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - *data = pmu->global_ovf_ctrl; - return 0; - default: - if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, index))) { - *data = read_pmc(pmc); - return 0; - } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { - *data = pmc->eventsel; - return 0; + if (unlikely(!pmc || !pmc->perf_event)) { + clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); + continue; } - } - return 1; -} -int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_pmc *pmc; - u32 index = msr_info->index; - u64 data = msr_info->data; - - switch (index) { - case MSR_CORE_PERF_FIXED_CTR_CTRL: - if (pmu->fixed_ctr_ctrl == data) - return 0; - if (!(data & 0xfffffffffffff444ull)) { - reprogram_fixed_counters(pmu, data); - return 0; - } - break; - case MSR_CORE_PERF_GLOBAL_STATUS: - if (msr_info->host_initiated) { - pmu->global_status = data; - return 0; - } - break; /* RO MSR */ - case MSR_CORE_PERF_GLOBAL_CTRL: - if (pmu->global_ctrl == data) - return 0; - if (!(data & pmu->global_ctrl_mask)) { - global_ctrl_changed(pmu, data); - return 0; - } - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { - if (!msr_info->host_initiated) - pmu->global_status &= ~data; - pmu->global_ovf_ctrl = data; - return 0; - } - break; - default: - if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || - (pmc = get_fixed_pmc(pmu, index))) { - if (!msr_info->host_initiated) - data = (s64)(s32)data; - pmc->counter += data - read_pmc(pmc); - return 0; - } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { - if (data == pmc->eventsel) - return 0; - if (!(data & pmu->reserved_bits)) { - reprogram_gp_counter(pmc, data); - return 0; - } - } + reprogram_counter(pmu, bit); } - return 1; } -int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc) +/* check if idx is a valid index to access PMU */ +int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; - bool fixed = pmc & (1u << 30); - pmc &= ~(3u << 30); - return (!fixed && pmc >= pmu->nr_arch_gp_counters) || - (fixed && pmc >= pmu->nr_arch_fixed_counters); + return kvm_x86_ops->pmu_ops->is_valid_msr_idx(vcpu, idx); } -int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) +int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; - bool fast_mode = pmc & (1u << 31); - bool fixed = pmc & (1u << 30); - struct kvm_pmc *counters; - u64 ctr; - - pmc &= ~(3u << 30); - if (!fixed && pmc >= pmu->nr_arch_gp_counters) - return 1; - if (fixed && pmc >= pmu->nr_arch_fixed_counters) + bool fast_mode = idx & (1u << 31); + struct kvm_pmc *pmc; + u64 ctr_val; + + pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, idx); + if (!pmc) return 1; - counters = fixed ? pmu->fixed_counters : pmu->gp_counters; - ctr = read_pmc(&counters[pmc]); + + ctr_val = pmc_read_counter(pmc); if (fast_mode) - ctr = (u32)ctr; - *data = ctr; + ctr_val = (u32)ctr_val; + *data = ctr_val; return 0; } -void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) +void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; - struct kvm_cpuid_entry2 *entry; - union cpuid10_eax eax; - union cpuid10_edx edx; - - pmu->nr_arch_gp_counters = 0; - pmu->nr_arch_fixed_counters = 0; - pmu->counter_bitmask[KVM_PMC_GP] = 0; - pmu->counter_bitmask[KVM_PMC_FIXED] = 0; - pmu->version = 0; - pmu->reserved_bits = 0xffffffff00200000ull; - - entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); - if (!entry) - return; - eax.full = entry->eax; - edx.full = entry->edx; - - pmu->version = eax.split.version_id; - if (!pmu->version) - return; - - pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, - INTEL_PMC_MAX_GENERIC); - pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; - pmu->available_event_types = ~entry->ebx & - ((1ull << eax.split.mask_length) - 1); - - if (pmu->version == 1) { - pmu->nr_arch_fixed_counters = 0; - } else { - pmu->nr_arch_fixed_counters = - min_t(int, edx.split.num_counters_fixed, - INTEL_PMC_MAX_FIXED); - pmu->counter_bitmask[KVM_PMC_FIXED] = - ((u64)1 << edx.split.bit_width_fixed) - 1; - } + if (vcpu->arch.apic) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); +} - pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); - pmu->global_ctrl_mask = ~pmu->global_ctrl; +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + return kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr); +} - entry = kvm_find_cpuid_entry(vcpu, 7, 0); - if (entry && - (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && - (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) - pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data); } -void kvm_pmu_init(struct kvm_vcpu *vcpu) +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - int i; - struct kvm_pmu *pmu = &vcpu->arch.pmu; + return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info); +} - memset(pmu, 0, sizeof(*pmu)); - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { - pmu->gp_counters[i].type = KVM_PMC_GP; - pmu->gp_counters[i].vcpu = vcpu; - pmu->gp_counters[i].idx = i; - } - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { - pmu->fixed_counters[i].type = KVM_PMC_FIXED; - pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; - } - init_irq_work(&pmu->irq_work, trigger_pmi); - kvm_pmu_cpuid_update(vcpu); +/* refresh PMU settings. This function generally is called when underlying + * settings are changed (such as changes of PMU CPUID by guest VMs), which + * should rarely happen. + */ +void kvm_pmu_refresh(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->pmu_ops->refresh(vcpu); } void kvm_pmu_reset(struct kvm_vcpu *vcpu) { - struct kvm_pmu *pmu = &vcpu->arch.pmu; - int i; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); irq_work_sync(&pmu->irq_work); - for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { - struct kvm_pmc *pmc = &pmu->gp_counters[i]; - stop_counter(pmc); - pmc->counter = pmc->eventsel = 0; - } + kvm_x86_ops->pmu_ops->reset(vcpu); +} - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) - stop_counter(&pmu->fixed_counters[i]); +void kvm_pmu_init(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = - pmu->global_ovf_ctrl = 0; + memset(pmu, 0, sizeof(*pmu)); + kvm_x86_ops->pmu_ops->init(vcpu); + init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); + kvm_pmu_refresh(vcpu); } void kvm_pmu_destroy(struct kvm_vcpu *vcpu) { kvm_pmu_reset(vcpu); } - -void kvm_handle_pmu_event(struct kvm_vcpu *vcpu) -{ - struct kvm_pmu *pmu = &vcpu->arch.pmu; - u64 bitmask; - int bit; - - bitmask = pmu->reprogram_pmi; - - for_each_set_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit); - - if (unlikely(!pmc || !pmc->perf_event)) { - clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); - continue; - } - - reprogram_idx(pmu, bit); - } -} diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h new file mode 100644 index 000000000000..f96e1f962587 --- /dev/null +++ b/arch/x86/kvm/pmu.h @@ -0,0 +1,118 @@ +#ifndef __KVM_X86_PMU_H +#define __KVM_X86_PMU_H + +#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu) +#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu)) +#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu) + +/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */ +#define fixed_ctrl_field(ctrl_reg, idx) (((ctrl_reg) >> ((idx)*4)) & 0xf) + +struct kvm_event_hw_type_mapping { + u8 eventsel; + u8 unit_mask; + unsigned event_type; +}; + +struct kvm_pmu_ops { + unsigned (*find_arch_event)(struct kvm_pmu *pmu, u8 event_select, + u8 unit_mask); + unsigned (*find_fixed_event)(int idx); + bool (*pmc_is_enabled)(struct kvm_pmc *pmc); + struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); + struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, unsigned idx); + int (*is_valid_msr_idx)(struct kvm_vcpu *vcpu, unsigned idx); + bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); + int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr, u64 *data); + int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); + void (*refresh)(struct kvm_vcpu *vcpu); + void (*init)(struct kvm_vcpu *vcpu); + void (*reset)(struct kvm_vcpu *vcpu); +}; + +static inline u64 pmc_bitmask(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + return pmu->counter_bitmask[pmc->type]; +} + +static inline u64 pmc_read_counter(struct kvm_pmc *pmc) +{ + u64 counter, enabled, running; + + counter = pmc->counter; + if (pmc->perf_event) + counter += perf_event_read_value(pmc->perf_event, + &enabled, &running); + /* FIXME: Scaling needed? */ + return counter & pmc_bitmask(pmc); +} + +static inline void pmc_stop_counter(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + pmc->counter = pmc_read_counter(pmc); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + } +} + +static inline bool pmc_is_gp(struct kvm_pmc *pmc) +{ + return pmc->type == KVM_PMC_GP; +} + +static inline bool pmc_is_fixed(struct kvm_pmc *pmc) +{ + return pmc->type == KVM_PMC_FIXED; +} + +static inline bool pmc_is_enabled(struct kvm_pmc *pmc) +{ + return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc); +} + +/* returns general purpose PMC with the specified MSR. Note that it can be + * used for both PERFCTRn and EVNTSELn; that is why it accepts base as a + * paramenter to tell them apart. + */ +static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, + u32 base) +{ + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) + return &pmu->gp_counters[msr - base]; + + return NULL; +} + +/* returns fixed PMC with the specified MSR */ +static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) +{ + int base = MSR_CORE_PERF_FIXED_CTR0; + + if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) + return &pmu->fixed_counters[msr - base]; + + return NULL; +} + +void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); +void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); +void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); + +void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); +void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); +int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); +int kvm_pmu_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx); +bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +void kvm_pmu_refresh(struct kvm_vcpu *vcpu); +void kvm_pmu_reset(struct kvm_vcpu *vcpu); +void kvm_pmu_init(struct kvm_vcpu *vcpu); +void kvm_pmu_destroy(struct kvm_vcpu *vcpu); + +extern struct kvm_pmu_ops intel_pmu_ops; +extern struct kvm_pmu_ops amd_pmu_ops; +#endif /* __KVM_X86_PMU_H */ diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c new file mode 100644 index 000000000000..886aa25a7131 --- /dev/null +++ b/arch/x86/kvm/pmu_amd.c @@ -0,0 +1,207 @@ +/* + * KVM PMU support for AMD + * + * Copyright 2015, Red Hat, Inc. and/or its affiliates. + * + * Author: + * Wei Huang <wei@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Implementation is based on pmu_intel.c file + */ +#include <linux/types.h> +#include <linux/kvm_host.h> +#include <linux/perf_event.h> +#include "x86.h" +#include "cpuid.h" +#include "lapic.h" +#include "pmu.h" + +/* duplicated from amd_perfmon_event_map, K7 and above should work. */ +static struct kvm_event_hw_type_mapping amd_event_mapping[] = { + [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES }, + [3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES }, + [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + [7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, +}; + +static unsigned amd_find_arch_event(struct kvm_pmu *pmu, + u8 event_select, + u8 unit_mask) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++) + if (amd_event_mapping[i].eventsel == event_select + && amd_event_mapping[i].unit_mask == unit_mask) + break; + + if (i == ARRAY_SIZE(amd_event_mapping)) + return PERF_COUNT_HW_MAX; + + return amd_event_mapping[i].event_type; +} + +/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */ +static unsigned amd_find_fixed_event(int idx) +{ + return PERF_COUNT_HW_MAX; +} + +/* check if a PMC is enabled by comparing it against global_ctrl bits. Because + * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE). + */ +static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) +{ + return true; +} + +static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + return get_gp_pmc(pmu, MSR_K7_EVNTSEL0 + pmc_idx, MSR_K7_EVNTSEL0); +} + +/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ +static int amd_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + idx &= ~(3u << 30); + + return (idx >= pmu->nr_arch_gp_counters); +} + +/* idx is the ECX register of RDPMC instruction */ +static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *counters; + + idx &= ~(3u << 30); + if (idx >= pmu->nr_arch_gp_counters) + return NULL; + counters = pmu->gp_counters; + + return &counters[idx]; +} + +static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int ret = false; + + ret = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0) || + get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); + + return ret; +} + +static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + /* MSR_K7_PERFCTRn */ + pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0); + if (pmc) { + *data = pmc_read_counter(pmc); + return 0; + } + /* MSR_K7_EVNTSELn */ + pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); + if (pmc) { + *data = pmc->eventsel; + return 0; + } + + return 1; +} + +static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 msr = msr_info->index; + u64 data = msr_info->data; + + /* MSR_K7_PERFCTRn */ + pmc = get_gp_pmc(pmu, msr, MSR_K7_PERFCTR0); + if (pmc) { + if (!msr_info->host_initiated) + data = (s64)data; + pmc->counter += data - pmc_read_counter(pmc); + return 0; + } + /* MSR_K7_EVNTSELn */ + pmc = get_gp_pmc(pmu, msr, MSR_K7_EVNTSEL0); + if (pmc) { + if (data == pmc->eventsel) + return 0; + if (!(data & pmu->reserved_bits)) { + reprogram_gp_counter(pmc, data); + return 0; + } + } + + return 1; +} + +static void amd_pmu_refresh(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; + pmu->reserved_bits = 0xffffffff00200000ull; + /* not applicable to AMD; but clean them to prevent any fall out */ + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->version = 0; + pmu->global_status = 0; +} + +static void amd_pmu_init(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + for (i = 0; i < AMD64_NUM_COUNTERS ; i++) { + pmu->gp_counters[i].type = KVM_PMC_GP; + pmu->gp_counters[i].vcpu = vcpu; + pmu->gp_counters[i].idx = i; + } +} + +static void amd_pmu_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + for (i = 0; i < AMD64_NUM_COUNTERS; i++) { + struct kvm_pmc *pmc = &pmu->gp_counters[i]; + + pmc_stop_counter(pmc); + pmc->counter = pmc->eventsel = 0; + } +} + +struct kvm_pmu_ops amd_pmu_ops = { + .find_arch_event = amd_find_arch_event, + .find_fixed_event = amd_find_fixed_event, + .pmc_is_enabled = amd_pmc_is_enabled, + .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, + .msr_idx_to_pmc = amd_msr_idx_to_pmc, + .is_valid_msr_idx = amd_is_valid_msr_idx, + .is_valid_msr = amd_is_valid_msr, + .get_msr = amd_pmu_get_msr, + .set_msr = amd_pmu_set_msr, + .refresh = amd_pmu_refresh, + .init = amd_pmu_init, + .reset = amd_pmu_reset, +}; diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c new file mode 100644 index 000000000000..ab38af4f4947 --- /dev/null +++ b/arch/x86/kvm/pmu_intel.c @@ -0,0 +1,358 @@ +/* + * KVM PMU support for Intel CPUs + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity <avi@redhat.com> + * Gleb Natapov <gleb@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ +#include <linux/types.h> +#include <linux/kvm_host.h> +#include <linux/perf_event.h> +#include <asm/perf_event.h> +#include "x86.h" +#include "cpuid.h" +#include "lapic.h" +#include "pmu.h" + +static struct kvm_event_hw_type_mapping intel_arch_events[] = { + /* Index must match CPUID 0x0A.EBX bit vector */ + [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, + [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, + [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, + [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, +}; + +/* mapping between fixed pmc index and intel_arch_events array */ +static int fixed_pmc_events[] = {1, 0, 7}; + +static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) +{ + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + u8 new_ctrl = fixed_ctrl_field(data, i); + u8 old_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl, i); + struct kvm_pmc *pmc; + + pmc = get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + i); + + if (old_ctrl == new_ctrl) + continue; + + reprogram_fixed_counter(pmc, new_ctrl, i); + } + + pmu->fixed_ctr_ctrl = data; +} + +/* function is called when global control register has been updated. */ +static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) +{ + int bit; + u64 diff = pmu->global_ctrl ^ data; + + pmu->global_ctrl = data; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + reprogram_counter(pmu, bit); +} + +static unsigned intel_find_arch_event(struct kvm_pmu *pmu, + u8 event_select, + u8 unit_mask) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) + if (intel_arch_events[i].eventsel == event_select + && intel_arch_events[i].unit_mask == unit_mask + && (pmu->available_event_types & (1 << i))) + break; + + if (i == ARRAY_SIZE(intel_arch_events)) + return PERF_COUNT_HW_MAX; + + return intel_arch_events[i].event_type; +} + +static unsigned intel_find_fixed_event(int idx) +{ + if (idx >= ARRAY_SIZE(fixed_pmc_events)) + return PERF_COUNT_HW_MAX; + + return intel_arch_events[fixed_pmc_events[idx]].event_type; +} + +/* check if a PMC is enabled by comparising it with globl_ctrl bits. */ +static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} + +static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) +{ + if (pmc_idx < INTEL_PMC_IDX_FIXED) + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + pmc_idx, + MSR_P6_EVNTSEL0); + else { + u32 idx = pmc_idx - INTEL_PMC_IDX_FIXED; + + return get_fixed_pmc(pmu, idx + MSR_CORE_PERF_FIXED_CTR0); + } +} + +/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ +static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fixed = idx & (1u << 30); + + idx &= ~(3u << 30); + + return (!fixed && idx >= pmu->nr_arch_gp_counters) || + (fixed && idx >= pmu->nr_arch_fixed_counters); +} + +static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, + unsigned idx) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + bool fixed = idx & (1u << 30); + struct kvm_pmc *counters; + + idx &= ~(3u << 30); + if (!fixed && idx >= pmu->nr_arch_gp_counters) + return NULL; + if (fixed && idx >= pmu->nr_arch_fixed_counters) + return NULL; + counters = fixed ? pmu->fixed_counters : pmu->gp_counters; + + return &counters[idx]; +} + +static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int ret; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + ret = pmu->version > 1; + break; + default: + ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || + get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || + get_fixed_pmc(pmu, msr); + break; + } + + return ret; +} + +static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + *data = pmu->fixed_ctr_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_STATUS: + *data = pmu->global_status; + return 0; + case MSR_CORE_PERF_GLOBAL_CTRL: + *data = pmu->global_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *data = pmu->global_ovf_ctrl; + return 0; + default: + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, msr))) { + *data = pmc_read_counter(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { + *data = pmc->eventsel; + return 0; + } + } + + return 1; +} + +static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc; + u32 msr = msr_info->index; + u64 data = msr_info->data; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + if (pmu->fixed_ctr_ctrl == data) + return 0; + if (!(data & 0xfffffffffffff444ull)) { + reprogram_fixed_counters(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + if (msr_info->host_initiated) { + pmu->global_status = data; + return 0; + } + break; /* RO MSR */ + case MSR_CORE_PERF_GLOBAL_CTRL: + if (pmu->global_ctrl == data) + return 0; + if (!(data & pmu->global_ctrl_mask)) { + global_ctrl_changed(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { + if (!msr_info->host_initiated) + pmu->global_status &= ~data; + pmu->global_ovf_ctrl = data; + return 0; + } + break; + default: + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, msr))) { + if (!msr_info->host_initiated) + data = (s64)(s32)data; + pmc->counter += data - pmc_read_counter(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { + if (data == pmc->eventsel) + return 0; + if (!(data & pmu->reserved_bits)) { + reprogram_gp_counter(pmc, data); + return 0; + } + } + } + + return 1; +} + +static void intel_pmu_refresh(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_cpuid_entry2 *entry; + union cpuid10_eax eax; + union cpuid10_edx edx; + + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->counter_bitmask[KVM_PMC_GP] = 0; + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->version = 0; + pmu->reserved_bits = 0xffffffff00200000ull; + + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + if (!entry) + return; + eax.full = entry->eax; + edx.full = entry->edx; + + pmu->version = eax.split.version_id; + if (!pmu->version) + return; + + pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, + INTEL_PMC_MAX_GENERIC); + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1; + pmu->available_event_types = ~entry->ebx & + ((1ull << eax.split.mask_length) - 1); + + if (pmu->version == 1) { + pmu->nr_arch_fixed_counters = 0; + } else { + pmu->nr_arch_fixed_counters = + min_t(int, edx.split.num_counters_fixed, + INTEL_PMC_MAX_FIXED); + pmu->counter_bitmask[KVM_PMC_FIXED] = + ((u64)1 << edx.split.bit_width_fixed) - 1; + } + + pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); + pmu->global_ctrl_mask = ~pmu->global_ctrl; + + entry = kvm_find_cpuid_entry(vcpu, 7, 0); + if (entry && + (boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) && + (entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM))) + pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED; +} + +static void intel_pmu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + pmu->gp_counters[i].type = KVM_PMC_GP; + pmu->gp_counters[i].vcpu = vcpu; + pmu->gp_counters[i].idx = i; + } + + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { + pmu->fixed_counters[i].type = KVM_PMC_FIXED; + pmu->fixed_counters[i].vcpu = vcpu; + pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + } +} + +static void intel_pmu_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + int i; + + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + struct kvm_pmc *pmc = &pmu->gp_counters[i]; + + pmc_stop_counter(pmc); + pmc->counter = pmc->eventsel = 0; + } + + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) + pmc_stop_counter(&pmu->fixed_counters[i]); + + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = + pmu->global_ovf_ctrl = 0; +} + +struct kvm_pmu_ops intel_pmu_ops = { + .find_arch_event = intel_find_arch_event, + .find_fixed_event = intel_find_fixed_event, + .pmc_is_enabled = intel_pmc_is_enabled, + .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, + .msr_idx_to_pmc = intel_msr_idx_to_pmc, + .is_valid_msr_idx = intel_is_valid_msr_idx, + .is_valid_msr = intel_is_valid_msr, + .get_msr = intel_pmu_get_msr, + .set_msr = intel_pmu_set_msr, + .refresh = intel_pmu_refresh, + .init = intel_pmu_init, + .reset = intel_pmu_reset, +}; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9afa233b5482..602b974a60a6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -21,6 +21,7 @@ #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" +#include "pmu.h" #include <linux/module.h> #include <linux/mod_devicetable.h> @@ -28,7 +29,7 @@ #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/sched.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/slab.h> #include <asm/perf_event.h> @@ -511,8 +512,10 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm->vmcb->control.next_rip != 0) + if (svm->vmcb->control.next_rip != 0) { + WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS)); svm->next_rip = svm->vmcb->control.next_rip; + } if (!svm->next_rip) { if (emulate_instruction(vcpu, EMULTYPE_SKIP) != @@ -1082,7 +1085,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) return target_tsc - tsc; } -static void init_vmcb(struct vcpu_svm *svm) +static void init_vmcb(struct vcpu_svm *svm, bool init_event) { struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; @@ -1153,17 +1156,17 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_efer(&svm->vcpu, 0); + if (!init_event) + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; /* - * This is the guest-visible cr0 value. * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. + * It also updates the guest-visible cr0 value. */ - svm->vcpu.arch.cr0 = 0; (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); save->cr4 = X86_CR4_PAE; @@ -1176,7 +1179,7 @@ static void init_vmcb(struct vcpu_svm *svm) clr_exception_intercept(svm, PF_VECTOR); clr_cr_intercept(svm, INTERCEPT_CR3_READ); clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); - save->g_pat = 0x0007040600070406ULL; + save->g_pat = svm->vcpu.arch.pat; save->cr3 = 0; save->cr4 = 0; } @@ -1195,13 +1198,19 @@ static void init_vmcb(struct vcpu_svm *svm) enable_gif(svm); } -static void svm_vcpu_reset(struct kvm_vcpu *vcpu) +static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); u32 dummy; u32 eax = 1; - init_vmcb(svm); + if (!init_event) { + svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) + svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + } + init_vmcb(svm, init_event); kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); @@ -1257,12 +1266,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; - init_vmcb(svm); - - svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(&svm->vcpu)) - svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + init_vmcb(svm, false); svm_init_osvw(&svm->vcpu); @@ -1575,7 +1579,8 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) * does not do it - this results in some delay at * reboot */ - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); + if (!(vcpu->kvm->arch.disabled_quirks & KVM_QUIRK_CD_NW_CLEARED)) + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); @@ -1883,7 +1888,7 @@ static int shutdown_interception(struct vcpu_svm *svm) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm); + init_vmcb(svm, false); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; @@ -1953,8 +1958,8 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) u64 pdpte; int ret; - ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte, - offset_in_page(cr3) + index * 8, 8); + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, + offset_in_page(cr3) + index * 8, 8); if (ret) return 0; return pdpte; @@ -2112,7 +2117,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page) might_sleep(); - page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); + page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT); if (is_error_page(page)) goto error; @@ -2151,7 +2156,7 @@ static int nested_svm_intercept_ioio(struct vcpu_svm *svm) mask = (0xf >> (4 - size)) << start_bit; val = 0; - if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len)) + if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len)) return NESTED_EXIT_DONE; return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; @@ -2176,7 +2181,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) /* Offset is in 32 bit units but need in 8 bit units */ offset *= 4; - if (kvm_read_guest(svm->vcpu.kvm, svm->nested.vmcb_msrpm + offset, &value, 4)) + if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4)) return NESTED_EXIT_DONE; return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; @@ -2447,7 +2452,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) p = msrpm_offsets[i]; offset = svm->nested.vmcb_msrpm + (p * 4); - if (kvm_read_guest(svm->vcpu.kvm, offset, &value, 4)) + if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4)) return false; svm->nested.msrpm[p] = svm->msrpm[p] | value; @@ -3067,42 +3072,42 @@ static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) svm_scale_tsc(vcpu, host_tsc); } -static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) +static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_svm *svm = to_svm(vcpu); - switch (ecx) { + switch (msr_info->index) { case MSR_IA32_TSC: { - *data = svm->vmcb->control.tsc_offset + + msr_info->data = svm->vmcb->control.tsc_offset + svm_scale_tsc(vcpu, native_read_tsc()); break; } case MSR_STAR: - *data = svm->vmcb->save.star; + msr_info->data = svm->vmcb->save.star; break; #ifdef CONFIG_X86_64 case MSR_LSTAR: - *data = svm->vmcb->save.lstar; + msr_info->data = svm->vmcb->save.lstar; break; case MSR_CSTAR: - *data = svm->vmcb->save.cstar; + msr_info->data = svm->vmcb->save.cstar; break; case MSR_KERNEL_GS_BASE: - *data = svm->vmcb->save.kernel_gs_base; + msr_info->data = svm->vmcb->save.kernel_gs_base; break; case MSR_SYSCALL_MASK: - *data = svm->vmcb->save.sfmask; + msr_info->data = svm->vmcb->save.sfmask; break; #endif case MSR_IA32_SYSENTER_CS: - *data = svm->vmcb->save.sysenter_cs; + msr_info->data = svm->vmcb->save.sysenter_cs; break; case MSR_IA32_SYSENTER_EIP: - *data = svm->sysenter_eip; + msr_info->data = svm->sysenter_eip; break; case MSR_IA32_SYSENTER_ESP: - *data = svm->sysenter_esp; + msr_info->data = svm->sysenter_esp; break; /* * Nobody will change the following 5 values in the VMCB so we can @@ -3110,31 +3115,31 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) * implemented. */ case MSR_IA32_DEBUGCTLMSR: - *data = svm->vmcb->save.dbgctl; + msr_info->data = svm->vmcb->save.dbgctl; break; case MSR_IA32_LASTBRANCHFROMIP: - *data = svm->vmcb->save.br_from; + msr_info->data = svm->vmcb->save.br_from; break; case MSR_IA32_LASTBRANCHTOIP: - *data = svm->vmcb->save.br_to; + msr_info->data = svm->vmcb->save.br_to; break; case MSR_IA32_LASTINTFROMIP: - *data = svm->vmcb->save.last_excp_from; + msr_info->data = svm->vmcb->save.last_excp_from; break; case MSR_IA32_LASTINTTOIP: - *data = svm->vmcb->save.last_excp_to; + msr_info->data = svm->vmcb->save.last_excp_to; break; case MSR_VM_HSAVE_PA: - *data = svm->nested.hsave_msr; + msr_info->data = svm->nested.hsave_msr; break; case MSR_VM_CR: - *data = svm->nested.vm_cr_msr; + msr_info->data = svm->nested.vm_cr_msr; break; case MSR_IA32_UCODE_REV: - *data = 0x01000065; + msr_info->data = 0x01000065; break; default: - return kvm_get_msr_common(vcpu, ecx, data); + return kvm_get_msr_common(vcpu, msr_info); } return 0; } @@ -3142,16 +3147,20 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) static int rdmsr_interception(struct vcpu_svm *svm) { u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); - u64 data; + struct msr_data msr_info; - if (svm_get_msr(&svm->vcpu, ecx, &data)) { + msr_info.index = ecx; + msr_info.host_initiated = false; + if (svm_get_msr(&svm->vcpu, &msr_info)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(&svm->vcpu, 0); } else { - trace_kvm_msr_read(ecx, data); + trace_kvm_msr_read(ecx, msr_info.data); - kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff); - kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32); + kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, + msr_info.data & 0xffffffff); + kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, + msr_info.data >> 32); svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; skip_emulated_instruction(&svm->vcpu); } @@ -3388,6 +3397,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, + [SVM_EXIT_RSM] = emulate_on_interception, }; static void dump_vmcb(struct kvm_vcpu *vcpu) @@ -4073,6 +4083,11 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } +static bool svm_has_high_real_mode_segbase(void) +{ + return true; +} + static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; @@ -4317,7 +4332,9 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, break; } - vmcb->control.next_rip = info->next_rip; + /* TODO: Advertise NRIPS to guest hypervisor unconditionally */ + if (static_cpu_has(X86_FEATURE_NRIPS)) + vmcb->control.next_rip = info->next_rip; vmcb->control.exit_code = icpt_info.exit_code; vmexit = nested_svm_exit_handled(svm); @@ -4346,6 +4363,7 @@ static struct kvm_x86_ops svm_x86_ops = { .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, + .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase, .vcpu_create = svm_create_vcpu, .vcpu_free = svm_free_vcpu, @@ -4440,6 +4458,8 @@ static struct kvm_x86_ops svm_x86_ops = { .handle_external_intr = svm_handle_external_intr, .sched_in = svm_sched_in, + + .pmu_ops = &amd_pmu_ops, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 7c7bc8bef21f..4eae7c35ddf5 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -952,6 +952,28 @@ TRACE_EVENT(kvm_wait_lapic_expire, __entry->delta < 0 ? "early" : "late") ); +TRACE_EVENT(kvm_enter_smm, + TP_PROTO(unsigned int vcpu_id, u64 smbase, bool entering), + TP_ARGS(vcpu_id, smbase, entering), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( u64, smbase ) + __field( bool, entering ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->smbase = smbase; + __entry->entering = entering; + ), + + TP_printk("vcpu %u: %s SMM, smbase 0x%llx", + __entry->vcpu_id, + __entry->entering ? "entering" : "leaving", + __entry->smbase) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2d73807f0d31..e856dd566f4c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -28,7 +28,7 @@ #include <linux/sched.h> #include <linux/moduleparam.h> #include <linux/mod_devicetable.h> -#include <linux/ftrace_event.h> +#include <linux/trace_events.h> #include <linux/slab.h> #include <linux/tboot.h> #include <linux/hrtimer.h> @@ -40,14 +40,14 @@ #include <asm/vmx.h> #include <asm/virtext.h> #include <asm/mce.h> -#include <asm/i387.h> -#include <asm/xcr.h> +#include <asm/fpu/internal.h> #include <asm/perf_event.h> #include <asm/debugreg.h> #include <asm/kexec.h> #include <asm/apic.h> #include "trace.h" +#include "pmu.h" #define __ex(x) __kvm_handle_fault_on_reboot(x) #define __ex_clear(x, reg) \ @@ -786,7 +786,7 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr) { - struct page *page = gfn_to_page(vcpu->kvm, addr >> PAGE_SHIFT); + struct page *page = kvm_vcpu_gfn_to_page(vcpu, addr >> PAGE_SHIFT); if (is_error_page(page)) return NULL; @@ -1883,7 +1883,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) * If the FPU is not active (through the host task or * the guest vcpu), then restore the cr0.TS bit. */ - if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded) + if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded) stts(); load_gdt(this_cpu_ptr(&host_gdt)); } @@ -2170,8 +2170,7 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_nested; - else if (irqchip_in_kernel(vcpu->kvm) && - apic_x2apic_mode(vcpu->arch.apic)) { + else if (vcpu->arch.apic_base & X2APIC_ENABLE) { if (is_long_mode(vcpu)) msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else @@ -2623,76 +2622,69 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - u64 data; struct shared_msr_entry *msr; - if (!pdata) { - printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); - return -EINVAL; - } - - switch (msr_index) { + switch (msr_info->index) { #ifdef CONFIG_X86_64 case MSR_FS_BASE: - data = vmcs_readl(GUEST_FS_BASE); + msr_info->data = vmcs_readl(GUEST_FS_BASE); break; case MSR_GS_BASE: - data = vmcs_readl(GUEST_GS_BASE); + msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: vmx_load_host_state(to_vmx(vcpu)); - data = to_vmx(vcpu)->msr_guest_kernel_gs_base; + msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base; break; #endif case MSR_EFER: - return kvm_get_msr_common(vcpu, msr_index, pdata); + return kvm_get_msr_common(vcpu, msr_info); case MSR_IA32_TSC: - data = guest_read_tsc(); + msr_info->data = guest_read_tsc(); break; case MSR_IA32_SYSENTER_CS: - data = vmcs_read32(GUEST_SYSENTER_CS); + msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); break; case MSR_IA32_SYSENTER_EIP: - data = vmcs_readl(GUEST_SYSENTER_EIP); + msr_info->data = vmcs_readl(GUEST_SYSENTER_EIP); break; case MSR_IA32_SYSENTER_ESP: - data = vmcs_readl(GUEST_SYSENTER_ESP); + msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP); break; case MSR_IA32_BNDCFGS: if (!vmx_mpx_supported()) return 1; - data = vmcs_read64(GUEST_BNDCFGS); + msr_info->data = vmcs_read64(GUEST_BNDCFGS); break; case MSR_IA32_FEATURE_CONTROL: if (!nested_vmx_allowed(vcpu)) return 1; - data = to_vmx(vcpu)->nested.msr_ia32_feature_control; + msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; - return vmx_get_vmx_msr(vcpu, msr_index, pdata); + return vmx_get_vmx_msr(vcpu, msr_info->index, &msr_info->data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; - data = vcpu->arch.ia32_xss; + msr_info->data = vcpu->arch.ia32_xss; break; case MSR_TSC_AUX: if (!to_vmx(vcpu)->rdtscp_enabled) return 1; /* Otherwise falls through */ default: - msr = find_msr_entry(to_vmx(vcpu), msr_index); + msr = find_msr_entry(to_vmx(vcpu), msr_info->index); if (msr) { - data = msr->data; + msr_info->data = msr->data; break; } - return kvm_get_msr_common(vcpu, msr_index, pdata); + return kvm_get_msr_common(vcpu, msr_info); } - *pdata = data; return 0; } @@ -4123,7 +4115,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = APIC_DEFAULT_PHYS_BASE; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; @@ -4158,7 +4150,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); + r = __x86_set_memory_region(kvm, &kvm_userspace_mem); return r; } @@ -4667,16 +4659,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { - u32 msr_low, msr_high; - u64 host_pat; - rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); - host_pat = msr_low | ((u64) msr_high << 32); - /* Write the default value follow host pat */ - vmcs_write64(GUEST_IA32_PAT, host_pat); - /* Keep arch.pat sync with GUEST_IA32_PAT */ - vmx->vcpu.arch.pat = host_pat; - } + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) + vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { u32 index = vmx_msr_index[i]; @@ -4708,22 +4692,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) return 0; } -static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) +static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct msr_data apic_base_msr; + u64 cr0; vmx->rmode.vm86_active = 0; vmx->soft_vnmi_blocked = 0; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - kvm_set_cr8(&vmx->vcpu, 0); - apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(&vmx->vcpu)) - apic_base_msr.data |= MSR_IA32_APICBASE_BSP; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(&vmx->vcpu, &apic_base_msr); + kvm_set_cr8(vcpu, 0); + + if (!init_event) { + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; + apic_base_msr.host_initiated = true; + kvm_set_apic_base(vcpu, &apic_base_msr); + } vmx_segment_cache_clear(vmx); @@ -4747,9 +4736,12 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); + if (!init_event) { + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + } vmcs_writel(GUEST_RFLAGS, 0x02); kvm_rip_write(vcpu, 0xfff0); @@ -4764,18 +4756,15 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); - /* Special registers */ - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - setup_msrs(vmx); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - if (cpu_has_vmx_tpr_shadow()) { + if (cpu_has_vmx_tpr_shadow() && !init_event) { vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (vm_need_tpr_shadow(vmx->vcpu.kvm)) + if (vm_need_tpr_shadow(vcpu->kvm)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vmx->vcpu.arch.apic->regs)); + __pa(vcpu->arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); } @@ -4787,12 +4776,14 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu) if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ - vmx_set_cr4(&vmx->vcpu, 0); - vmx_set_efer(&vmx->vcpu, 0); - vmx_fpu_activate(&vmx->vcpu); - update_exception_bitmap(&vmx->vcpu); + cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + vmx_set_cr0(vcpu, cr0); /* enter rmode */ + vmx->vcpu.arch.cr0 = cr0; + vmx_set_cr4(vcpu, 0); + if (!init_event) + vmx_set_efer(vcpu, 0); + vmx_fpu_activate(vcpu); + update_exception_bitmap(vcpu); vpid_sync_context(vmx); } @@ -4965,7 +4956,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) .flags = 0, }; - ret = kvm_set_memory_region(kvm, &tss_mem); + ret = x86_set_memory_region(kvm, &tss_mem); if (ret) return ret; kvm->arch.tss_addr = addr; @@ -5475,19 +5466,21 @@ static int handle_cpuid(struct kvm_vcpu *vcpu) static int handle_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - u64 data; + struct msr_data msr_info; - if (vmx_get_msr(vcpu, ecx, &data)) { + msr_info.index = ecx; + msr_info.host_initiated = false; + if (vmx_get_msr(vcpu, &msr_info)) { trace_kvm_msr_read_ex(ecx); kvm_inject_gp(vcpu, 0); return 1; } - trace_kvm_msr_read(ecx, data); + trace_kvm_msr_read(ecx, msr_info.data); /* FIXME: handling of bits 32:63 of rax, rdx */ - vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; - vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; + vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; + vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; skip_emulated_instruction(vcpu); return 1; } @@ -5710,9 +5703,6 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) return 0; } - /* clear all local breakpoint enable flags */ - vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155); - /* * TODO: What about debug traps on tss switch? * Are we supposed to inject them and update dr6? @@ -7333,7 +7323,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, bitmap += (port & 0x7fff) / 8; if (last_bitmap != bitmap) - if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1)) + if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) return true; if (b & (1 << (port & 7))) return true; @@ -7377,7 +7367,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, /* Then read the msr_index'th bit from this bitmap: */ if (msr_index < 1024*8) { unsigned char b; - if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1)) + if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) return true; return 1 & (b >> (msr_index & 7)); } else @@ -7642,9 +7632,9 @@ static void vmx_disable_pml(struct vcpu_vmx *vmx) vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } -static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) +static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vmx->vcpu.kvm; + struct vcpu_vmx *vmx = to_vmx(vcpu); u64 *pml_buf; u16 pml_idx; @@ -7666,7 +7656,7 @@ static void vmx_flush_pml_buffer(struct vcpu_vmx *vmx) gpa = pml_buf[pml_idx]; WARN_ON(gpa & (PAGE_SIZE - 1)); - mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); } /* reset PML index */ @@ -7691,6 +7681,158 @@ static void kvm_flush_pml_buffers(struct kvm *kvm) kvm_vcpu_kick(vcpu); } +static void vmx_dump_sel(char *name, uint32_t sel) +{ + pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(sel), + vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR), + vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR), + vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR)); +} + +static void vmx_dump_dtsel(char *name, uint32_t limit) +{ + pr_err("%s limit=0x%08x, base=0x%016lx\n", + name, vmcs_read32(limit), + vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); +} + +static void dump_vmcs(void) +{ + u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); + u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); + u32 cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + u32 pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL); + u32 secondary_exec_control = 0; + unsigned long cr4 = vmcs_readl(GUEST_CR4); + u64 efer = vmcs_readl(GUEST_IA32_EFER); + int i, n; + + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + pr_err("*** Guest State ***\n"); + pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW), + vmcs_readl(CR0_GUEST_HOST_MASK)); + pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n", + cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK)); + pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) && + (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA)) + { + pr_err("PDPTR0 = 0x%016lx PDPTR1 = 0x%016lx\n", + vmcs_readl(GUEST_PDPTR0), vmcs_readl(GUEST_PDPTR1)); + pr_err("PDPTR2 = 0x%016lx PDPTR3 = 0x%016lx\n", + vmcs_readl(GUEST_PDPTR2), vmcs_readl(GUEST_PDPTR3)); + } + pr_err("RSP = 0x%016lx RIP = 0x%016lx\n", + vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP)); + pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n", + vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(GUEST_SYSENTER_ESP), + vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP)); + vmx_dump_sel("CS: ", GUEST_CS_SELECTOR); + vmx_dump_sel("DS: ", GUEST_DS_SELECTOR); + vmx_dump_sel("SS: ", GUEST_SS_SELECTOR); + vmx_dump_sel("ES: ", GUEST_ES_SELECTOR); + vmx_dump_sel("FS: ", GUEST_FS_SELECTOR); + vmx_dump_sel("GS: ", GUEST_GS_SELECTOR); + vmx_dump_dtsel("GDTR:", GUEST_GDTR_LIMIT); + vmx_dump_sel("LDTR:", GUEST_LDTR_SELECTOR); + vmx_dump_dtsel("IDTR:", GUEST_IDTR_LIMIT); + vmx_dump_sel("TR: ", GUEST_TR_SELECTOR); + if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) || + (vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER))) + pr_err("EFER = 0x%016llx PAT = 0x%016lx\n", + efer, vmcs_readl(GUEST_IA32_PAT)); + pr_err("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n", + vmcs_readl(GUEST_IA32_DEBUGCTL), + vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS)); + if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + pr_err("PerfGlobCtl = 0x%016lx\n", + vmcs_readl(GUEST_IA32_PERF_GLOBAL_CTRL)); + if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) + pr_err("BndCfgS = 0x%016lx\n", vmcs_readl(GUEST_BNDCFGS)); + pr_err("Interruptibility = %08x ActivityState = %08x\n", + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO), + vmcs_read32(GUEST_ACTIVITY_STATE)); + if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) + pr_err("InterruptStatus = %04x\n", + vmcs_read16(GUEST_INTR_STATUS)); + + pr_err("*** Host State ***\n"); + pr_err("RIP = 0x%016lx RSP = 0x%016lx\n", + vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP)); + pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n", + vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR), + vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR), + vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR), + vmcs_read16(HOST_TR_SELECTOR)); + pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n", + vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE), + vmcs_readl(HOST_TR_BASE)); + pr_err("GDTBase=%016lx IDTBase=%016lx\n", + vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE)); + pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n", + vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3), + vmcs_readl(HOST_CR4)); + pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n", + vmcs_readl(HOST_IA32_SYSENTER_ESP), + vmcs_read32(HOST_IA32_SYSENTER_CS), + vmcs_readl(HOST_IA32_SYSENTER_EIP)); + if (vmexit_ctl & (VM_EXIT_LOAD_IA32_PAT | VM_EXIT_LOAD_IA32_EFER)) + pr_err("EFER = 0x%016lx PAT = 0x%016lx\n", + vmcs_readl(HOST_IA32_EFER), vmcs_readl(HOST_IA32_PAT)); + if (vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) + pr_err("PerfGlobCtl = 0x%016lx\n", + vmcs_readl(HOST_IA32_PERF_GLOBAL_CTRL)); + + pr_err("*** Control State ***\n"); + pr_err("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n", + pin_based_exec_ctrl, cpu_based_exec_ctrl, secondary_exec_control); + pr_err("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl); + pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n", + vmcs_read32(EXCEPTION_BITMAP), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK), + vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH)); + pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE), + vmcs_read32(VM_ENTRY_INSTRUCTION_LEN)); + pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n", + vmcs_read32(VM_EXIT_INTR_INFO), + vmcs_read32(VM_EXIT_INTR_ERROR_CODE), + vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); + pr_err(" reason=%08x qualification=%016lx\n", + vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION)); + pr_err("IDTVectoring: info=%08x errcode=%08x\n", + vmcs_read32(IDT_VECTORING_INFO_FIELD), + vmcs_read32(IDT_VECTORING_ERROR_CODE)); + pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET)); + if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) + pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); + if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) + pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV)); + if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)) + pr_err("EPT pointer = 0x%016lx\n", vmcs_readl(EPT_POINTER)); + n = vmcs_read32(CR3_TARGET_COUNT); + for (i = 0; i + 1 < n; i += 4) + pr_err("CR3 target%u=%016lx target%u=%016lx\n", + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2), + i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2)); + if (i < n) + pr_err("CR3 target%u=%016lx\n", + i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2)); + if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) + pr_err("PLE Gap=%08x Window=%08x\n", + vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW)); + if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID) + pr_err("Virtual processor ID = 0x%04x\n", + vmcs_read16(VIRTUAL_PROCESSOR_ID)); +} + /* * The guest has exited. See if we can fix it or if we need userspace * assistance. @@ -7709,7 +7851,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) * flushed already. */ if (enable_pml) - vmx_flush_pml_buffer(vmx); + vmx_flush_pml_buffer(vcpu); /* If guest state is invalid, start emulating */ if (vmx->emulation_required) @@ -7723,6 +7865,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) } if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + dump_vmcs(); vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason; @@ -7996,6 +8139,11 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) local_irq_enable(); } +static bool vmx_has_high_real_mode_segbase(void) +{ + return enable_unrestricted_guest || emulate_invalid_guest_state; +} + static bool vmx_mpx_supported(void) { return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && @@ -8480,7 +8628,8 @@ static int get_ept_level(void) static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { - u64 ret; + u8 cache; + u64 ipat = 0; /* For VT-d and EPT combination * 1. MMIO: always map as UC @@ -8493,16 +8642,27 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep * consistent with host MTRR */ - if (is_mmio) - ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT; - else if (kvm_arch_has_noncoherent_dma(vcpu->kvm)) - ret = kvm_get_guest_memory_type(vcpu, gfn) << - VMX_EPT_MT_EPTE_SHIFT; - else - ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) - | VMX_EPT_IPAT_BIT; + if (is_mmio) { + cache = MTRR_TYPE_UNCACHABLE; + goto exit; + } - return ret; + if (!kvm_arch_has_noncoherent_dma(vcpu->kvm)) { + ipat = VMX_EPT_IPAT_BIT; + cache = MTRR_TYPE_WRBACK; + goto exit; + } + + if (kvm_read_cr0(vcpu) & X86_CR0_CD) { + ipat = VMX_EPT_IPAT_BIT; + cache = MTRR_TYPE_UNCACHABLE; + goto exit; + } + + cache = kvm_mtrr_get_guest_memory_type(vcpu, gfn); + +exit: + return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } static int vmx_get_lpage_level(void) @@ -8924,7 +9084,7 @@ static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu, struct vmx_msr_entry *e) { /* x2APIC MSR accesses are not allowed */ - if (apic_x2apic_mode(vcpu->arch.apic) && e->index >> 8 == 0x8) + if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8) return -EINVAL; if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */ e->index == MSR_IA32_UCODE_REV) @@ -8966,8 +9126,8 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) msr.host_initiated = false; for (i = 0; i < count; i++) { - if (kvm_read_guest(vcpu->kvm, gpa + i * sizeof(e), - &e, sizeof(e))) { + if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e), + &e, sizeof(e))) { pr_warn_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); @@ -8999,9 +9159,10 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) struct vmx_msr_entry e; for (i = 0; i < count; i++) { - if (kvm_read_guest(vcpu->kvm, - gpa + i * sizeof(e), - &e, 2 * sizeof(u32))) { + struct msr_data msr_info; + if (kvm_vcpu_read_guest(vcpu, + gpa + i * sizeof(e), + &e, 2 * sizeof(u32))) { pr_warn_ratelimited( "%s cannot read MSR entry (%u, 0x%08llx)\n", __func__, i, gpa + i * sizeof(e)); @@ -9013,19 +9174,21 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) __func__, i, e.index, e.reserved); return -EINVAL; } - if (kvm_get_msr(vcpu, e.index, &e.value)) { + msr_info.host_initiated = false; + msr_info.index = e.index; + if (kvm_get_msr(vcpu, &msr_info)) { pr_warn_ratelimited( "%s cannot read MSR (%u, 0x%x)\n", __func__, i, e.index); return -EINVAL; } - if (kvm_write_guest(vcpu->kvm, - gpa + i * sizeof(e) + - offsetof(struct vmx_msr_entry, value), - &e.value, sizeof(e.value))) { + if (kvm_vcpu_write_guest(vcpu, + gpa + i * sizeof(e) + + offsetof(struct vmx_msr_entry, value), + &msr_info.data, sizeof(msr_info.data))) { pr_warn_ratelimited( "%s cannot write MSR (%u, 0x%x, 0x%llx)\n", - __func__, i, e.index, e.value); + __func__, i, e.index, msr_info.data); return -EINVAL; } } @@ -10150,6 +10313,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, .cpu_has_accelerated_tpr = report_flexpriority, + .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, @@ -10255,6 +10419,8 @@ static struct kvm_x86_ops vmx_x86_ops = { .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + + .pmu_ops = &intel_pmu_ops, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ea306adbbc13..ac165c2fb8e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -28,6 +28,7 @@ #include "x86.h" #include "cpuid.h" #include "assigned-dev.h" +#include "pmu.h" #include <linux/clocksource.h> #include <linux/interrupt.h> @@ -57,11 +58,9 @@ #include <asm/debugreg.h> #include <asm/msr.h> #include <asm/desc.h> -#include <asm/mtrr.h> #include <asm/mce.h> -#include <asm/i387.h> -#include <asm/fpu-internal.h> /* Ugh! */ -#include <asm/xcr.h> +#include <linux/kernel_stat.h> +#include <asm/fpu/internal.h> /* Ugh! */ #include <asm/pvclock.h> #include <asm/div64.h> @@ -99,6 +98,9 @@ module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); unsigned int min_timer_period_us = 500; module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); +static bool __read_mostly kvmclock_periodic_sync = true; +module_param(kvmclock_periodic_sync, bool, S_IRUGO); + bool kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 kvm_max_guest_tsc_khz; @@ -475,7 +477,7 @@ EXPORT_SYMBOL_GPL(kvm_require_dr); /* * This function will be used to read from the physical memory of the currently - * running guest. The difference to kvm_read_guest_page is that this function + * running guest. The difference to kvm_vcpu_read_guest_page is that this function * can read from guest physical or from the guest's guest physical memory. */ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, @@ -493,7 +495,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, real_gfn = gpa_to_gfn(real_gfn); - return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); + return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len); } EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); @@ -572,8 +574,7 @@ out: int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); - unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | - X86_CR0_CD | X86_CR0_NW; + unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; @@ -619,6 +620,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); + + if ((cr0 ^ old_cr0) & X86_CR0_CD) + kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr0); @@ -908,7 +913,7 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu) u64 data; int err; - err = kvm_pmu_read_pmc(vcpu, ecx, &data); + err = kvm_pmu_rdpmc(vcpu, ecx, &data); if (err) return err; kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); @@ -923,17 +928,11 @@ EXPORT_SYMBOL_GPL(kvm_rdpmc); * * This list is modified at module load time to reflect the * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in the beginning of the list. + * kvm-specific. Those are put in emulated_msrs; filtering of emulated_msrs + * may depend on host virtualization features rather than host cpu features. */ -#define KVM_SAVE_MSRS_BEGIN 12 static u32 msrs_to_save[] = { - MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, - HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, - HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, - HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -945,14 +944,24 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; -static const u32 emulated_msrs[] = { +static u32 emulated_msrs[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC, + HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, + MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, + MSR_IA32_SMBASE, }; +static unsigned num_emulated_msrs; + bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) { if (efer & efer_reserved_bits) @@ -1046,6 +1055,21 @@ EXPORT_SYMBOL_GPL(kvm_set_msr); /* * Adapt set_msr() to msr_io()'s calling convention */ +static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) +{ + struct msr_data msr; + int r; + + msr.index = index; + msr.host_initiated = true; + r = kvm_get_msr(vcpu, &msr); + if (r) + return r; + + *data = msr.data; + return 0; +} + static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { struct msr_data msr; @@ -1698,6 +1722,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) vcpu->pvclock_set_guest_stopped_request = false; } + pvclock_flags |= PVCLOCK_COUNTS_FROM_ZERO; + /* If the host uses TSC clocksource, then it is stable */ if (use_master_clock) pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; @@ -1768,127 +1794,14 @@ static void kvmclock_sync_fn(struct work_struct *work) kvmclock_sync_work); struct kvm *kvm = container_of(ka, struct kvm, arch); + if (!kvmclock_periodic_sync) + return; + schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); } -static bool msr_mtrr_valid(unsigned msr) -{ - switch (msr) { - case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: - case MSR_MTRRfix64K_00000: - case MSR_MTRRfix16K_80000: - case MSR_MTRRfix16K_A0000: - case MSR_MTRRfix4K_C0000: - case MSR_MTRRfix4K_C8000: - case MSR_MTRRfix4K_D0000: - case MSR_MTRRfix4K_D8000: - case MSR_MTRRfix4K_E0000: - case MSR_MTRRfix4K_E8000: - case MSR_MTRRfix4K_F0000: - case MSR_MTRRfix4K_F8000: - case MSR_MTRRdefType: - case MSR_IA32_CR_PAT: - return true; - case 0x2f8: - return true; - } - return false; -} - -static bool valid_pat_type(unsigned t) -{ - return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */ -} - -static bool valid_mtrr_type(unsigned t) -{ - return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ -} - -bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - int i; - u64 mask; - - if (!msr_mtrr_valid(msr)) - return false; - - if (msr == MSR_IA32_CR_PAT) { - for (i = 0; i < 8; i++) - if (!valid_pat_type((data >> (i * 8)) & 0xff)) - return false; - return true; - } else if (msr == MSR_MTRRdefType) { - if (data & ~0xcff) - return false; - return valid_mtrr_type(data & 0xff); - } else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) { - for (i = 0; i < 8 ; i++) - if (!valid_mtrr_type((data >> (i * 8)) & 0xff)) - return false; - return true; - } - - /* variable MTRRs */ - WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); - - mask = (~0ULL) << cpuid_maxphyaddr(vcpu); - if ((msr & 1) == 0) { - /* MTRR base */ - if (!valid_mtrr_type(data & 0xff)) - return false; - mask |= 0xf00; - } else - /* MTRR mask */ - mask |= 0x7ff; - if (data & mask) { - kvm_inject_gp(vcpu, 0); - return false; - } - - return true; -} -EXPORT_SYMBOL_GPL(kvm_mtrr_valid); - -static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) -{ - u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - - if (!kvm_mtrr_valid(vcpu, msr, data)) - return 1; - - if (msr == MSR_MTRRdefType) { - vcpu->arch.mtrr_state.def_type = data; - vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; - } else if (msr == MSR_MTRRfix64K_00000) - p[0] = data; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - p[1 + msr - MSR_MTRRfix16K_80000] = data; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - p[3 + msr - MSR_MTRRfix4K_C0000] = data; - else if (msr == MSR_IA32_CR_PAT) - vcpu->arch.pat = data; - else { /* Variable MTRRs */ - int idx, is_mtrr_mask; - u64 *pt; - - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; - if (!is_mtrr_mask) - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; - else - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; - *pt = data; - } - - kvm_mmu_reset_context(vcpu); - return 0; -} - static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data) { u64 mcg_cap = vcpu->arch.mcg_cap; @@ -1947,7 +1860,7 @@ static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data) r = PTR_ERR(page); goto out; } - if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE)) + if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) goto out_free; r = 0; out_free: @@ -2047,13 +1960,13 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; } gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT; - addr = gfn_to_hva(vcpu->kvm, gfn); + addr = kvm_vcpu_gfn_to_hva(vcpu, gfn); if (kvm_is_error_hva(addr)) return 1; if (__clear_user((void __user *)addr, PAGE_SIZE)) return 1; vcpu->arch.hv_vapic = data; - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED)) return 1; break; @@ -2180,7 +2093,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) __func__, data); break; case 0x200 ... 0x2ff: - return set_msr_mtrr(vcpu, msr, data); + return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: @@ -2200,6 +2113,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; + case MSR_IA32_SMBASE: + if (!msr_info->host_initiated) + return 1; + vcpu->arch.smbase = data; + break; case MSR_KVM_WALL_CLOCK_NEW: case MSR_KVM_WALL_CLOCK: vcpu->kvm->arch.wall_clock = data; @@ -2220,6 +2138,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) &vcpu->requests); ka->boot_vcpu_runs_old_kvmclock = tmp; + + ka->kvmclock_offset = -get_kernel_ns(); } vcpu->arch.time = data; @@ -2281,37 +2201,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: return set_msr_mce(vcpu, msr, data); - /* Performance counters are not protected by a CPUID bit, - * so we should check all of them in the generic path for the sake of - * cross vendor migration. - * Writing a zero into the event select MSRs disables them, - * which we perfectly emulate ;-). Any other value should be at least - * reported, some guests depend on them. - */ - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - if (data != 0) - vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); - break; - /* at least RHEL 4 unconditionally writes to the perfctr registers, - * so we ignore writes to make it happy. - */ - case MSR_K7_PERFCTR0: - case MSR_K7_PERFCTR1: - case MSR_K7_PERFCTR2: - case MSR_K7_PERFCTR3: - vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); - break; - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - pr = true; - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - if (kvm_pmu_msr(vcpu, msr)) + case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: + case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: + pr = true; /* fall through */ + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: + case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: + if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (pr || data != 0) @@ -2357,7 +2252,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); - if (kvm_pmu_msr(vcpu, msr)) + if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", @@ -2379,48 +2274,12 @@ EXPORT_SYMBOL_GPL(kvm_set_msr_common); * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { - return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); + return kvm_x86_ops->get_msr(vcpu, msr); } EXPORT_SYMBOL_GPL(kvm_get_msr); -static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) -{ - u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; - - if (!msr_mtrr_valid(msr)) - return 1; - - if (msr == MSR_MTRRdefType) - *pdata = vcpu->arch.mtrr_state.def_type + - (vcpu->arch.mtrr_state.enabled << 10); - else if (msr == MSR_MTRRfix64K_00000) - *pdata = p[0]; - else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) - *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; - else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) - *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; - else if (msr == MSR_IA32_CR_PAT) - *pdata = vcpu->arch.pat; - else { /* Variable MTRRs */ - int idx, is_mtrr_mask; - u64 *pt; - - idx = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * idx; - if (!is_mtrr_mask) - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; - else - pt = - (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; - *pdata = *pt; - } - - return 0; -} - static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -2518,11 +2377,11 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 0; } -int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { u64 data; - switch (msr) { + switch (msr_info->index) { case MSR_IA32_PLATFORM_ID: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: @@ -2533,38 +2392,28 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_K8_SYSCFG: case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: - case MSR_K7_EVNTSEL0: - case MSR_K7_EVNTSEL1: - case MSR_K7_EVNTSEL2: - case MSR_K7_EVNTSEL3: - case MSR_K7_PERFCTR0: - case MSR_K7_PERFCTR1: - case MSR_K7_PERFCTR2: - case MSR_K7_PERFCTR3: case MSR_K8_INT_PENDING_MSG: case MSR_AMD64_NB_CFG: case MSR_FAM10H_MMIO_CONF_BASE: case MSR_AMD64_BU_CFG2: - data = 0; + msr_info->data = 0; break; - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: - if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_get_msr(vcpu, msr, pdata); - data = 0; + case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: + case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: + case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); + msr_info->data = 0; break; case MSR_IA32_UCODE_REV: - data = 0x100000000ULL; + msr_info->data = 0x100000000ULL; break; case MSR_MTRRcap: - data = 0x500 | KVM_NR_VAR_MTRR; - break; case 0x200 ... 0x2ff: - return get_msr_mtrr(vcpu, msr, pdata); + return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ - data = 3; + msr_info->data = 3; break; /* * MSR_EBC_FREQUENCY_ID @@ -2578,48 +2427,53 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * multiplying by zero otherwise. */ case MSR_EBC_FREQUENCY_ID: - data = 1 << 24; + msr_info->data = 1 << 24; break; case MSR_IA32_APICBASE: - data = kvm_get_apic_base(vcpu); + msr_info->data = kvm_get_apic_base(vcpu); break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: - return kvm_x2apic_msr_read(vcpu, msr, pdata); + return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_TSCDEADLINE: - data = kvm_get_lapic_tscdeadline_msr(vcpu); + msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; case MSR_IA32_TSC_ADJUST: - data = (u64)vcpu->arch.ia32_tsc_adjust_msr; + msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; break; case MSR_IA32_MISC_ENABLE: - data = vcpu->arch.ia32_misc_enable_msr; + msr_info->data = vcpu->arch.ia32_misc_enable_msr; + break; + case MSR_IA32_SMBASE: + if (!msr_info->host_initiated) + return 1; + msr_info->data = vcpu->arch.smbase; break; case MSR_IA32_PERF_STATUS: /* TSC increment by tick */ - data = 1000ULL; + msr_info->data = 1000ULL; /* CPU multiplier */ data |= (((uint64_t)4ULL) << 40); break; case MSR_EFER: - data = vcpu->arch.efer; + msr_info->data = vcpu->arch.efer; break; case MSR_KVM_WALL_CLOCK: case MSR_KVM_WALL_CLOCK_NEW: - data = vcpu->kvm->arch.wall_clock; + msr_info->data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: case MSR_KVM_SYSTEM_TIME_NEW: - data = vcpu->arch.time; + msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: - data = vcpu->arch.apf.msr_val; + msr_info->data = vcpu->arch.apf.msr_val; break; case MSR_KVM_STEAL_TIME: - data = vcpu->arch.st.msr_val; + msr_info->data = vcpu->arch.st.msr_val; break; case MSR_KVM_PV_EOI_EN: - data = vcpu->arch.pv_eoi.msr_val; + msr_info->data = vcpu->arch.pv_eoi.msr_val; break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: @@ -2627,7 +2481,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: - return get_msr_mce(vcpu, msr, pdata); + return get_msr_mce(vcpu, msr_info->index, &msr_info->data); case MSR_K7_CLK_CTL: /* * Provide expected ramp-up count for K7. All other @@ -2638,17 +2492,17 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * type 6, model 8 and higher from exploding due to * the rdmsr failing. */ - data = 0x20000000; + msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: - if (kvm_hv_msr_partition_wide(msr)) { + if (kvm_hv_msr_partition_wide(msr_info->index)) { int r; mutex_lock(&vcpu->kvm->lock); - r = get_msr_hyperv_pw(vcpu, msr, pdata); + r = get_msr_hyperv_pw(vcpu, msr_info->index, &msr_info->data); mutex_unlock(&vcpu->kvm->lock); return r; } else - return get_msr_hyperv(vcpu, msr, pdata); + return get_msr_hyperv(vcpu, msr_info->index, &msr_info->data); break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current @@ -2661,31 +2515,30 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) * L2 cache control register 3: 64GB range, 256KB size, * enabled, latency 0x1, configured */ - data = 0xbe702111; + msr_info->data = 0xbe702111; break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has_osvw(vcpu)) return 1; - data = vcpu->arch.osvw.length; + msr_info->data = vcpu->arch.osvw.length; break; case MSR_AMD64_OSVW_STATUS: if (!guest_cpuid_has_osvw(vcpu)) return 1; - data = vcpu->arch.osvw.status; + msr_info->data = vcpu->arch.osvw.status; break; default: - if (kvm_pmu_msr(vcpu, msr)) - return kvm_pmu_get_msr(vcpu, msr, pdata); + if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) + return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr_info->index); return 1; } else { - vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); - data = 0; + vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); + msr_info->data = 0; } break; } - *pdata = data; return 0; } EXPORT_SYMBOL_GPL(kvm_get_msr_common); @@ -2798,12 +2651,25 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_TIME: case KVM_CAP_IOAPIC_POLARITY_IGNORED: case KVM_CAP_TSC_DEADLINE_TIMER: + case KVM_CAP_ENABLE_CAP_VM: + case KVM_CAP_DISABLE_QUIRKS: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: #endif r = 1; break; + case KVM_CAP_X86_SMM: + /* SMBASE is usually relocated above 1M on modern chipsets, + * and SMM handlers might indeed rely on 4G segment limits, + * so do not report SMM to be available if real mode is + * emulated via vm86 mode. Still, do not go to great lengths + * to avoid userspace's usage of the feature, because it is a + * fringe case that is not enabled except via specific settings + * of the module parameters. + */ + r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); + break; case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; break; @@ -2860,7 +2726,7 @@ long kvm_arch_dev_ioctl(struct file *filp, if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list)) goto out; n = msr_list.nmsrs; - msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs); + msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list)) goto out; r = -E2BIG; @@ -2872,7 +2738,7 @@ long kvm_arch_dev_ioctl(struct file *filp, goto out; if (copy_to_user(user_msr_list->indices + num_msrs_to_save, &emulated_msrs, - ARRAY_SIZE(emulated_msrs) * sizeof(u32))) + num_emulated_msrs * sizeof(u32))) goto out; r = 0; break; @@ -3016,6 +2882,13 @@ static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) return 0; } +static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu) +{ + kvm_make_request(KVM_REQ_SMI, vcpu); + + return 0; +} + static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { @@ -3121,8 +2994,15 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->sipi_vector = 0; /* never valid when reporting to user space */ + events->smi.smm = is_smm(vcpu); + events->smi.pending = vcpu->arch.smi_pending; + events->smi.smm_inside_nmi = + !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); + events->smi.latched_init = kvm_lapic_latched_init(vcpu); + events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING - | KVM_VCPUEVENT_VALID_SHADOW); + | KVM_VCPUEVENT_VALID_SHADOW + | KVM_VCPUEVENT_VALID_SMM); memset(&events->reserved, 0, sizeof(events->reserved)); } @@ -3131,7 +3011,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, { if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR - | KVM_VCPUEVENT_VALID_SHADOW)) + | KVM_VCPUEVENT_VALID_SHADOW + | KVM_VCPUEVENT_VALID_SMM)) return -EINVAL; process_nmi(vcpu); @@ -3156,6 +3037,24 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, kvm_vcpu_has_lapic(vcpu)) vcpu->arch.apic->sipi_vector = events->sipi_vector; + if (events->flags & KVM_VCPUEVENT_VALID_SMM) { + if (events->smi.smm) + vcpu->arch.hflags |= HF_SMM_MASK; + else + vcpu->arch.hflags &= ~HF_SMM_MASK; + vcpu->arch.smi_pending = events->smi.pending; + if (events->smi.smm_inside_nmi) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; + else + vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; + if (kvm_vcpu_has_lapic(vcpu)) { + if (events->smi.latched_init) + set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + else + clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events); + } + } + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; @@ -3194,8 +3093,8 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) { - struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; - u64 xstate_bv = xsave->xsave_hdr.xstate_bv; + struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; + u64 xstate_bv = xsave->header.xfeatures; u64 valid; /* @@ -3230,7 +3129,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) { - struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave; + struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave; u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET); u64 valid; @@ -3241,9 +3140,9 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) memcpy(xsave, src, XSAVE_HDR_OFFSET); /* Set XSTATE_BV and possibly XCOMP_BV. */ - xsave->xsave_hdr.xstate_bv = xstate_bv; + xsave->header.xfeatures = xstate_bv; if (cpu_has_xsaves) - xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; + xsave->header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* * Copy each region from the non-compacted offset to the @@ -3275,8 +3174,8 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, fill_xsave((u8 *) guest_xsave->region, vcpu); } else { memcpy(guest_xsave->region, - &vcpu->arch.guest_fpu.state->fxsave, - sizeof(struct i387_fxsave_struct)); + &vcpu->arch.guest_fpu.state.fxsave, + sizeof(struct fxregs_state)); *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = XSTATE_FPSSE; } @@ -3300,8 +3199,8 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, } else { if (xstate_bv & ~XSTATE_FPSSE) return -EINVAL; - memcpy(&vcpu->arch.guest_fpu.state->fxsave, - guest_xsave->region, sizeof(struct i387_fxsave_struct)); + memcpy(&vcpu->arch.guest_fpu.state.fxsave, + guest_xsave->region, sizeof(struct fxregs_state)); } return 0; } @@ -3415,6 +3314,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_nmi(vcpu); break; } + case KVM_SMI: { + r = kvm_vcpu_ioctl_smi(vcpu); + break; + } case KVM_SET_CPUID: { struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; @@ -3454,7 +3357,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; } case KVM_GET_MSRS: - r = msr_io(vcpu, argp, kvm_get_msr, 1); + r = msr_io(vcpu, argp, do_get_msr, 1); break; case KVM_SET_MSRS: r = msr_io(vcpu, argp, do_set_msr, 0); @@ -3845,6 +3748,26 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, return 0; } +static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + int r; + + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_DISABLE_QUIRKS: + kvm->arch.disabled_quirks = cap->args[0]; + r = 0; + break; + default: + r = -EINVAL; + break; + } + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4097,7 +4020,15 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_ENABLE_CAP: { + struct kvm_enable_cap cap; + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vm_ioctl_enable_cap(kvm, &cap); + break; + } default: r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); } @@ -4110,8 +4041,7 @@ static void kvm_init_msr_list(void) u32 dummy[2]; unsigned i, j; - /* skip the first msrs in the list. KVM-specific */ - for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { + for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; @@ -4136,6 +4066,22 @@ static void kvm_init_msr_list(void) j++; } num_msrs_to_save = j; + + for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { + switch (emulated_msrs[i]) { + case MSR_IA32_SMBASE: + if (!kvm_x86_ops->cpu_has_high_real_mode_segbase()) + continue; + break; + default: + break; + } + + if (j < i) + emulated_msrs[j] = emulated_msrs[i]; + j++; + } + num_emulated_msrs = j; } static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, @@ -4253,8 +4199,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data, - offset, toread); + ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data, + offset, toread); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; @@ -4287,8 +4233,8 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, offset = addr & (PAGE_SIZE-1); if (WARN_ON(offset + bytes > PAGE_SIZE)) bytes = (unsigned)PAGE_SIZE - offset; - ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val, - offset, bytes); + ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val, + offset, bytes); if (unlikely(ret < 0)) return X86EMUL_IO_NEEDED; @@ -4334,7 +4280,7 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); + ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; @@ -4387,7 +4333,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, { int ret; - ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes); if (ret < 0) return 0; kvm_mmu_pte_write(vcpu, gpa, val, bytes); @@ -4421,7 +4367,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - return !kvm_read_guest(vcpu->kvm, gpa, val, bytes); + return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes); } static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -4619,7 +4565,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK)) goto emul_write; - page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); + page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT); if (is_error_page(page)) goto emul_write; @@ -4647,7 +4593,7 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, if (!exchanged) return X86EMUL_CMPXCHG_FAILED; - mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT); + kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT); kvm_mmu_pte_write(vcpu, gpa, new, bytes); return X86EMUL_CONTINUE; @@ -4946,7 +4892,17 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata) { - return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); + struct msr_data msr; + int r; + + msr.index = msr_index; + msr.host_initiated = false; + r = kvm_get_msr(emul_to_vcpu(ctxt), &msr); + if (r) + return r; + + *pdata = msr.data; + return 0; } static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, @@ -4960,16 +4916,30 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, return kvm_set_msr(emul_to_vcpu(ctxt), &msr); } +static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + return vcpu->arch.smbase; +} + +static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) +{ + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + + vcpu->arch.smbase = smbase; +} + static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc); + return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc); } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata) { - return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata); + return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata); } static void emulator_halt(struct x86_emulate_ctxt *ctxt) @@ -5045,6 +5015,8 @@ static const struct x86_emulate_ops emulate_ops = { .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, + .get_smbase = emulator_get_smbase, + .set_smbase = emulator_set_smbase, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, .check_pmc = emulator_check_pmc, @@ -5106,7 +5078,10 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - ctxt->guest_mode = is_guest_mode(vcpu); + BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); + BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); + BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); + ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; @@ -5275,6 +5250,34 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, static int complete_emulated_mmio(struct kvm_vcpu *vcpu); static int complete_emulated_pio(struct kvm_vcpu *vcpu); +static void kvm_smm_changed(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.hflags & HF_SMM_MASK)) { + /* This is a good place to trace that we are exiting SMM. */ + trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false); + + if (unlikely(vcpu->arch.smi_pending)) { + kvm_make_request(KVM_REQ_SMI, vcpu); + vcpu->arch.smi_pending = 0; + } else { + /* Process a latched INIT, if any. */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + } + } + + kvm_mmu_reset_context(vcpu); +} + +static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) +{ + unsigned changed = vcpu->arch.hflags ^ emul_flags; + + vcpu->arch.hflags = emul_flags; + + if (changed & HF_SMM_MASK) + kvm_smm_changed(vcpu); +} + static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db) { @@ -5474,6 +5477,8 @@ restart: unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + if (vcpu->arch.hflags != ctxt->emul_flags) + kvm_set_hflags(vcpu, ctxt->emul_flags); kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) kvm_vcpu_check_singlestep(vcpu, rflags, &r); @@ -5952,6 +5957,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) lapic_irq.shorthand = 0; lapic_irq.dest_mode = 0; lapic_irq.dest_id = apicid; + lapic_irq.msi_redir_hint = false; lapic_irq.delivery_mode = APIC_DM_REMRD; kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL); @@ -6039,6 +6045,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0; + kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); if (irqchip_in_kernel(vcpu->kvm)) @@ -6162,6 +6169,233 @@ static void process_nmi(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); } +#define put_smstate(type, buf, offset, val) \ + *(type *)((buf) + (offset) - 0x7e00) = val + +static u32 process_smi_get_segment_flags(struct kvm_segment *seg) +{ + u32 flags = 0; + flags |= seg->g << 23; + flags |= seg->db << 22; + flags |= seg->l << 21; + flags |= seg->avl << 20; + flags |= seg->present << 15; + flags |= seg->dpl << 13; + flags |= seg->s << 12; + flags |= seg->type << 8; + return flags; +} + +static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n) +{ + struct kvm_segment seg; + int offset; + + kvm_get_segment(vcpu, &seg, n); + put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector); + + if (n < 3) + offset = 0x7f84 + n * 12; + else + offset = 0x7f2c + (n - 3) * 12; + + put_smstate(u32, buf, offset + 8, seg.base); + put_smstate(u32, buf, offset + 4, seg.limit); + put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg)); +} + +static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n) +{ + struct kvm_segment seg; + int offset; + u16 flags; + + kvm_get_segment(vcpu, &seg, n); + offset = 0x7e00 + n * 16; + + flags = process_smi_get_segment_flags(&seg) >> 8; + put_smstate(u16, buf, offset, seg.selector); + put_smstate(u16, buf, offset + 2, flags); + put_smstate(u32, buf, offset + 4, seg.limit); + put_smstate(u64, buf, offset + 8, seg.base); +} + +static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf) +{ + struct desc_ptr dt; + struct kvm_segment seg; + unsigned long val; + int i; + + put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu)); + put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu)); + put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu)); + put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu)); + + for (i = 0; i < 8; i++) + put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i)); + + kvm_get_dr(vcpu, 6, &val); + put_smstate(u32, buf, 0x7fcc, (u32)val); + kvm_get_dr(vcpu, 7, &val); + put_smstate(u32, buf, 0x7fc8, (u32)val); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); + put_smstate(u32, buf, 0x7fc4, seg.selector); + put_smstate(u32, buf, 0x7f64, seg.base); + put_smstate(u32, buf, 0x7f60, seg.limit); + put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg)); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); + put_smstate(u32, buf, 0x7fc0, seg.selector); + put_smstate(u32, buf, 0x7f80, seg.base); + put_smstate(u32, buf, 0x7f7c, seg.limit); + put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg)); + + kvm_x86_ops->get_gdt(vcpu, &dt); + put_smstate(u32, buf, 0x7f74, dt.address); + put_smstate(u32, buf, 0x7f70, dt.size); + + kvm_x86_ops->get_idt(vcpu, &dt); + put_smstate(u32, buf, 0x7f58, dt.address); + put_smstate(u32, buf, 0x7f54, dt.size); + + for (i = 0; i < 6; i++) + process_smi_save_seg_32(vcpu, buf, i); + + put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu)); + + /* revision id */ + put_smstate(u32, buf, 0x7efc, 0x00020000); + put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); +} + +static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf) +{ +#ifdef CONFIG_X86_64 + struct desc_ptr dt; + struct kvm_segment seg; + unsigned long val; + int i; + + for (i = 0; i < 16; i++) + put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i)); + + put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu)); + put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu)); + + kvm_get_dr(vcpu, 6, &val); + put_smstate(u64, buf, 0x7f68, val); + kvm_get_dr(vcpu, 7, &val); + put_smstate(u64, buf, 0x7f60, val); + + put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu)); + put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu)); + put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu)); + + put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase); + + /* revision id */ + put_smstate(u32, buf, 0x7efc, 0x00020064); + + put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_TR); + put_smstate(u16, buf, 0x7e90, seg.selector); + put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u32, buf, 0x7e94, seg.limit); + put_smstate(u64, buf, 0x7e98, seg.base); + + kvm_x86_ops->get_idt(vcpu, &dt); + put_smstate(u32, buf, 0x7e84, dt.size); + put_smstate(u64, buf, 0x7e88, dt.address); + + kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR); + put_smstate(u16, buf, 0x7e70, seg.selector); + put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8); + put_smstate(u32, buf, 0x7e74, seg.limit); + put_smstate(u64, buf, 0x7e78, seg.base); + + kvm_x86_ops->get_gdt(vcpu, &dt); + put_smstate(u32, buf, 0x7e64, dt.size); + put_smstate(u64, buf, 0x7e68, dt.address); + + for (i = 0; i < 6; i++) + process_smi_save_seg_64(vcpu, buf, i); +#else + WARN_ON_ONCE(1); +#endif +} + +static void process_smi(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ds; + char buf[512]; + u32 cr0; + + if (is_smm(vcpu)) { + vcpu->arch.smi_pending = true; + return; + } + + trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); + vcpu->arch.hflags |= HF_SMM_MASK; + memset(buf, 0, 512); + if (guest_cpuid_has_longmode(vcpu)) + process_smi_save_state_64(vcpu, buf); + else + process_smi_save_state_32(vcpu, buf); + + kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf)); + + if (kvm_x86_ops->get_nmi_mask(vcpu)) + vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; + else + kvm_x86_ops->set_nmi_mask(vcpu, true); + + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0x8000); + + cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG); + kvm_x86_ops->set_cr0(vcpu, cr0); + vcpu->arch.cr0 = cr0; + + kvm_x86_ops->set_cr4(vcpu, 0); + + __kvm_set_dr(vcpu, 7, DR7_FIXED_1); + + cs.selector = (vcpu->arch.smbase >> 4) & 0xffff; + cs.base = vcpu->arch.smbase; + + ds.selector = 0; + ds.base = 0; + + cs.limit = ds.limit = 0xffffffff; + cs.type = ds.type = 0x3; + cs.dpl = ds.dpl = 0; + cs.db = ds.db = 0; + cs.s = ds.s = 1; + cs.l = ds.l = 0; + cs.g = ds.g = 1; + cs.avl = ds.avl = 0; + cs.present = ds.present = 1; + cs.unusable = ds.unusable = 0; + cs.padding = ds.padding = 0; + + kvm_set_segment(vcpu, &cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_ES); + kvm_set_segment(vcpu, &ds, VCPU_SREG_FS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); + kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); + + if (guest_cpuid_has_longmode(vcpu)) + kvm_x86_ops->set_efer(vcpu, 0); + + kvm_update_cpuid(vcpu); + kvm_mmu_reset_context(vcpu); +} + static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) { u64 eoi_exit_bitmap[4]; @@ -6270,12 +6504,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); + if (kvm_check_request(KVM_REQ_SMI, vcpu)) + process_smi(vcpu); if (kvm_check_request(KVM_REQ_NMI, vcpu)) process_nmi(vcpu); if (kvm_check_request(KVM_REQ_PMU, vcpu)) - kvm_handle_pmu_event(vcpu); + kvm_pmu_handle_event(vcpu); if (kvm_check_request(KVM_REQ_PMI, vcpu)) - kvm_deliver_pmi(vcpu); + kvm_pmu_deliver_pmi(vcpu); if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) vcpu_scan_ioapic(vcpu); if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) @@ -6347,7 +6583,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (req_immediate_exit) smp_send_reschedule(vcpu->cpu); - kvm_guest_enter(); + __kvm_guest_enter(); if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); @@ -6597,11 +6833,11 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct fpu *fpu = ¤t->thread.fpu; int r; sigset_t sigsaved; - if (!tsk_used_math(current) && init_fpu(current)) - return -ENOMEM; + fpu__activate_curr(fpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); @@ -6971,8 +7207,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct i387_fxsave_struct *fxsave = - &vcpu->arch.guest_fpu.state->fxsave; + struct fxregs_state *fxsave = + &vcpu->arch.guest_fpu.state.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; @@ -6988,8 +7224,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct i387_fxsave_struct *fxsave = - &vcpu->arch.guest_fpu.state->fxsave; + struct fxregs_state *fxsave = + &vcpu->arch.guest_fpu.state.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -7003,17 +7239,11 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) return 0; } -int fx_init(struct kvm_vcpu *vcpu) +static void fx_init(struct kvm_vcpu *vcpu) { - int err; - - err = fpu_alloc(&vcpu->arch.guest_fpu); - if (err) - return err; - - fpu_finit(&vcpu->arch.guest_fpu); + fpstate_init(&vcpu->arch.guest_fpu.state); if (cpu_has_xsaves) - vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv = + vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED; /* @@ -7022,14 +7252,6 @@ int fx_init(struct kvm_vcpu *vcpu) vcpu->arch.xcr0 = XSTATE_FP; vcpu->arch.cr0 |= X86_CR0_ET; - - return 0; -} -EXPORT_SYMBOL_GPL(fx_init); - -static void fx_free(struct kvm_vcpu *vcpu) -{ - fpu_free(&vcpu->arch.guest_fpu); } void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) @@ -7045,7 +7267,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; __kernel_fpu_begin(); - fpu_restore_checking(&vcpu->arch.guest_fpu); + __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state); trace_kvm_fpu(1); } @@ -7053,16 +7275,25 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { kvm_put_guest_xcr0(vcpu); - if (!vcpu->guest_fpu_loaded) + if (!vcpu->guest_fpu_loaded) { + vcpu->fpu_counter = 0; return; + } vcpu->guest_fpu_loaded = 0; - fpu_save_init(&vcpu->arch.guest_fpu); + copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; - if (!vcpu->arch.eager_fpu) - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); - + /* + * If using eager FPU mode, or if the guest is a frequent user + * of the FPU, just leave the FPU active for next time. + * Every 255 times fpu_counter rolls over to 0; a guest that uses + * the FPU in bursts will revert to loading it on demand. + */ + if (!vcpu->arch.eager_fpu) { + if (++vcpu->fpu_counter < 5) + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); + } trace_kvm_fpu(0); } @@ -7071,7 +7302,6 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) kvmclock_reset(vcpu); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); - fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -7099,14 +7329,13 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { int r; - vcpu->arch.mtrr_state.have_fixed = 1; + kvm_vcpu_mtrr_init(vcpu); r = vcpu_load(vcpu); if (r) return r; - kvm_vcpu_reset(vcpu); + kvm_vcpu_reset(vcpu, false); kvm_mmu_setup(vcpu); vcpu_put(vcpu); - return r; } @@ -7123,6 +7352,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + if (!kvmclock_periodic_sync) + return; + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, KVMCLOCK_SYNC_PERIOD); } @@ -7137,12 +7369,13 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); vcpu_put(vcpu); - fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } -void kvm_vcpu_reset(struct kvm_vcpu *vcpu) +void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { + vcpu->arch.hflags = 0; + atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; @@ -7168,13 +7401,16 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; - kvm_pmu_reset(vcpu); + if (!init_event) { + kvm_pmu_reset(vcpu); + vcpu->arch.smbase = 0x30000; + } memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; - kvm_x86_ops->vcpu_reset(vcpu); + kvm_x86_ops->vcpu_reset(vcpu, init_event); } void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) @@ -7363,9 +7599,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) goto fail_free_mce_banks; } - r = fx_init(vcpu); - if (r) - goto fail_free_wbinvd_dirty_mask; + fx_init(vcpu); vcpu->arch.ia32_tsc_adjust_msr = 0x0; vcpu->arch.pv_time_enabled = false; @@ -7375,12 +7609,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; + kvm_async_pf_hash_reset(vcpu); kvm_pmu_init(vcpu); return 0; -fail_free_wbinvd_dirty_mask: - free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); + fail_free_mce_banks: kfree(vcpu->arch.mce_banks); fail_free_lapic: @@ -7482,6 +7717,40 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } +int __x86_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem) +{ + int i, r; + + /* Called with kvm->slots_lock held. */ + BUG_ON(mem->slot >= KVM_MEM_SLOTS_NUM); + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + struct kvm_userspace_memory_region m = *mem; + + m.slot |= i << 16; + r = __kvm_set_memory_region(kvm, &m); + if (r < 0) + return r; + } + + return 0; +} +EXPORT_SYMBOL_GPL(__x86_set_memory_region); + +int x86_set_memory_region(struct kvm *kvm, + const struct kvm_userspace_memory_region *mem) +{ + int r; + + mutex_lock(&kvm->slots_lock); + r = __x86_set_memory_region(kvm, mem); + mutex_unlock(&kvm->slots_lock); + + return r; +} +EXPORT_SYMBOL_GPL(x86_set_memory_region); + void kvm_arch_destroy_vm(struct kvm *kvm) { if (current->mm == kvm->mm) { @@ -7493,13 +7762,13 @@ void kvm_arch_destroy_vm(struct kvm *kvm) struct kvm_userspace_memory_region mem; memset(&mem, 0, sizeof(mem)); mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; - kvm_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, &mem); mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; - kvm_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, &mem); mem.slot = TSS_PRIVATE_MEMSLOT; - kvm_set_memory_region(kvm, &mem); + x86_set_memory_region(kvm, &mem); } kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); @@ -7588,18 +7857,18 @@ out_free: return -ENOMEM; } -void kvm_arch_memslots_updated(struct kvm *kvm) +void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) { /* * memslots->generation has been incremented. * mmio generation may have reached its maximum value. */ - kvm_mmu_invalidate_mmio_sptes(kvm); + kvm_mmu_invalidate_mmio_sptes(kvm, slots); } int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { /* @@ -7677,14 +7946,14 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { - struct kvm_memory_slot *new; int nr_mmu_pages = 0; - if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { + if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) { int ret; ret = vm_munmap(old->userspace_addr, @@ -7701,9 +7970,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, if (nr_mmu_pages) kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - /* It's OK to get 'new' slot here as it has already been installed */ - new = id_to_memslot(kvm->memslots, mem->slot); - /* * Dirty logging tracks sptes in 4k granularity, meaning that large * sptes have to be split. If live migration is successful, the guest @@ -7728,9 +7994,11 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * been zapped so no dirty logging staff is needed for old slot. For * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the * new and it's also covered when dealing with the new slot. + * + * FIXME: const-ify all uses of struct kvm_memory_slot. */ if (change != KVM_MR_DELETE) - kvm_mmu_slot_apply_flags(kvm, new); + kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new); } void kvm_arch_flush_shadow_all(struct kvm *kvm) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index f5fef1868096..edc8cdcd786b 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -4,6 +4,8 @@ #include <linux/kvm_host.h> #include "kvm_cache_regs.h" +#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL + static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { vcpu->arch.exception.pending = false; @@ -160,7 +162,13 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception); +void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); +u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); +bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, + int page_num); #define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ | XSTATE_BNDREGS | XSTATE_BNDCSR \ |