From 83287ea420ced7242a704488aab0fcdcf2ced9ab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 15:10:57 +0300 Subject: KVM: VMX: Make lto-friendly LTO (link-time optimization) doesn't like local labels to be referred to from a different function, since the two functions may be built in separate compilation units. Use an external variable instead. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d62b4139a292..5faf12ace546 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -127,6 +127,8 @@ module_param(ple_gap, int, S_IRUGO); static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; module_param(ple_window, int, S_IRUGO); +extern const ulong vmx_return; + #define NR_AUTOLOAD_MSRS 8 #define VMCS02_POOL_SIZE 1 @@ -3724,8 +3726,7 @@ static void vmx_set_constant_host_state(void) native_store_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.address); /* 22.2.4 */ - asm("mov $.Lkvm_vmx_return, %0" : "=r"(tmpl)); - vmcs_writel(HOST_RIP, tmpl); /* 22.2.5 */ + vmcs_writel(HOST_RIP, vmx_return); /* 22.2.5 */ rdmsr(MSR_IA32_SYSENTER_CS, low32, high32); vmcs_write32(HOST_IA32_SYSENTER_CS, low32); @@ -6276,11 +6277,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ /* Enter guest mode */ - "jne .Llaunched \n\t" + "jne 1f \n\t" __ex(ASM_VMX_VMLAUNCH) "\n\t" - "jmp .Lkvm_vmx_return \n\t" - ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" - ".Lkvm_vmx_return: " + "jmp 2f \n\t" + "1: " __ex(ASM_VMX_VMRESUME) "\n\t" + "2: " /* Save guest registers, load host registers, keep flags */ "mov %0, %c[wordsize](%%"R"sp) \n\t" "pop %0 \n\t" @@ -6306,6 +6307,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" + ".pushsection .rodata \n\t" + ".global vmx_return \n\t" + "vmx_return: " _ASM_PTR " 2b \n\t" + ".popsection" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, __launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), -- cgit v1.2.3 From b188c81f2e1a188ddda6a3d353e5b546c30a9b90 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 15:10:58 +0300 Subject: KVM: VMX: Make use of asm.h Use macros for bitness-insensitive register names, instead of rolling our own. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 69 ++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 39 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5faf12ace546..30bcb953afee 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6184,14 +6184,6 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } -#ifdef CONFIG_X86_64 -#define R "r" -#define Q "q" -#else -#define R "e" -#define Q "l" -#endif - static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6240,30 +6232,30 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->__launched = vmx->loaded_vmcs->launched; asm( /* Store host registers */ - "push %%"R"dx; push %%"R"bp;" - "push %%"R"cx \n\t" /* placeholder for guest rcx */ - "push %%"R"cx \n\t" - "cmp %%"R"sp, %c[host_rsp](%0) \n\t" + "push %%" _ASM_DX "; push %%" _ASM_BP ";" + "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */ + "push %%" _ASM_CX " \n\t" + "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t" "je 1f \n\t" - "mov %%"R"sp, %c[host_rsp](%0) \n\t" + "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" "1: \n\t" /* Reload cr2 if changed */ - "mov %c[cr2](%0), %%"R"ax \n\t" - "mov %%cr2, %%"R"dx \n\t" - "cmp %%"R"ax, %%"R"dx \n\t" + "mov %c[cr2](%0), %%" _ASM_AX " \n\t" + "mov %%cr2, %%" _ASM_DX " \n\t" + "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t" "je 2f \n\t" - "mov %%"R"ax, %%cr2 \n\t" + "mov %%" _ASM_AX", %%cr2 \n\t" "2: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ - "mov %c[rax](%0), %%"R"ax \n\t" - "mov %c[rbx](%0), %%"R"bx \n\t" - "mov %c[rdx](%0), %%"R"dx \n\t" - "mov %c[rsi](%0), %%"R"si \n\t" - "mov %c[rdi](%0), %%"R"di \n\t" - "mov %c[rbp](%0), %%"R"bp \n\t" + "mov %c[rax](%0), %%" _ASM_AX " \n\t" + "mov %c[rbx](%0), %%" _ASM_BX " \n\t" + "mov %c[rdx](%0), %%" _ASM_DX " \n\t" + "mov %c[rsi](%0), %%" _ASM_SI " \n\t" + "mov %c[rdi](%0), %%" _ASM_DI " \n\t" + "mov %c[rbp](%0), %%" _ASM_BP " \n\t" #ifdef CONFIG_X86_64 "mov %c[r8](%0), %%r8 \n\t" "mov %c[r9](%0), %%r9 \n\t" @@ -6274,7 +6266,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %c[r14](%0), %%r14 \n\t" "mov %c[r15](%0), %%r15 \n\t" #endif - "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ + "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %0 (ecx) */ /* Enter guest mode */ "jne 1f \n\t" @@ -6283,15 +6275,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "1: " __ex(ASM_VMX_VMRESUME) "\n\t" "2: " /* Save guest registers, load host registers, keep flags */ - "mov %0, %c[wordsize](%%"R"sp) \n\t" + "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" "pop %0 \n\t" - "mov %%"R"ax, %c[rax](%0) \n\t" - "mov %%"R"bx, %c[rbx](%0) \n\t" - "pop"Q" %c[rcx](%0) \n\t" - "mov %%"R"dx, %c[rdx](%0) \n\t" - "mov %%"R"si, %c[rsi](%0) \n\t" - "mov %%"R"di, %c[rdi](%0) \n\t" - "mov %%"R"bp, %c[rbp](%0) \n\t" + "mov %%" _ASM_AX ", %c[rax](%0) \n\t" + "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" + __ASM_SIZE(pop) " %c[rcx](%0) \n\t" + "mov %%" _ASM_DX ", %c[rdx](%0) \n\t" + "mov %%" _ASM_SI ", %c[rsi](%0) \n\t" + "mov %%" _ASM_DI ", %c[rdi](%0) \n\t" + "mov %%" _ASM_BP ", %c[rbp](%0) \n\t" #ifdef CONFIG_X86_64 "mov %%r8, %c[r8](%0) \n\t" "mov %%r9, %c[r9](%0) \n\t" @@ -6302,10 +6294,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" #endif - "mov %%cr2, %%"R"ax \n\t" - "mov %%"R"ax, %c[cr2](%0) \n\t" + "mov %%cr2, %%" _ASM_AX " \n\t" + "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" - "pop %%"R"bp; pop %%"R"dx \n\t" + "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" "setbe %c[fail](%0) \n\t" ".pushsection .rodata \n\t" ".global vmx_return \n\t" @@ -6335,9 +6327,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), [wordsize]"i"(sizeof(ulong)) : "cc", "memory" - , R"ax", R"bx", R"di", R"si" #ifdef CONFIG_X86_64 + , "rax", "rbx", "rdi", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" +#else + , "eax", "ebx", "edi", "esi" #endif ); @@ -6389,9 +6383,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } -#undef R -#undef Q - static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); -- cgit v1.2.3 From 7454766f7bead388251aedee35a478356a7f4e72 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 15:10:59 +0300 Subject: KVM: SVM: Make use of asm.h Use macros for bitness-insensitive register names, instead of rolling our own. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/svm.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 611c72875fb9..818fceb3091e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3782,12 +3782,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); } -#ifdef CONFIG_X86_64 -#define R "r" -#else -#define R "e" -#endif - static void svm_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3814,13 +3808,13 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) local_irq_enable(); asm volatile ( - "push %%"R"bp; \n\t" - "mov %c[rbx](%[svm]), %%"R"bx \n\t" - "mov %c[rcx](%[svm]), %%"R"cx \n\t" - "mov %c[rdx](%[svm]), %%"R"dx \n\t" - "mov %c[rsi](%[svm]), %%"R"si \n\t" - "mov %c[rdi](%[svm]), %%"R"di \n\t" - "mov %c[rbp](%[svm]), %%"R"bp \n\t" + "push %%" _ASM_BP "; \n\t" + "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t" + "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t" + "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t" + "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t" + "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t" + "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t" #ifdef CONFIG_X86_64 "mov %c[r8](%[svm]), %%r8 \n\t" "mov %c[r9](%[svm]), %%r9 \n\t" @@ -3833,20 +3827,20 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif /* Enter guest mode */ - "push %%"R"ax \n\t" - "mov %c[vmcb](%[svm]), %%"R"ax \n\t" + "push %%" _ASM_AX " \n\t" + "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t" __ex(SVM_VMLOAD) "\n\t" __ex(SVM_VMRUN) "\n\t" __ex(SVM_VMSAVE) "\n\t" - "pop %%"R"ax \n\t" + "pop %%" _ASM_AX " \n\t" /* Save guest registers, load host registers */ - "mov %%"R"bx, %c[rbx](%[svm]) \n\t" - "mov %%"R"cx, %c[rcx](%[svm]) \n\t" - "mov %%"R"dx, %c[rdx](%[svm]) \n\t" - "mov %%"R"si, %c[rsi](%[svm]) \n\t" - "mov %%"R"di, %c[rdi](%[svm]) \n\t" - "mov %%"R"bp, %c[rbp](%[svm]) \n\t" + "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t" + "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t" + "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t" + "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t" + "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t" + "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t" #ifdef CONFIG_X86_64 "mov %%r8, %c[r8](%[svm]) \n\t" "mov %%r9, %c[r9](%[svm]) \n\t" @@ -3857,7 +3851,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" #endif - "pop %%"R"bp" + "pop %%" _ASM_BP : : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), @@ -3878,9 +3872,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) #endif : "cc", "memory" - , R"bx", R"cx", R"dx", R"si", R"di" #ifdef CONFIG_X86_64 + , "rbx", "rcx", "rdx", "rsi", "rdi" , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" +#else + , "ebx", "ecx", "edx", "esi", "edi" #endif ); @@ -3940,8 +3936,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) mark_all_clean(svm->vmcb); } -#undef R - static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); -- cgit v1.2.3 From 9fc77441e5e1bf80b794cc546d2243ee9f4afb75 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 16 Sep 2012 11:50:30 +0300 Subject: KVM: make processes waiting on vcpu mutex killable vcpu mutex can be held for unlimited time so taking it with mutex_lock on an ioctl is wrong: one process could be passed a vcpu fd and call this ioctl on the vcpu used by another process, it will then be unkillable until the owner exits. Call mutex_lock_killable instead and return status. Note: mutex_lock_interruptible would be even nicer, but I am not sure all users are prepared to handle EINTR from these ioctls. They might misinterpret it as an error. Cleanup paths expect a vcpu that can't be used by any userspace so this will always succeed - catch bugs by calling BUG_ON. Catch callers that don't check return state by adding __must_check. Signed-off-by: Michael S. Tsirkin Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 12 +++++++++--- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 10 +++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c4d451ed1573..19047eafa38d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6016,7 +6016,9 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) int r; vcpu->arch.mtrr_state.have_fixed = 1; - vcpu_load(vcpu); + r = vcpu_load(vcpu); + if (r) + return r; r = kvm_arch_vcpu_reset(vcpu); if (r == 0) r = kvm_mmu_setup(vcpu); @@ -6027,9 +6029,11 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + int r; vcpu->arch.apf.msr_val = 0; - vcpu_load(vcpu); + r = vcpu_load(vcpu); + BUG_ON(r); kvm_mmu_unload(vcpu); vcpu_put(vcpu); @@ -6275,7 +6279,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { - vcpu_load(vcpu); + int r; + r = vcpu_load(vcpu); + BUG_ON(r); kvm_mmu_unload(vcpu); vcpu_put(vcpu); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 40791930bc15..80bfc880921e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -408,7 +408,7 @@ static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i) int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); -void vcpu_load(struct kvm_vcpu *vcpu); +int __must_check vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4fe02d900810..cc3f6dc506e4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -131,11 +131,12 @@ bool kvm_is_mmio_pfn(pfn_t pfn) /* * Switches to specified vcpu, until a matching vcpu_put() */ -void vcpu_load(struct kvm_vcpu *vcpu) +int vcpu_load(struct kvm_vcpu *vcpu) { int cpu; - mutex_lock(&vcpu->mutex); + if (mutex_lock_killable(&vcpu->mutex)) + return -EINTR; if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { /* The thread running this VCPU changed. */ struct pid *oldpid = vcpu->pid; @@ -148,6 +149,7 @@ void vcpu_load(struct kvm_vcpu *vcpu) preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); + return 0; } void vcpu_put(struct kvm_vcpu *vcpu) @@ -1891,7 +1893,9 @@ static long kvm_vcpu_ioctl(struct file *filp, #endif - vcpu_load(vcpu); + r = vcpu_load(vcpu); + if (r) + return r; switch (ioctl) { case KVM_RUN: r = -EINVAL; -- cgit v1.2.3 From 879238fecc051d95037ae76332916209a7770709 Mon Sep 17 00:00:00 2001 From: Stefan Fritsch Date: Sun, 16 Sep 2012 12:55:40 +0200 Subject: KVM: clarify kvmclock documentation - mention that system time needs to be added to wallclock time - positive tsc_shift means left shift, not right - mention additional 32bit right shift Signed-off-by: Stefan Fritsch Signed-off-by: Marcelo Tosatti --- Documentation/virtual/kvm/msr.txt | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index 730471048583..6d470ae7b073 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt @@ -34,9 +34,12 @@ MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 time information and check that they are both equal and even. An odd version indicates an in-progress update. - sec: number of seconds for wallclock. + sec: number of seconds for wallclock at time of boot. - nsec: number of nanoseconds for wallclock. + nsec: number of nanoseconds for wallclock at time of boot. + + In order to get the current wallclock time, the system_time from + MSR_KVM_SYSTEM_TIME_NEW needs to be added. Note that although MSRs are per-CPU entities, the effect of this particular MSR is global. @@ -82,20 +85,25 @@ MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 time at the time this structure was last updated. Unit is nanoseconds. - tsc_to_system_mul: a function of the tsc frequency. One has - to multiply any tsc-related quantity by this value to get - a value in nanoseconds, besides dividing by 2^tsc_shift + tsc_to_system_mul: multiplier to be used when converting + tsc-related quantity to nanoseconds - tsc_shift: cycle to nanosecond divider, as a power of two, to - allow for shift rights. One has to shift right any tsc-related - quantity by this value to get a value in nanoseconds, besides - multiplying by tsc_to_system_mul. + tsc_shift: shift to be used when converting tsc-related + quantity to nanoseconds. This shift will ensure that + multiplication with tsc_to_system_mul does not overflow. + A positive value denotes a left shift, a negative value + a right shift. - With this information, guests can derive per-CPU time by - doing: + The conversion from tsc to nanoseconds involves an additional + right shift by 32 bits. With this information, guests can + derive per-CPU time by doing: time = (current_tsc - tsc_timestamp) - time = (time * tsc_to_system_mul) >> tsc_shift + if (tsc_shift >= 0) + time <<= tsc_shift; + else + time >>= -tsc_shift; + time = (time * tsc_to_system_mul) >> 32 time = time + system_time flags: bits in this field indicate extended capabilities -- cgit v1.2.3 From 8ea667f259e3767fd3ee85a885c14e417835695e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 13:44:53 +0300 Subject: KVM: MMU: Push clean gpte write protection out of gpte_access() gpte_access() computes the access permissions of a guest pte and also write-protects clean gptes. This is wrong when we are servicing a write fault (since we'll be setting the dirty bit momentarily) but correct when instantiating a speculative spte, or when servicing a read fault (since we'll want to trap a following write in order to set the dirty bit). It doesn't seem to hurt in practice, but in order to make the code readable, push the write protection out of gpte_access() and into a new protect_clean_gpte() which is called explicitly when needed. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 12 ++++++++++++ arch/x86/kvm/mmu.h | 3 ++- arch/x86/kvm/paging_tmpl.h | 24 ++++++++++++------------ 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index aa0b469ee07d..54c9cb4fdfa4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3408,6 +3408,18 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; } +static inline void protect_clean_gpte(unsigned *access, unsigned gpte) +{ + unsigned mask; + + BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); + + mask = (unsigned)~ACC_WRITE_MASK; + /* Allow write access to dirty gptes */ + mask |= (gpte >> (PT_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK; + *access &= mask; +} + static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e374db9af021..2832081e9b2e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -18,7 +18,8 @@ #define PT_PCD_MASK (1ULL << 4) #define PT_ACCESSED_SHIFT 5 #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) -#define PT_DIRTY_MASK (1ULL << 6) +#define PT_DIRTY_SHIFT 6 +#define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT) #define PT_PAGE_SIZE_MASK (1ULL << 7) #define PT_PAT_MASK (1ULL << 7) #define PT_GLOBAL_MASK (1ULL << 8) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bf8c42bf50fe..bf7b4ffafab8 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -101,14 +101,11 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } -static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte, - bool last) +static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) { unsigned access; access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - if (last && !is_dirty_gpte(gpte)) - access &= ~ACC_WRITE_MASK; #if PTTYPE == 64 if (vcpu->arch.mmu.nx) @@ -222,8 +219,7 @@ retry_walk: last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); if (last_gpte) { - pte_access = pt_access & - FNAME(gpte_access)(vcpu, pte, true); + pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); /* check if the kernel is fetching from user page */ if (unlikely(pte_access & PT_USER_MASK) && kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) @@ -274,7 +270,7 @@ retry_walk: break; } - pt_access &= FNAME(gpte_access)(vcpu, pte, false); + pt_access &= FNAME(gpte_access)(vcpu, pte); --walker->level; } @@ -283,7 +279,9 @@ retry_walk: goto error; } - if (write_fault && unlikely(!is_dirty_gpte(pte))) { + if (!write_fault) + protect_clean_gpte(&pte_access, pte); + else if (unlikely(!is_dirty_gpte(pte))) { int ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); @@ -368,7 +366,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + protect_clean_gpte(&pte_access, gpte); pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); if (mmu_invalid_pfn(pfn)) return; @@ -441,8 +440,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) continue; - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, - true); + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + protect_clean_gpte(&pte_access, gpte); gfn = gpte_to_gfn(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, pte_access & ACC_WRITE_MASK); @@ -794,7 +793,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(vcpu, gpte, true); + pte_access &= FNAME(gpte_access)(vcpu, gpte); + protect_clean_gpte(&pte_access, gpte); if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) continue; -- cgit v1.2.3 From edc2ae84eb40a3c062210fe01af1cae1633cc810 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 13:53:08 +0300 Subject: KVM: MMU: Optimize gpte_access() slightly If nx is disabled, then is gpte[63] is set we will hit a reserved bit set fault before checking permissions; so we can ignore the setting of efer.nxe. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index bf7b4ffafab8..064bcb32d84e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -106,10 +106,8 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) unsigned access; access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; - #if PTTYPE == 64 - if (vcpu->arch.mmu.nx) - access &= ~(gpte >> PT64_NX_SHIFT); + access &= ~(gpte >> PT64_NX_SHIFT); #endif return access; } -- cgit v1.2.3 From 3d34adec7081621ff51c195be045b87d75c0c49d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 14:03:28 +0300 Subject: KVM: MMU: Move gpte_access() out of paging_tmpl.h We no longer rely on paging_tmpl.h defines; so we can move the function to mmu.c. Rely on zero extension to 64 bits to get the correct nx behaviour. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 ++++++++++ arch/x86/kvm/paging_tmpl.h | 21 +++++---------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 54c9cb4fdfa4..f297a2ccf4f6 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3437,6 +3437,16 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, return false; } +static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) +{ + unsigned access; + + access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; + access &= ~(gpte >> PT64_NX_SHIFT); + + return access; +} + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 064bcb32d84e..1cbf576852ca 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -101,17 +101,6 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } -static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) -{ - unsigned access; - - access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; -#if PTTYPE == 64 - access &= ~(gpte >> PT64_NX_SHIFT); -#endif - return access; -} - static bool FNAME(is_last_gpte)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t gpte) @@ -217,7 +206,7 @@ retry_walk: last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); if (last_gpte) { - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte); + pte_access = pt_access & gpte_access(vcpu, pte); /* check if the kernel is fetching from user page */ if (unlikely(pte_access & PT_USER_MASK) && kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) @@ -268,7 +257,7 @@ retry_walk: break; } - pt_access &= FNAME(gpte_access)(vcpu, pte); + pt_access &= gpte_access(vcpu, pte); --walker->level; } @@ -364,7 +353,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return; pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); if (mmu_invalid_pfn(pfn)) @@ -438,7 +427,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) continue; - pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + pte_access = sp->role.access & gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); gfn = gpte_to_gfn(gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, @@ -791,7 +780,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; - pte_access &= FNAME(gpte_access)(vcpu, gpte); + pte_access &= gpte_access(vcpu, gpte); protect_clean_gpte(&pte_access, gpte); if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present)) -- cgit v1.2.3 From 8cbc70696f149e44753b0fe60162b4ff96c2dd2b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 14:18:51 +0300 Subject: KVM: MMU: Update accessed and dirty bits after guest pagetable walk While unspecified, the behaviour of Intel processors is to first perform the page table walk, then, if the walk was successful, to atomically update the accessed and dirty bits of walked paging elements. While we are not required to follow this exactly, doing so will allow us to perform the access permissions check after the walk is complete, rather than after each walk step. (the tricky case is SMEP: a zero in any pte's U bit makes the referenced page a supervisor page, so we can't fault on a one bit during the walk itself). Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 76 ++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 29 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1cbf576852ca..35a05dd2f69c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -63,10 +63,12 @@ */ struct guest_walker { int level; + unsigned max_level; gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t ptes[PT_MAX_FULL_LEVELS]; pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; + pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; gfn_t gfn; @@ -119,6 +121,43 @@ static bool FNAME(is_last_gpte)(struct guest_walker *walker, return false; } +static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, + struct kvm_mmu *mmu, + struct guest_walker *walker, + int write_fault) +{ + unsigned level, index; + pt_element_t pte, orig_pte; + pt_element_t __user *ptep_user; + gfn_t table_gfn; + int ret; + + for (level = walker->max_level; level >= walker->level; --level) { + pte = orig_pte = walker->ptes[level - 1]; + table_gfn = walker->table_gfn[level - 1]; + ptep_user = walker->ptep_user[level - 1]; + index = offset_in_page(ptep_user) / sizeof(pt_element_t); + if (!(pte & PT_ACCESSED_MASK)) { + trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); + pte |= PT_ACCESSED_MASK; + } + if (level == walker->level && write_fault && !is_dirty_gpte(pte)) { + trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); + pte |= PT_DIRTY_MASK; + } + if (pte == orig_pte) + continue; + + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte); + if (ret) + return ret; + + mark_page_dirty(vcpu->kvm, table_gfn); + walker->ptes[level] = pte; + } + return 0; +} + /* * Fetch a guest pte for a guest virtual address */ @@ -126,6 +165,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gva_t addr, u32 access) { + int ret; pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; @@ -153,6 +193,7 @@ retry_walk: --walker->level; } #endif + walker->max_level = walker->level; ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); @@ -183,6 +224,7 @@ retry_walk: ptep_user = (pt_element_t __user *)((void *)host_addr + offset); if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) goto error; + walker->ptep_user[walker->level - 1] = ptep_user; trace_kvm_mmu_paging_element(pte, walker->level); @@ -214,21 +256,6 @@ retry_walk: eperm = true; } - if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { - int ret; - trace_kvm_mmu_set_accessed_bit(table_gfn, index, - sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, - pte, pte|PT_ACCESSED_MASK); - if (unlikely(ret < 0)) - goto error; - else if (ret) - goto retry_walk; - - mark_page_dirty(vcpu->kvm, table_gfn); - pte |= PT_ACCESSED_MASK; - } - walker->ptes[walker->level - 1] = pte; if (last_gpte) { @@ -268,21 +295,12 @@ retry_walk: if (!write_fault) protect_clean_gpte(&pte_access, pte); - else if (unlikely(!is_dirty_gpte(pte))) { - int ret; - trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, - pte, pte|PT_DIRTY_MASK); - if (unlikely(ret < 0)) - goto error; - else if (ret) - goto retry_walk; - - mark_page_dirty(vcpu->kvm, table_gfn); - pte |= PT_DIRTY_MASK; - walker->ptes[walker->level - 1] = pte; - } + ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); + if (unlikely(ret < 0)) + goto error; + else if (ret) + goto retry_walk; walker->pt_access = pt_access; walker->pte_access = pte_access; -- cgit v1.2.3 From 97d64b788114be1c4dc4bfe7a8ba2bf9643fe6af Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 14:52:00 +0300 Subject: KVM: MMU: Optimize pte permission checks walk_addr_generic() permission checks are a maze of branchy code, which is performed four times per lookup. It depends on the type of access, efer.nxe, cr0.wp, cr4.smep, and in the near future, cr4.smap. Optimize this away by precalculating all variants and storing them in a bitmap. The bitmap is recalculated when rarely-changing variables change (cr0, cr4) and is indexed by the often-changing variables (page fault error code, pte access permissions). The permission check is moved to the end of the loop, otherwise an SMEP fault could be reported as a false positive, when PDE.U=1 but PTE.U=0. Noted by Xiao Guangrong. The result is short, branch-free code. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/kvm/mmu.c | 38 ++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/mmu.h | 19 ++++++++----------- arch/x86/kvm/paging_tmpl.h | 22 ++++------------------ arch/x86/kvm/x86.c | 11 ++++------- 5 files changed, 61 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 64adb6117e19..3318bde206a5 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -287,6 +287,13 @@ struct kvm_mmu { union kvm_mmu_page_role base_role; bool direct_map; + /* + * Bitmap; bit set = permission fault + * Byte index: page fault error code [4:1] + * Bit index: pte permissions in ACC_* format + */ + u8 permissions[16]; + u64 *pae_root; u64 *lm_root; u64 rsvd_bits_mask[2][4]; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f297a2ccf4f6..9c6188931f87 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3516,6 +3516,38 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, } } +static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +{ + unsigned bit, byte, pfec; + u8 map; + bool fault, x, w, u, wf, uf, ff, smep; + + smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); + for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) { + pfec = byte << 1; + map = 0; + wf = pfec & PFERR_WRITE_MASK; + uf = pfec & PFERR_USER_MASK; + ff = pfec & PFERR_FETCH_MASK; + for (bit = 0; bit < 8; ++bit) { + x = bit & ACC_EXEC_MASK; + w = bit & ACC_WRITE_MASK; + u = bit & ACC_USER_MASK; + + /* Not really needed: !nx will cause pte.nx to fault */ + x |= !mmu->nx; + /* Allow supervisor writes if !cr0.wp */ + w |= !is_write_protection(vcpu) && !uf; + /* Disallow supervisor fetches of user code if cr4.smep */ + x &= !(smep && u && !uf); + + fault = (ff && !x) || (uf && !u) || (wf && !w); + map |= fault << bit; + } + mmu->permissions[byte] = map; + } +} + static int paging64_init_context_common(struct kvm_vcpu *vcpu, struct kvm_mmu *context, int level) @@ -3524,6 +3556,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->root_level = level; reset_rsvds_bits_mask(vcpu, context); + update_permission_bitmask(vcpu, context); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -3552,6 +3585,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->root_level = PT32_ROOT_LEVEL; reset_rsvds_bits_mask(vcpu, context); + update_permission_bitmask(vcpu, context); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; @@ -3612,6 +3646,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; } + update_permission_bitmask(vcpu, context); + return 0; } @@ -3687,6 +3723,8 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } + update_permission_bitmask(vcpu, g_context); + return 0; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 2832081e9b2e..584660775d08 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -89,17 +89,14 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } -static inline bool check_write_user_access(struct kvm_vcpu *vcpu, - bool write_fault, bool user_fault, - unsigned long pte) +/* + * Will a fault with a given page-fault error code (pfec) cause a permission + * fault with the given access (in ACC_* format)? + */ +static inline bool permission_fault(struct kvm_mmu *mmu, unsigned pte_access, + unsigned pfec) { - if (unlikely(write_fault && !is_writable_pte(pte) - && (user_fault || is_write_protection(vcpu)))) - return false; - - if (unlikely(user_fault && !(pte & PT_USER_MASK))) - return false; - - return true; + return (mmu->permissions[pfec >> 1] >> pte_access) & 1; } + #endif diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 35a05dd2f69c..8f6c59fadbbe 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -169,7 +169,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, uninitialized_var(pte_access); + unsigned index, pt_access, pte_access; gpa_t pte_gpa; bool eperm, last_gpte; int offset; @@ -237,24 +237,9 @@ retry_walk: goto error; } - if (!check_write_user_access(vcpu, write_fault, user_fault, - pte)) - eperm = true; - -#if PTTYPE == 64 - if (unlikely(fetch_fault && (pte & PT64_NX_MASK))) - eperm = true; -#endif + pte_access = pt_access & gpte_access(vcpu, pte); last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); - if (last_gpte) { - pte_access = pt_access & gpte_access(vcpu, pte); - /* check if the kernel is fetching from user page */ - if (unlikely(pte_access & PT_USER_MASK) && - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - if (fetch_fault && !user_fault) - eperm = true; - } walker->ptes[walker->level - 1] = pte; @@ -284,10 +269,11 @@ retry_walk: break; } - pt_access &= gpte_access(vcpu, pte); + pt_access &= pte_access; --walker->level; } + eperm |= permission_fault(mmu, pte_access, access); if (unlikely(eperm)) { errcode |= PFERR_PRESENT_MASK; goto error; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19047eafa38d..497226e49d4b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3672,20 +3672,17 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) + | (write ? PFERR_WRITE_MASK : 0); - if (vcpu_match_mmio_gva(vcpu, gva) && - check_write_user_access(vcpu, write, access, - vcpu->arch.access)) { + if (vcpu_match_mmio_gva(vcpu, gva) + && !permission_fault(vcpu->arch.walk_mmu, vcpu->arch.access, access)) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); return 1; } - if (write) - access |= PFERR_WRITE_MASK; - *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); if (*gpa == UNMAPPED_GVA) -- cgit v1.2.3 From 13d22b6aebb000aeaf137862c6c0e0c4d138d798 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 15:12:09 +0300 Subject: KVM: MMU: Simplify walk_addr_generic() loop The page table walk is coded as an infinite loop, with a special case on the last pte. Code it as an ordinary loop with a termination condition on the last pte (large page or walk length exhausted), and put the last pte handling code after the loop where it belongs. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 60 +++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 8f6c59fadbbe..1b4c14d235a0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -171,12 +171,15 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, pte_access; gpa_t pte_gpa; - bool eperm, last_gpte; + bool eperm; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; const int fetch_fault = access & PFERR_FETCH_MASK; u16 errcode = 0; + gpa_t real_gpa; + gfn_t gfn; + u32 ac; trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: @@ -197,12 +200,16 @@ retry_walk: ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); - pt_access = ACC_ALL; + pt_access = pte_access = ACC_ALL; + ++walker->level; - for (;;) { + do { gfn_t real_gfn; unsigned long host_addr; + pt_access &= pte_access; + --walker->level; + index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); @@ -239,39 +246,8 @@ retry_walk: pte_access = pt_access & gpte_access(vcpu, pte); - last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); - walker->ptes[walker->level - 1] = pte; - - if (last_gpte) { - int lvl = walker->level; - gpa_t real_gpa; - gfn_t gfn; - u32 ac; - - gfn = gpte_to_gfn_lvl(pte, lvl); - gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; - - if (PTTYPE == 32 && - walker->level == PT_DIRECTORY_LEVEL && - is_cpuid_PSE36()) - gfn += pse36_gfn_delta(pte); - - ac = write_fault | fetch_fault | user_fault; - - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), - ac); - if (real_gpa == UNMAPPED_GVA) - return 0; - - walker->gfn = real_gpa >> PAGE_SHIFT; - - break; - } - - pt_access &= pte_access; - --walker->level; - } + } while (!FNAME(is_last_gpte)(walker, vcpu, mmu, pte)); eperm |= permission_fault(mmu, pte_access, access); if (unlikely(eperm)) { @@ -279,6 +255,20 @@ retry_walk: goto error; } + gfn = gpte_to_gfn_lvl(pte, walker->level); + gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; + + if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) + gfn += pse36_gfn_delta(pte); + + ac = write_fault | fetch_fault | user_fault; + + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), ac); + if (real_gpa == UNMAPPED_GVA) + return 0; + + walker->gfn = real_gpa >> PAGE_SHIFT; + if (!write_fault) protect_clean_gpte(&pte_access, pte); -- cgit v1.2.3 From 6fd01b711bee96ce3356f7b6f370ab708e37504b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 12 Sep 2012 20:46:56 +0300 Subject: KVM: MMU: Optimize is_last_gpte() Instead of branchy code depending on level, gpte.ps, and mmu configuration, prepare everything in a bitmap during mode changes and look it up during runtime. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 7 +++++++ arch/x86/kvm/mmu.c | 31 +++++++++++++++++++++++++++++++ arch/x86/kvm/mmu.h | 3 ++- arch/x86/kvm/paging_tmpl.h | 20 +------------------- 4 files changed, 41 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3318bde206a5..43aeb9422839 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -298,6 +298,13 @@ struct kvm_mmu { u64 *lm_root; u64 rsvd_bits_mask[2][4]; + /* + * Bitmap: bit set = last pte in walk + * index[0:1]: level (zero-based) + * index[2]: pte.ps + */ + u8 last_pte_bitmap; + bool nx; u64 pdptrs[4]; /* pae */ diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9c6188931f87..d289fee1ffb8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3447,6 +3447,15 @@ static inline unsigned gpte_access(struct kvm_vcpu *vcpu, u64 gpte) return access; } +static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) +{ + unsigned index; + + index = level - 1; + index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2); + return mmu->last_pte_bitmap & (1 << index); +} + #define PTTYPE 64 #include "paging_tmpl.h" #undef PTTYPE @@ -3548,6 +3557,24 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu } } +static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) +{ + u8 map; + unsigned level, root_level = mmu->root_level; + const unsigned ps_set_index = 1 << 2; /* bit 2 of index: ps */ + + if (root_level == PT32E_ROOT_LEVEL) + --root_level; + /* PT_PAGE_TABLE_LEVEL always terminates */ + map = 1 | (1 << ps_set_index); + for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) { + if (level <= PT_PDPE_LEVEL + && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu))) + map |= 1 << (ps_set_index | (level - 1)); + } + mmu->last_pte_bitmap = map; +} + static int paging64_init_context_common(struct kvm_vcpu *vcpu, struct kvm_mmu *context, int level) @@ -3557,6 +3584,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context); + update_last_pte_bitmap(vcpu, context); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -3586,6 +3614,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, reset_rsvds_bits_mask(vcpu, context); update_permission_bitmask(vcpu, context); + update_last_pte_bitmap(vcpu, context); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; @@ -3647,6 +3676,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, context); + update_last_pte_bitmap(vcpu, context); return 0; } @@ -3724,6 +3754,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) } update_permission_bitmask(vcpu, g_context); + update_last_pte_bitmap(vcpu, g_context); return 0; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 584660775d08..69871080e866 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -20,7 +20,8 @@ #define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) #define PT_DIRTY_SHIFT 6 #define PT_DIRTY_MASK (1ULL << PT_DIRTY_SHIFT) -#define PT_PAGE_SIZE_MASK (1ULL << 7) +#define PT_PAGE_SIZE_SHIFT 7 +#define PT_PAGE_SIZE_MASK (1ULL << PT_PAGE_SIZE_SHIFT) #define PT_PAT_MASK (1ULL << 7) #define PT_GLOBAL_MASK (1ULL << 8) #define PT64_NX_SHIFT 63 diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1b4c14d235a0..134ea7b1c585 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -103,24 +103,6 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, return (ret != orig_pte); } -static bool FNAME(is_last_gpte)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, - pt_element_t gpte) -{ - if (walker->level == PT_PAGE_TABLE_LEVEL) - return true; - - if ((walker->level == PT_DIRECTORY_LEVEL) && is_large_pte(gpte) && - (PTTYPE == 64 || is_pse(vcpu))) - return true; - - if ((walker->level == PT_PDPE_LEVEL) && is_large_pte(gpte) && - (mmu->root_level == PT64_ROOT_LEVEL)) - return true; - - return false; -} - static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, @@ -247,7 +229,7 @@ retry_walk: pte_access = pt_access & gpte_access(vcpu, pte); walker->ptes[walker->level - 1] = pte; - } while (!FNAME(is_last_gpte)(walker, vcpu, mmu, pte)); + } while (!is_last_gpte(mmu, walker->level, pte)); eperm |= permission_fault(mmu, pte_access, access); if (unlikely(eperm)) { -- cgit v1.2.3 From 71331a1da1e3a66d14bb3864f99e32d84ab5a76f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 14:49:15 +0300 Subject: KVM: MMU: Eliminate eperm temporary 'eperm' is no longer used in the walker loop, so we can eliminate it. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 134ea7b1c585..95a64d1dccc7 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -153,7 +153,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, pte_access; gpa_t pte_gpa; - bool eperm; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; @@ -165,7 +164,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: - eperm = false; walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); @@ -231,8 +229,7 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } while (!is_last_gpte(mmu, walker->level, pte)); - eperm |= permission_fault(mmu, pte_access, access); - if (unlikely(eperm)) { + if (unlikely(permission_fault(mmu, pte_access, access))) { errcode |= PFERR_PRESENT_MASK; goto error; } -- cgit v1.2.3 From b514c30f7729b66af481fde3c1225e832e339d5b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 16 Sep 2012 15:03:02 +0300 Subject: KVM: MMU: Avoid access/dirty update loop if all is well Keep track of accessed/dirty bits; if they are all set, do not enter the accessed/dirty update loop. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 95a64d1dccc7..810c1da2ee44 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -151,7 +151,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t pte; pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; - unsigned index, pt_access, pte_access; + unsigned index, pt_access, pte_access, accessed_dirty, shift; gpa_t pte_gpa; int offset; const int write_fault = access & PFERR_WRITE_MASK; @@ -180,6 +180,7 @@ retry_walk: ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); + accessed_dirty = PT_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; ++walker->level; @@ -224,6 +225,7 @@ retry_walk: goto error; } + accessed_dirty &= pte; pte_access = pt_access & gpte_access(vcpu, pte); walker->ptes[walker->level - 1] = pte; @@ -251,11 +253,23 @@ retry_walk: if (!write_fault) protect_clean_gpte(&pte_access, pte); - ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); - if (unlikely(ret < 0)) - goto error; - else if (ret) - goto retry_walk; + /* + * On a write fault, fold the dirty bit into accessed_dirty by shifting it one + * place right. + * + * On a read fault, do nothing. + */ + shift = write_fault >> ilog2(PFERR_WRITE_MASK); + shift *= PT_DIRTY_SHIFT - PT_ACCESSED_SHIFT; + accessed_dirty &= pte >> shift; + + if (unlikely(!accessed_dirty)) { + ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault); + if (unlikely(ret < 0)) + goto error; + else if (ret) + goto retry_walk; + } walker->pt_access = pt_access; walker->pte_access = pte_access; -- cgit v1.2.3 From c5421519f30bd5ed77857a78de6dc8414385e602 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 19 Sep 2012 19:33:48 +0300 Subject: KVM: MMU: Eliminate pointless temporary 'ac' 'ac' essentially reconstructs the 'access' variable we already have, except for the PFERR_PRESENT_MASK and PFERR_RSVD_MASK. As these are not used by callees, just use 'access' directly. Reviewed-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 810c1da2ee44..714e2c01a6fe 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -160,7 +160,6 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, u16 errcode = 0; gpa_t real_gpa; gfn_t gfn; - u32 ac; trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: @@ -242,9 +241,7 @@ retry_walk: if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); - ac = write_fault | fetch_fault | user_fault; - - real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), ac); + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access); if (real_gpa == UNMAPPED_GVA) return 0; -- cgit v1.2.3