From bf998156d24bcb127318ad5bf531ac3bdfcd6449 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Mon, 31 May 2010 14:28:19 +0800
Subject: KVM: Avoid killing userspace through guest SRAO MCE on unmapped pages

In common cases, guest SRAO MCE will cause corresponding poisoned page
be un-mapped and SIGBUS be sent to QEMU-KVM, then QEMU-KVM will relay
the MCE to guest OS.

But it is reported that if the poisoned page is accessed in guest
after unmapping and before MCE is relayed to guest OS, userspace will
be killed.

The reason is as follows. Because poisoned page has been un-mapped,
guest access will cause guest exit and kvm_mmu_page_fault will be
called. kvm_mmu_page_fault can not get the poisoned page for fault
address, so kernel and user space MMIO processing is tried in turn. In
user MMIO processing, poisoned page is accessed again, then userspace
is killed by force_sig_info.

To fix the bug, kvm_mmu_page_fault send HWPOISON signal to QEMU-KVM
and do not try kernel and user space MMIO processing for poisoned
page.

[xiao: fix warning introduced by avi]

Reported-by: Max Asbock <masbock@linux.vnet.ibm.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 34 ++++++++++++++++++++++++++--------
 arch/x86/kvm/paging_tmpl.h |  7 ++-----
 include/linux/kvm_host.h   |  1 +
 include/linux/mm.h         |  8 ++++++++
 mm/memory-failure.c        | 30 ++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c        | 30 ++++++++++++++++++++++++++++--
 6 files changed, 95 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b1ed0a1a5913..b666d8d106a9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -32,6 +32,7 @@
 #include <linux/compiler.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
@@ -1960,6 +1961,27 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 	return pt_write;
 }
 
+static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+{
+	char buf[1];
+	void __user *hva;
+	int r;
+
+	/* Touch the page, so send SIGBUS */
+	hva = (void __user *)gfn_to_hva(kvm, gfn);
+	r = copy_from_user(buf, hva, 1);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+	kvm_release_pfn_clean(pfn);
+	if (is_hwpoison_pfn(pfn)) {
+		kvm_send_hwpoison_signal(kvm, gfn);
+		return 0;
+	}
+	return 1;
+}
+
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
 	int r;
@@ -1983,10 +2005,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 
 	/* mmio */
-	if (is_error_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2198,10 +2218,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	if (is_error_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2331bdc2b549..c7f27779c998 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -431,11 +431,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
 
 	/* mmio */
-	if (is_error_pfn(pfn)) {
-		pgprintk("gfn %lx is mmio\n", walker.gfn);
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7cb116afa1cd..a0e019769f5d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -266,6 +266,7 @@ extern pfn_t bad_pfn;
 
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
+int is_hwpoison_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
 			  struct kvm_userspace_memory_region *mem,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a2b48041b910..7a9ab7db1975 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1465,6 +1465,14 @@ extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
 extern atomic_long_t mce_bad_pages;
 extern int soft_offline_page(struct page *page, int flags);
+#ifdef CONFIG_MEMORY_FAILURE
+int is_hwpoison_address(unsigned long addr);
+#else
+static inline int is_hwpoison_address(unsigned long addr)
+{
+	return 0;
+}
+#endif
 
 extern void dump_page(struct page *page);
 
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 620b0b461593..378b0f61fd3c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -45,6 +45,7 @@
 #include <linux/page-isolation.h>
 #include <linux/suspend.h>
 #include <linux/slab.h>
+#include <linux/swapops.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1296,3 +1297,32 @@ done:
 	/* keep elevated page count for bad page */
 	return ret;
 }
+
+int is_hwpoison_address(unsigned long addr)
+{
+	pgd_t *pgdp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pte_t pte, *ptep;
+	swp_entry_t entry;
+
+	pgdp = pgd_offset(current->mm, addr);
+	if (!pgd_present(*pgdp))
+		return 0;
+	pudp = pud_offset(pgdp, addr);
+	pud = *pudp;
+	if (!pud_present(pud) || pud_large(pud))
+		return 0;
+	pmdp = pmd_offset(pudp, addr);
+	pmd = *pmdp;
+	if (!pmd_present(pmd) || pmd_large(pmd))
+		return 0;
+	ptep = pte_offset_map(pmdp, addr);
+	pte = *ptep;
+	pte_unmap(ptep);
+	if (!is_swap_pte(pte))
+		return 0;
+	entry = pte_to_swp_entry(pte);
+	return is_hwpoison_entry(entry);
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_address);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f032806a212f..187aa8d984a7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -92,6 +92,9 @@ static bool kvm_rebooting;
 
 static bool largepages_enabled = true;
 
+struct page *hwpoison_page;
+pfn_t hwpoison_pfn;
+
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
@@ -810,16 +813,22 @@ EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 
 int is_error_page(struct page *page)
 {
-	return page == bad_page;
+	return page == bad_page || page == hwpoison_page;
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 
 int is_error_pfn(pfn_t pfn)
 {
-	return pfn == bad_pfn;
+	return pfn == bad_pfn || pfn == hwpoison_pfn;
 }
 EXPORT_SYMBOL_GPL(is_error_pfn);
 
+int is_hwpoison_pfn(pfn_t pfn)
+{
+	return pfn == hwpoison_pfn;
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
+
 static inline unsigned long bad_hva(void)
 {
 	return PAGE_OFFSET;
@@ -945,6 +954,11 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
 	if (unlikely(npages != 1)) {
 		struct vm_area_struct *vma;
 
+		if (is_hwpoison_address(addr)) {
+			get_page(hwpoison_page);
+			return page_to_pfn(hwpoison_page);
+		}
+
 		down_read(&current->mm->mmap_sem);
 		vma = find_vma(current->mm, addr);
 
@@ -2197,6 +2211,15 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
 	bad_pfn = page_to_pfn(bad_page);
 
+	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (hwpoison_page == NULL) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
+	hwpoison_pfn = page_to_pfn(hwpoison_page);
+
 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
 		r = -ENOMEM;
 		goto out_free_0;
@@ -2269,6 +2292,8 @@ out_free_1:
 out_free_0a:
 	free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
+	if (hwpoison_page)
+		__free_page(hwpoison_page);
 	__free_page(bad_page);
 out:
 	kvm_arch_exit();
@@ -2290,6 +2315,7 @@ void kvm_exit(void)
 	kvm_arch_hardware_unsetup();
 	kvm_arch_exit();
 	free_cpumask_var(cpus_hardware_enabled);
+	__free_page(hwpoison_page);
 	__free_page(bad_page);
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
-- 
cgit v1.2.3


From c332c83ae736c72dcf072e96e98a774fce39e722 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 4 May 2010 12:24:12 +0300
Subject: KVM: VMX: Simplify vmx_get_nmi_mask()

!! is not needed due to the cast to bool.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ee03679efe78..64252075796a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2826,9 +2826,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
 	if (!cpu_has_virtual_nmis())
 		return to_vmx(vcpu)->soft_vnmi_blocked;
-	else
-		return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-			  GUEST_INTR_STATE_NMI);
+	return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)	& GUEST_INTR_STATE_NMI;
 }
 
 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
-- 
cgit v1.2.3


From 914ebccd2d8fa439e01fe93b5229534b9e179a69 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 28 Apr 2010 18:50:36 +0900
Subject: KVM: x86: avoid unnecessary bitmap allocation when memslot is clean

Although we always allocate a new dirty bitmap in x86's get_dirty_log(),
it is only used as a zero-source of copy_to_user() and freed right after
that when memslot is clean. This patch uses clear_user() instead of doing
this unnecessary zero-source allocation.

Performance improvement: as we can expect easily, the time needed to
allocate a bitmap is completely reduced. In my test, the improved ioctl
was about 4 to 10 times faster than the original one for clean slots.
Furthermore, reducing memory allocations and copies will produce good
effects to caches too.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 37 +++++++++++++++++++++++--------------
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fa89c39c64f..1b270fd60634 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2797,7 +2797,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	struct kvm_memory_slot *memslot;
 	unsigned long n;
 	unsigned long is_dirty = 0;
-	unsigned long *dirty_bitmap = NULL;
 
 	mutex_lock(&kvm->slots_lock);
 
@@ -2812,27 +2811,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 	n = kvm_dirty_bitmap_bytes(memslot);
 
-	r = -ENOMEM;
-	dirty_bitmap = vmalloc(n);
-	if (!dirty_bitmap)
-		goto out;
-	memset(dirty_bitmap, 0, n);
-
 	for (i = 0; !is_dirty && i < n/sizeof(long); i++)
 		is_dirty = memslot->dirty_bitmap[i];
 
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
 		struct kvm_memslots *slots, *old_slots;
+		unsigned long *dirty_bitmap;
 
 		spin_lock(&kvm->mmu_lock);
 		kvm_mmu_slot_remove_write_access(kvm, log->slot);
 		spin_unlock(&kvm->mmu_lock);
 
-		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
-		if (!slots)
-			goto out_free;
+		r = -ENOMEM;
+		dirty_bitmap = vmalloc(n);
+		if (!dirty_bitmap)
+			goto out;
+		memset(dirty_bitmap, 0, n);
 
+		r = -ENOMEM;
+		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+		if (!slots) {
+			vfree(dirty_bitmap);
+			goto out;
+		}
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
 
@@ -2841,13 +2843,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		synchronize_srcu_expedited(&kvm->srcu);
 		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
 		kfree(old_slots);
+
+		r = -EFAULT;
+		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
+			vfree(dirty_bitmap);
+			goto out;
+		}
+		vfree(dirty_bitmap);
+	} else {
+		r = -EFAULT;
+		if (clear_user(log->dirty_bitmap, n))
+			goto out;
 	}
 
 	r = 0;
-	if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
-		r = -EFAULT;
-out_free:
-	vfree(dirty_bitmap);
 out:
 	mutex_unlock(&kvm->slots_lock);
 	return r;
-- 
cgit v1.2.3


From 08acfa187117046f8b5044b4a4cdc910f3ceeeb5 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 4 May 2010 13:00:55 +0300
Subject: KVM: kvm_pdptr_read() may sleep

Annotate it thusly.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/kvm_cache_regs.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index cff851cf5322..d2a98f8f9af5 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
 
 static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
 {
+	might_sleep();  /* on svm */
+
 	if (!test_bit(VCPU_EXREG_PDPTR,
 		      (unsigned long *)&vcpu->arch.regs_avail))
 		kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
-- 
cgit v1.2.3


From 1c11e713576edf33b95669be9c2dc0ff1e0c90d3 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 3 May 2010 16:05:44 +0300
Subject: KVM: VMX: Avoid writing HOST_CR0 every entry

cr0.ts may change between entries, so we copy cr0 to HOST_CR0 before each
entry.  That is slow, so instead, set HOST_CR0 to have TS set unconditionally
(which is a safe value), and issue a clts() just before exiting vcpu context
if the task indeed owns the fpu.

Saves ~50 cycles/exit.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 9 +++------
 arch/x86/kvm/x86.c | 2 +-
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 64252075796a..598931734251 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -812,6 +812,8 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 	}
 #endif
+	if (current_thread_info()->status & TS_USEDFPU)
+		clts();
 }
 
 static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -2507,7 +2509,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
 	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
 
-	vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
+	vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
 	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
 	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
 
@@ -3859,11 +3861,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 
-	/*
-	 * Loading guest fpu may have cleared host cr0.ts
-	 */
-	vmcs_writel(HOST_CR0, read_cr0());
-
 	asm(
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b270fd60634..801afc6461ed 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1731,8 +1731,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
-	kvm_put_guest_fpu(vcpu);
 	kvm_x86_ops->vcpu_put(vcpu);
+	kvm_put_guest_fpu(vcpu);
 }
 
 static int is_efer_nx(void)
-- 
cgit v1.2.3


From 9de41573675cbace09b02ef386f3e9c8739d495c Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:22 +0300
Subject: KVM: x86 emulator: introduce read cache

Introduce read cache which is needed for instruction that require more
then one exit to userspace. After returning from userspace the instruction
will be re-executed with cached read value.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/kvm/emulate.c             | 56 ++++++++++++++++++++++++++++----------
 2 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0b2729bf2070..288cbedcab1c 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -186,6 +186,7 @@ struct decode_cache {
 	unsigned long modrm_val;
 	struct fetch_cache fetch;
 	struct read_cache io_read;
+	struct read_cache mem_read;
 };
 
 struct x86_emulate_ctxt {
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 5ac0bb465ed6..776874b8e50e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1263,6 +1263,33 @@ done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+			 struct x86_emulate_ops *ops,
+			 unsigned long addr, void *dest, unsigned size)
+{
+	int rc;
+	struct read_cache *mc = &ctxt->decode.mem_read;
+
+	while (size) {
+		int n = min(size, 8u);
+		size -= n;
+		if (mc->pos < mc->end)
+			goto read_cached;
+
+		rc = ops->read_emulated(addr, mc->data + mc->end, n, ctxt->vcpu);
+		if (rc != X86EMUL_CONTINUE)
+			return rc;
+		mc->end += n;
+
+	read_cached:
+		memcpy(dest, mc->data + mc->pos, n);
+		mc->pos += n;
+		dest += n;
+		addr += n;
+	}
+	return X86EMUL_CONTINUE;
+}
+
 static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 			   struct x86_emulate_ops *ops,
 			   unsigned int size, unsigned short port,
@@ -1504,9 +1531,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	rc = ops->read_emulated(register_address(c, ss_base(ctxt),
-						 c->regs[VCPU_REGS_RSP]),
-				dest, len, ctxt->vcpu);
+	rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt),
+						       c->regs[VCPU_REGS_RSP]),
+			   dest, len);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
@@ -2475,6 +2502,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	int saved_dst_type = c->dst.type;
 
 	ctxt->interruptibility = 0;
+	ctxt->decode.mem_read.pos = 0;
 
 	/* Shadow copy of register state. Committed on successful emulation.
 	 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
@@ -2529,20 +2557,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	if (c->src.type == OP_MEM) {
-		rc = ops->read_emulated((unsigned long)c->src.ptr,
-					&c->src.val,
-					c->src.bytes,
-					ctxt->vcpu);
+		rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
+					&c->src.val, c->src.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		c->src.orig_val = c->src.val;
 	}
 
 	if (c->src2.type == OP_MEM) {
-		rc = ops->read_emulated((unsigned long)c->src2.ptr,
-					&c->src2.val,
-					c->src2.bytes,
-					ctxt->vcpu);
+		rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr,
+					&c->src2.val, c->src2.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
@@ -2553,8 +2577,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
 	if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
 		/* optimisation - avoid slow emulated read if Mov */
-		rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
-					c->dst.bytes, ctxt->vcpu);
+		rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr,
+				   &c->dst.val, c->dst.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 	}
@@ -2981,7 +3005,11 @@ writeback:
 		    (rc->end != 0 && rc->end == rc->pos))
 			ctxt->restart = false;
 	}
-
+	/*
+	 * reset read cache here in case string instruction is restared
+	 * without decoding
+	 */
+	ctxt->decode.mem_read.end = 0;
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(ctxt->vcpu, c->eip);
-- 
cgit v1.2.3


From 054fe9f6e3b76877516b37ac7d83d58c7f37c1b6 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:23 +0300
Subject: KVM: x86 emulator: fix Move r/m16 to segment register decoding

This instruction does not need generic decoding for its dst operand.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 776874b8e50e..a81e6bfcade7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -171,7 +171,7 @@ static u32 opcode_table[256] = {
 	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
 	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
 	DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
-	DstReg | SrcMem | ModRM | Mov, Group | Group1A,
+	ImplicitOps | SrcMem | ModRM, Group | Group1A,
 	/* 0x90 - 0x97 */
 	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
 	/* 0x98 - 0x9F */
-- 
cgit v1.2.3


From f0c13ef1a8f31be08bf1b1244fe4565f11f4b009 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:24 +0300
Subject: KVM: x86 emulator: cleanup xchg emulation

Dst operand is already initialized during decoding stage. No need to
reinitialize.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a81e6bfcade7..a99d49cc8934 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2804,8 +2804,8 @@ special_insn:
 			break;
 		}
 	case 0x91 ... 0x97: /* xchg reg,rax */
-		c->src.type = c->dst.type = OP_REG;
-		c->src.bytes = c->dst.bytes = c->op_bytes;
+		c->src.type = OP_REG;
+		c->src.bytes = c->op_bytes;
 		c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
 		c->src.val = *(c->src.ptr);
 		goto xchg;
-- 
cgit v1.2.3


From b8a98945ea5b735e083eaf92906aa0ff9ece92e8 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:25 +0300
Subject: KVM: x86 emulator: cleanup nop emulation

Make it more explicit what we are checking for.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a99d49cc8934..03a72912d7b9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2799,8 +2799,8 @@ special_insn:
 			goto done;
 		break;
 	case 0x90: /* nop / xchg r8,rax */
-		if (!(c->rex_prefix & 1)) { /* nop */
-			c->dst.type = OP_NONE;
+		if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) {
+			c->dst.type = OP_NONE;  /* nop */
 			break;
 		}
 	case 0x91 ... 0x97: /* xchg reg,rax */
-- 
cgit v1.2.3


From 414e6277fd148f6470261cef50a7fed0d88a2825 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:26 +0300
Subject: KVM: x86 emulator: handle "far address" source operand

ljmp/lcall instruction operand contains address and segment.
It can be 10 bytes long. Currently we decode it as two different
operands. Fix it by introducing new kind of operand that can hold
entire far address.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  6 +++-
 arch/x86/kvm/emulate.c             | 56 ++++++++++++++++++++++----------------
 2 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 288cbedcab1c..69a64a6a36f4 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -143,7 +143,11 @@ struct x86_emulate_ops {
 struct operand {
 	enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
 	unsigned int bytes;
-	unsigned long val, orig_val, *ptr;
+	unsigned long orig_val, *ptr;
+	union {
+		unsigned long val;
+		char valptr[sizeof(unsigned long) + 2];
+	};
 };
 
 struct fetch_cache {
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 03a72912d7b9..687ea0906b79 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -67,6 +67,8 @@
 #define SrcImmUByte (8<<4)      /* 8-bit unsigned immediate operand. */
 #define SrcImmU     (9<<4)      /* Immediate operand, unsigned */
 #define SrcSI       (0xa<<4)	/* Source is in the DS:RSI */
+#define SrcImmFAddr (0xb<<4)	/* Source is immediate far address */
+#define SrcMemFAddr (0xc<<4)	/* Source is far address in memory */
 #define SrcMask     (0xf<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<8)
@@ -88,10 +90,6 @@
 #define Src2CL      (1<<29)
 #define Src2ImmByte (2<<29)
 #define Src2One     (3<<29)
-#define Src2Imm16   (4<<29)
-#define Src2Mem16   (5<<29) /* Used for Ep encoding. First argument has to be
-			       in memory and second argument is located
-			       immediately after the first one in memory. */
 #define Src2Mask    (7<<29)
 
 enum {
@@ -175,7 +173,7 @@ static u32 opcode_table[256] = {
 	/* 0x90 - 0x97 */
 	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
 	/* 0x98 - 0x9F */
-	0, 0, SrcImm | Src2Imm16 | No64, 0,
+	0, 0, SrcImmFAddr | No64, 0,
 	ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
 	/* 0xA0 - 0xA7 */
 	ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -215,7 +213,7 @@ static u32 opcode_table[256] = {
 	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
 	/* 0xE8 - 0xEF */
 	SrcImm | Stack, SrcImm | ImplicitOps,
-	SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
+	SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
 	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
 	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
 	/* 0xF0 - 0xF7 */
@@ -350,7 +348,7 @@ static u32 group_table[] = {
 	[Group5*8] =
 	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 	SrcMem | ModRM | Stack, 0,
-	SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
+	SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
 	SrcMem | ModRM | Stack, 0,
 	[Group7*8] =
 	0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
@@ -576,6 +574,13 @@ static u32 group2_table[] = {
 	(_type)_x;							\
 })
 
+#define insn_fetch_arr(_arr, _size, _eip)                                \
+({	rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size));		\
+	if (rc != X86EMUL_CONTINUE)					\
+		goto done;						\
+	(_eip) += (_size);						\
+})
+
 static inline unsigned long ad_mask(struct decode_cache *c)
 {
 	return (1UL << (c->ad_bytes << 3)) - 1;
@@ -1160,6 +1165,17 @@ done_prefixes:
 					 c->regs[VCPU_REGS_RSI]);
 		c->src.val = 0;
 		break;
+	case SrcImmFAddr:
+		c->src.type = OP_IMM;
+		c->src.ptr = (unsigned long *)c->eip;
+		c->src.bytes = c->op_bytes + 2;
+		insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+		break;
+	case SrcMemFAddr:
+		c->src.type = OP_MEM;
+		c->src.ptr = (unsigned long *)c->modrm_ea;
+		c->src.bytes = c->op_bytes + 2;
+		break;
 	}
 
 	/*
@@ -1179,22 +1195,10 @@ done_prefixes:
 		c->src2.bytes = 1;
 		c->src2.val = insn_fetch(u8, 1, c->eip);
 		break;
-	case Src2Imm16:
-		c->src2.type = OP_IMM;
-		c->src2.ptr = (unsigned long *)c->eip;
-		c->src2.bytes = 2;
-		c->src2.val = insn_fetch(u16, 2, c->eip);
-		break;
 	case Src2One:
 		c->src2.bytes = 1;
 		c->src2.val = 1;
 		break;
-	case Src2Mem16:
-		c->src2.type = OP_MEM;
-		c->src2.bytes = 2;
-		c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
-		c->src2.val = 0;
-		break;
 	}
 
 	/* Decode and fetch the destination operand: register or memory. */
@@ -2558,7 +2562,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
 	if (c->src.type == OP_MEM) {
 		rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
-					&c->src.val, c->src.bytes);
+					c->src.valptr, c->src.bytes);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		c->src.orig_val = c->src.val;
@@ -2884,14 +2888,18 @@ special_insn:
 	}
 	case 0xe9: /* jmp rel */
 		goto jmp;
-	case 0xea: /* jmp far */
+	case 0xea: { /* jmp far */
+		unsigned short sel;
 	jump_far:
-		if (load_segment_descriptor(ctxt, ops, c->src2.val,
-					    VCPU_SREG_CS))
+		memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+
+		if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
 			goto done;
 
-		c->eip = c->src.val;
+		c->eip = 0;
+		memcpy(&c->eip, c->src.valptr, c->op_bytes);
 		break;
+	}
 	case 0xeb:
 	      jmp:		/* jmp rel short */
 		jmp_rel(c, c->src.val);
-- 
cgit v1.2.3


From 35aa5375d407ecadcc3adb5cb31d27044bf7f29f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:27 +0300
Subject: KVM: x86 emulator: add (set|get)_dr callbacks to x86_emulate_ops

Add (set|get)_dr callbacks to x86_emulate_ops instead of calling
them directly.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  2 ++
 arch/x86/include/asm/kvm_host.h    |  4 ----
 arch/x86/kvm/emulate.c             |  7 +++++--
 arch/x86/kvm/x86.c                 | 12 ++++++------
 4 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 69a64a6a36f4..c37296d0e909 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -137,6 +137,8 @@ struct x86_emulate_ops {
 	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
 	int (*cpl)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+	int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
+	int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
 };
 
 /* Type, address-of, and value of an instruction's operand. */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 76f5483cffec..97774ae3c874 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -591,10 +591,6 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
 int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
-		    unsigned long *dest);
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
-		    unsigned long value);
 
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 687ea0906b79..8a4aa73ff1e4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3132,7 +3132,7 @@ twobyte_insn:
 			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
 			goto done;
 		}
-		emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+		ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x22: /* mov reg, cr */
@@ -3145,7 +3145,10 @@ twobyte_insn:
 			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
 			goto done;
 		}
-		emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
+
+		ops->set_dr(c->modrm_reg,c->regs[c->modrm_rm] &
+			    ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U),
+			ctxt->vcpu);
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x30:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 801afc6461ed..059d63de169b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3620,16 +3620,14 @@ int emulate_clts(struct kvm_vcpu *vcpu)
 	return X86EMUL_CONTINUE;
 }
 
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
 {
-	return kvm_get_dr(ctxt->vcpu, dr, dest);
+	return kvm_get_dr(vcpu, dr, dest);
 }
 
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
 {
-	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
-
-	return kvm_set_dr(ctxt->vcpu, dr, value & mask);
+	return kvm_set_dr(vcpu, dr, value);
 }
 
 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -3811,6 +3809,8 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_cr              = emulator_set_cr,
 	.cpl                 = emulator_get_cpl,
 	.set_rflags          = emulator_set_rflags,
+	.get_dr              = emulator_get_dr,
+	.set_dr              = emulator_set_dr,
 };
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 3fb1b5dbd397d16a855c97c3fb80fe6e9196ce7c Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:28 +0300
Subject: KVM: x86 emulator: add (set|get)_msr callbacks to x86_emulate_ops

Add (set|get)_msr callbacks to x86_emulate_ops instead of calling
them directly.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  2 ++
 arch/x86/kvm/emulate.c             | 36 ++++++++++++++++++------------------
 arch/x86/kvm/x86.c                 |  2 ++
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index c37296d0e909..f751657be732 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -139,6 +139,8 @@ struct x86_emulate_ops {
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 	int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
 	int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
+	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
 };
 
 /* Type, address-of, and value of an instruction's operand. */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8a4aa73ff1e4..7c8ed560fd41 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1875,7 +1875,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 }
 
 static int
-emulate_syscall(struct x86_emulate_ctxt *ctxt)
+emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
 	struct kvm_segment cs, ss;
@@ -1890,7 +1890,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
 
 	setup_syscalls_segments(ctxt, &cs, &ss);
 
-	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+	ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
 	msr_data >>= 32;
 	cs.selector = (u16)(msr_data & 0xfffc);
 	ss.selector = (u16)(msr_data + 8);
@@ -1907,17 +1907,17 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
 #ifdef CONFIG_X86_64
 		c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
 
-		kvm_x86_ops->get_msr(ctxt->vcpu,
-			ctxt->mode == X86EMUL_MODE_PROT64 ?
-			MSR_LSTAR : MSR_CSTAR, &msr_data);
+		ops->get_msr(ctxt->vcpu,
+			     ctxt->mode == X86EMUL_MODE_PROT64 ?
+			     MSR_LSTAR : MSR_CSTAR, &msr_data);
 		c->eip = msr_data;
 
-		kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+		ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
 		ctxt->eflags &= ~(msr_data | EFLG_RF);
 #endif
 	} else {
 		/* legacy mode */
-		kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+		ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
 		c->eip = (u32)msr_data;
 
 		ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -1927,7 +1927,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
 }
 
 static int
-emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
 	struct kvm_segment cs, ss;
@@ -1949,7 +1949,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
 
 	setup_syscalls_segments(ctxt, &cs, &ss);
 
-	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
 	switch (ctxt->mode) {
 	case X86EMUL_MODE_PROT32:
 		if ((msr_data & 0xfffc) == 0x0) {
@@ -1979,17 +1979,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
 	kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
 	kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
 
-	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
 	c->eip = msr_data;
 
-	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
 	c->regs[VCPU_REGS_RSP] = msr_data;
 
 	return X86EMUL_CONTINUE;
 }
 
 static int
-emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
 	struct kvm_segment cs, ss;
@@ -2012,7 +2012,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
 
 	cs.dpl = 3;
 	ss.dpl = 3;
-	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
 	switch (usermode) {
 	case X86EMUL_MODE_PROT32:
 		cs.selector = (u16)(msr_data + 16);
@@ -3099,7 +3099,7 @@ twobyte_insn:
 		}
 		break;
 	case 0x05: 		/* syscall */
-		rc = emulate_syscall(ctxt);
+		rc = emulate_syscall(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		else
@@ -3155,7 +3155,7 @@ twobyte_insn:
 		/* wrmsr */
 		msr_data = (u32)c->regs[VCPU_REGS_RAX]
 			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
-		if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
+		if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		}
@@ -3164,7 +3164,7 @@ twobyte_insn:
 		break;
 	case 0x32:
 		/* rdmsr */
-		if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
+		if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			goto done;
 		} else {
@@ -3175,14 +3175,14 @@ twobyte_insn:
 		c->dst.type = OP_NONE;
 		break;
 	case 0x34:		/* sysenter */
-		rc = emulate_sysenter(ctxt);
+		rc = emulate_sysenter(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		else
 			goto writeback;
 		break;
 	case 0x35:		/* sysexit */
-		rc = emulate_sysexit(ctxt);
+		rc = emulate_sysexit(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		else
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 059d63de169b..e3a5455049b0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3811,6 +3811,8 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_rflags          = emulator_set_rflags,
 	.get_dr              = emulator_get_dr,
 	.set_dr              = emulator_set_dr,
+	.set_msr             = kvm_set_msr,
+	.get_msr             = kvm_get_msr,
 };
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 5951c4423724759906b10a26aa6a8817c4afa615 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:29 +0300
Subject: KVM: x86 emulator: add get_cached_segment_base() callback to
 x86_emulate_ops

On VMX it is expensive to call get_cached_descriptor() just to get segment
base since multiple vmcs_reads are done instead of only one. Introduce
new call back get_cached_segment_base() for efficiency.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/kvm/emulate.c             | 13 +------------
 arch/x86/kvm/x86.c                 |  7 +++++++
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index f751657be732..df53ba2294b6 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -132,6 +132,7 @@ struct x86_emulate_ops {
 				      int seg, struct kvm_vcpu *vcpu);
 	u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
 	void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
+	unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
 	void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
 	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7c8ed560fd41..8228778ace38 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2097,17 +2097,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 	return true;
 }
 
-static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
-				      struct x86_emulate_ops *ops,
-				      int seg)
-{
-	struct desc_struct desc;
-	if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
-		return get_desc_base(&desc);
-	else
-		return ~0;
-}
-
 static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
 				struct x86_emulate_ops *ops,
 				struct tss_segment_16 *tss)
@@ -2383,7 +2372,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	int ret;
 	u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
 	ulong old_tss_base =
-		get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+		ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
 	u32 desc_limit;
 
 	/* FIXME: old_tss_base == ~0 ? */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e3a5455049b0..9a469df6011c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3714,6 +3714,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
 	kvm_x86_ops->get_gdt(vcpu, dt);
 }
 
+static unsigned long emulator_get_cached_segment_base(int seg,
+						      struct kvm_vcpu *vcpu)
+{
+	return get_segment_base(vcpu, seg);
+}
+
 static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
 					   struct kvm_vcpu *vcpu)
 {
@@ -3804,6 +3810,7 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_cached_descriptor = emulator_set_cached_descriptor,
 	.get_segment_selector = emulator_get_segment_selector,
 	.set_segment_selector = emulator_set_segment_selector,
+	.get_cached_segment_base = emulator_get_cached_segment_base,
 	.get_gdt             = emulator_get_gdt,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
-- 
cgit v1.2.3


From 79168fd1a307ffee46ee03b7f8711559241738c7 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:30 +0300
Subject: KVM: x86 emulator: cleanup some direct calls into kvm to use existing
 callbacks

Use callbacks from x86_emulate_ops to access segments instead of calling
into kvm directly.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 200 ++++++++++++++++++++++++++-----------------------
 1 file changed, 105 insertions(+), 95 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8228778ace38..f56ec486393e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -622,31 +622,35 @@ static void set_seg_override(struct decode_cache *c, int seg)
 	c->seg_override = seg;
 }
 
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
+			      struct x86_emulate_ops *ops, int seg)
 {
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
 		return 0;
 
-	return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
+	return ops->get_cached_segment_base(seg, ctxt->vcpu);
 }
 
 static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
+				       struct x86_emulate_ops *ops,
 				       struct decode_cache *c)
 {
 	if (!c->has_seg_override)
 		return 0;
 
-	return seg_base(ctxt, c->seg_override);
+	return seg_base(ctxt, ops, c->seg_override);
 }
 
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
+static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
+			     struct x86_emulate_ops *ops)
 {
-	return seg_base(ctxt, VCPU_SREG_ES);
+	return seg_base(ctxt, ops, VCPU_SREG_ES);
 }
 
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
+static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
+			     struct x86_emulate_ops *ops)
 {
-	return seg_base(ctxt, VCPU_SREG_SS);
+	return seg_base(ctxt, ops, VCPU_SREG_SS);
 }
 
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -941,7 +945,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	memset(c, 0, sizeof(struct decode_cache));
 	c->eip = ctxt->eip;
 	c->fetch.start = c->fetch.end = c->eip;
-	ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
+	ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
 	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 
 	switch (mode) {
@@ -1065,7 +1069,7 @@ done_prefixes:
 		set_seg_override(c, VCPU_SREG_DS);
 
 	if (!(!c->twobyte && c->b == 0x8d))
-		c->modrm_ea += seg_override_base(ctxt, c);
+		c->modrm_ea += seg_override_base(ctxt, ops, c);
 
 	if (c->ad_bytes != 8)
 		c->modrm_ea = (u32)c->modrm_ea;
@@ -1161,7 +1165,7 @@ done_prefixes:
 		c->src.type = OP_MEM;
 		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->src.ptr = (unsigned long *)
-			register_address(c,  seg_override_base(ctxt, c),
+			register_address(c,  seg_override_base(ctxt, ops, c),
 					 c->regs[VCPU_REGS_RSI]);
 		c->src.val = 0;
 		break;
@@ -1257,7 +1261,7 @@ done_prefixes:
 		c->dst.type = OP_MEM;
 		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->dst.ptr = (unsigned long *)
-			register_address(c, es_base(ctxt),
+			register_address(c, es_base(ctxt, ops),
 					 c->regs[VCPU_REGS_RDI]);
 		c->dst.val = 0;
 		break;
@@ -1516,7 +1520,8 @@ exception:
 	return X86EMUL_PROPAGATE_FAULT;
 }
 
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
+				struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
 
@@ -1524,7 +1529,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
 	c->dst.bytes = c->op_bytes;
 	c->dst.val = c->src.val;
 	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-	c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
+	c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
 					       c->regs[VCPU_REGS_RSP]);
 }
 
@@ -1535,7 +1540,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt),
+	rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
 						       c->regs[VCPU_REGS_RSP]),
 			   dest, len);
 	if (rc != X86EMUL_CONTINUE)
@@ -1588,15 +1593,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
-static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
+			      struct x86_emulate_ops *ops, int seg)
 {
 	struct decode_cache *c = &ctxt->decode;
-	struct kvm_segment segment;
 
-	kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
+	c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
 
-	c->src.val = segment.selector;
-	emulate_push(ctxt);
+	emulate_push(ctxt, ops);
 }
 
 static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1614,7 +1618,8 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
-static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
+static void emulate_pusha(struct x86_emulate_ctxt *ctxt,
+			  struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
 	unsigned long old_esp = c->regs[VCPU_REGS_RSP];
@@ -1624,7 +1629,7 @@ static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
 		(reg == VCPU_REGS_RSP) ?
 		(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
 
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		++reg;
 	}
 }
@@ -1726,14 +1731,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
 		old_eip = c->eip;
 		c->eip = c->src.val;
 		c->src.val = old_eip;
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		break;
 	}
 	case 4: /* jmp abs */
 		c->eip = c->src.val;
 		break;
 	case 6:	/* push */
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		break;
 	}
 	return X86EMUL_CONTINUE;
@@ -1847,39 +1852,40 @@ static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
 
 static inline void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
-	struct kvm_segment *cs, struct kvm_segment *ss)
+			struct x86_emulate_ops *ops, struct desc_struct *cs,
+			struct desc_struct *ss)
 {
-	memset(cs, 0, sizeof(struct kvm_segment));
-	kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
-	memset(ss, 0, sizeof(struct kvm_segment));
+	memset(cs, 0, sizeof(struct desc_struct));
+	ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
+	memset(ss, 0, sizeof(struct desc_struct));
 
 	cs->l = 0;		/* will be adjusted later */
-	cs->base = 0;		/* flat segment */
+	set_desc_base(cs, 0);	/* flat segment */
 	cs->g = 1;		/* 4kb granularity */
-	cs->limit = 0xffffffff;	/* 4GB limit */
+	set_desc_limit(cs, 0xfffff);	/* 4GB limit */
 	cs->type = 0x0b;	/* Read, Execute, Accessed */
 	cs->s = 1;
 	cs->dpl = 0;		/* will be adjusted later */
-	cs->present = 1;
-	cs->db = 1;
+	cs->p = 1;
+	cs->d = 1;
 
-	ss->unusable = 0;
-	ss->base = 0;		/* flat segment */
-	ss->limit = 0xffffffff;	/* 4GB limit */
+	set_desc_base(ss, 0);	/* flat segment */
+	set_desc_limit(ss, 0xfffff);	/* 4GB limit */
 	ss->g = 1;		/* 4kb granularity */
 	ss->s = 1;
 	ss->type = 0x03;	/* Read/Write, Accessed */
-	ss->db = 1;		/* 32bit stack segment */
+	ss->d = 1;		/* 32bit stack segment */
 	ss->dpl = 0;
-	ss->present = 1;
+	ss->p = 1;
 }
 
 static int
 emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	struct kvm_segment cs, ss;
+	struct desc_struct cs, ss;
 	u64 msr_data;
+	u16 cs_sel, ss_sel;
 
 	/* syscall is not available in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
@@ -1888,19 +1894,21 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 
-	setup_syscalls_segments(ctxt, &cs, &ss);
+	setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
 	ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
 	msr_data >>= 32;
-	cs.selector = (u16)(msr_data & 0xfffc);
-	ss.selector = (u16)(msr_data + 8);
+	cs_sel = (u16)(msr_data & 0xfffc);
+	ss_sel = (u16)(msr_data + 8);
 
 	if (is_long_mode(ctxt->vcpu)) {
-		cs.db = 0;
+		cs.d = 0;
 		cs.l = 1;
 	}
-	kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
-	kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
 	c->regs[VCPU_REGS_RCX] = c->eip;
 	if (is_long_mode(ctxt->vcpu)) {
@@ -1930,8 +1938,9 @@ static int
 emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	struct kvm_segment cs, ss;
+	struct desc_struct cs, ss;
 	u64 msr_data;
+	u16 cs_sel, ss_sel;
 
 	/* inject #GP if in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL) {
@@ -1947,7 +1956,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 
-	setup_syscalls_segments(ctxt, &cs, &ss);
+	setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
 	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
 	switch (ctxt->mode) {
@@ -1966,18 +1975,20 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 
 	ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
-	cs.selector = (u16)msr_data;
-	cs.selector &= ~SELECTOR_RPL_MASK;
-	ss.selector = cs.selector + 8;
-	ss.selector &= ~SELECTOR_RPL_MASK;
+	cs_sel = (u16)msr_data;
+	cs_sel &= ~SELECTOR_RPL_MASK;
+	ss_sel = cs_sel + 8;
+	ss_sel &= ~SELECTOR_RPL_MASK;
 	if (ctxt->mode == X86EMUL_MODE_PROT64
 		|| is_long_mode(ctxt->vcpu)) {
-		cs.db = 0;
+		cs.d = 0;
 		cs.l = 1;
 	}
 
-	kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
-	kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
 	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
 	c->eip = msr_data;
@@ -1992,9 +2003,10 @@ static int
 emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
-	struct kvm_segment cs, ss;
+	struct desc_struct cs, ss;
 	u64 msr_data;
 	int usermode;
+	u16 cs_sel, ss_sel;
 
 	/* inject #GP if in real mode or Virtual 8086 mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
@@ -2003,7 +2015,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 
-	setup_syscalls_segments(ctxt, &cs, &ss);
+	setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
 	if ((c->rex_prefix & 0x8) != 0x0)
 		usermode = X86EMUL_MODE_PROT64;
@@ -2015,29 +2027,31 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
 	switch (usermode) {
 	case X86EMUL_MODE_PROT32:
-		cs.selector = (u16)(msr_data + 16);
+		cs_sel = (u16)(msr_data + 16);
 		if ((msr_data & 0xfffc) == 0x0) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 		}
-		ss.selector = (u16)(msr_data + 24);
+		ss_sel = (u16)(msr_data + 24);
 		break;
 	case X86EMUL_MODE_PROT64:
-		cs.selector = (u16)(msr_data + 32);
+		cs_sel = (u16)(msr_data + 32);
 		if (msr_data == 0x0) {
 			kvm_inject_gp(ctxt->vcpu, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 		}
-		ss.selector = cs.selector + 8;
-		cs.db = 0;
+		ss_sel = cs_sel + 8;
+		cs.d = 0;
 		cs.l = 1;
 		break;
 	}
-	cs.selector |= SELECTOR_RPL_MASK;
-	ss.selector |= SELECTOR_RPL_MASK;
+	cs_sel |= SELECTOR_RPL_MASK;
+	ss_sel |= SELECTOR_RPL_MASK;
 
-	kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
-	kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
 	c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
 	c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
@@ -2061,25 +2075,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
 					    struct x86_emulate_ops *ops,
 					    u16 port, u16 len)
 {
-	struct kvm_segment tr_seg;
+	struct desc_struct tr_seg;
 	int r;
 	u16 io_bitmap_ptr;
 	u8 perm, bit_idx = port & 0x7;
 	unsigned mask = (1 << len) - 1;
 
-	kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
-	if (tr_seg.unusable)
+	ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
+	if (!tr_seg.p)
 		return false;
-	if (tr_seg.limit < 103)
+	if (desc_limit_scaled(&tr_seg) < 103)
 		return false;
-	r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
-			  NULL);
+	r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
+			  ctxt->vcpu, NULL);
 	if (r != X86EMUL_CONTINUE)
 		return false;
-	if (io_bitmap_ptr + port/8 > tr_seg.limit)
+	if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
 		return false;
-	r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
-			  ctxt->vcpu, NULL);
+	r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
+			  &perm, 1, ctxt->vcpu, NULL);
 	if (r != X86EMUL_CONTINUE)
 		return false;
 	if ((perm >> bit_idx) & mask)
@@ -2445,7 +2459,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 		c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
 		c->lock_prefix = 0;
 		c->src.val = (unsigned long) error_code;
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 	}
 
 	return ret;
@@ -2588,7 +2602,7 @@ special_insn:
 		emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
 		break;
 	case 0x06:		/* push es */
-		emulate_push_sreg(ctxt, VCPU_SREG_ES);
+		emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
 		break;
 	case 0x07:		/* pop es */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
@@ -2600,14 +2614,14 @@ special_insn:
 		emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
 		break;
 	case 0x0e:		/* push cs */
-		emulate_push_sreg(ctxt, VCPU_SREG_CS);
+		emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
 		break;
 	case 0x10 ... 0x15:
 	      adc:		/* adc */
 		emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
 		break;
 	case 0x16:		/* push ss */
-		emulate_push_sreg(ctxt, VCPU_SREG_SS);
+		emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
 		break;
 	case 0x17:		/* pop ss */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
@@ -2619,7 +2633,7 @@ special_insn:
 		emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
 		break;
 	case 0x1e:		/* push ds */
-		emulate_push_sreg(ctxt, VCPU_SREG_DS);
+		emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
 		break;
 	case 0x1f:		/* pop ds */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
@@ -2649,7 +2663,7 @@ special_insn:
 		emulate_1op("dec", c->dst, ctxt->eflags);
 		break;
 	case 0x50 ... 0x57:  /* push reg */
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		break;
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:
@@ -2658,7 +2672,7 @@ special_insn:
 			goto done;
 		break;
 	case 0x60:	/* pusha */
-		emulate_pusha(ctxt);
+		emulate_pusha(ctxt, ops);
 		break;
 	case 0x61:	/* popa */
 		rc = emulate_popa(ctxt, ops);
@@ -2672,7 +2686,7 @@ special_insn:
 		break;
 	case 0x68: /* push imm */
 	case 0x6a: /* push imm8 */
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		break;
 	case 0x6c:		/* insb */
 	case 0x6d:		/* insw/insd */
@@ -2752,18 +2766,13 @@ special_insn:
 		break;
 	case 0x88 ... 0x8b:	/* mov */
 		goto mov;
-	case 0x8c: { /* mov r/m, sreg */
-		struct kvm_segment segreg;
-
-		if (c->modrm_reg <= VCPU_SREG_GS)
-			kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
-		else {
+	case 0x8c:  /* mov r/m, sreg */
+		if (c->modrm_reg > VCPU_SREG_GS) {
 			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
 			goto done;
 		}
-		c->dst.val = segreg.selector;
+		c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
 		break;
-	}
 	case 0x8d: /* lea r16/r32, m */
 		c->dst.val = c->modrm_ea;
 		break;
@@ -2804,7 +2813,7 @@ special_insn:
 		goto xchg;
 	case 0x9c: /* pushf */
 		c->src.val =  (unsigned long) ctxt->eflags;
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		break;
 	case 0x9d: /* popf */
 		c->dst.type = OP_REG;
@@ -2872,7 +2881,7 @@ special_insn:
 		long int rel = c->src.val;
 		c->src.val = (unsigned long) c->eip;
 		jmp_rel(c, rel);
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		break;
 	}
 	case 0xe9: /* jmp rel */
@@ -2985,11 +2994,12 @@ writeback:
 	c->dst.type = saved_dst_type;
 
 	if ((c->d & SrcMask) == SrcSI)
-		string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
-				&c->src);
+		string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
+				VCPU_REGS_RSI, &c->src);
 
 	if ((c->d & DstMask) == DstDI)
-		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
+		string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
+				&c->dst);
 
 	if (c->rep_prefix && (c->d & String)) {
 		struct read_cache *rc = &ctxt->decode.io_read;
@@ -3188,7 +3198,7 @@ twobyte_insn:
 		c->dst.type = OP_NONE;
 		break;
 	case 0xa0:	  /* push fs */
-		emulate_push_sreg(ctxt, VCPU_SREG_FS);
+		emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
 		break;
 	case 0xa1:	 /* pop fs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3207,7 +3217,7 @@ twobyte_insn:
 		emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
 		break;
 	case 0xa8:	/* push gs */
-		emulate_push_sreg(ctxt, VCPU_SREG_GS);
+		emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
 		break;
 	case 0xa9:	/* pop gs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
-- 
cgit v1.2.3


From 0f12244fe70e8a94a491f6cd7ed70a352ab6c26c Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:31 +0300
Subject: KVM: x86 emulator: make set_cr() callback return error if it fails

Make set_cr() callback return error if it fails instead of injecting #GP
behind emulator's back.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |   2 +-
 arch/x86/kvm/emulate.c             |  10 ++-
 arch/x86/kvm/x86.c                 | 148 +++++++++++++++++++------------------
 3 files changed, 84 insertions(+), 76 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index df53ba2294b6..6c4f4918db5e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -135,7 +135,7 @@ struct x86_emulate_ops {
 	unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
 	void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
-	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
+	int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
 	int (*cpl)(struct kvm_vcpu *vcpu);
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 	int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index f56ec486393e..061f7d37c9f7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2272,7 +2272,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	int ret;
 
-	ops->set_cr(3, tss->cr3, ctxt->vcpu);
+	if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
+		kvm_inject_gp(ctxt->vcpu, 0);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
 	c->eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
 	c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -3135,7 +3138,10 @@ twobyte_insn:
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x22: /* mov reg, cr */
-		ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
+		if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
+			kvm_inject_gp(ctxt->vcpu, 0);
+			goto done;
+		}
 		c->dst.type = OP_NONE;
 		break;
 	case 0x23: /* mov from reg to dr */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9a469df6011c..64c6e7a31411 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -414,57 +414,49 @@ out:
 	return changed;
 }
 
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 	cr0 |= X86_CR0_ET;
 
 #ifdef CONFIG_X86_64
-	if (cr0 & 0xffffffff00000000UL) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if (cr0 & 0xffffffff00000000UL)
+		return 1;
 #endif
 
 	cr0 &= ~CR0_RESERVED_BITS;
 
-	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+		return 1;
 
-	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+		return 1;
 
 	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 #ifdef CONFIG_X86_64
 		if ((vcpu->arch.efer & EFER_LME)) {
 			int cs_db, cs_l;
 
-			if (!is_pae(vcpu)) {
-				kvm_inject_gp(vcpu, 0);
-				return;
-			}
+			if (!is_pae(vcpu))
+				return 1;
 			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-			if (cs_l) {
-				kvm_inject_gp(vcpu, 0);
-				return;
-
-			}
+			if (cs_l)
+				return 1;
 		} else
 #endif
-		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
-
+		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
+			return 1;
 	}
 
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 
 	kvm_mmu_reset_context(vcpu);
-	return;
+	return 0;
+}
+
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+{
+	if (__kvm_set_cr0(vcpu, cr0))
+		kvm_inject_gp(vcpu, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr0);
 
@@ -474,61 +466,56 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
 	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
 
-	if (cr4 & CR4_RESERVED_BITS) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if (cr4 & CR4_RESERVED_BITS)
+		return 1;
 
 	if (is_long_mode(vcpu)) {
-		if (!(cr4 & X86_CR4_PAE)) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
+		if (!(cr4 & X86_CR4_PAE))
+			return 1;
 	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 		   && ((cr4 ^ old_cr4) & pdptr_bits)
-		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+		   && !load_pdptrs(vcpu, vcpu->arch.cr3))
+		return 1;
+
+	if (cr4 & X86_CR4_VMXE)
+		return 1;
 
-	if (cr4 & X86_CR4_VMXE) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	vcpu->arch.cr4 = cr4;
 	kvm_mmu_reset_context(vcpu);
+
+	return 0;
+}
+
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+	if (__kvm_set_cr4(vcpu, cr4))
+		kvm_inject_gp(vcpu, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 		kvm_mmu_sync_roots(vcpu);
 		kvm_mmu_flush_tlb(vcpu);
-		return;
+		return 0;
 	}
 
 	if (is_long_mode(vcpu)) {
-		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
+		if (cr3 & CR3_L_MODE_RESERVED_BITS)
+			return 1;
 	} else {
 		if (is_pae(vcpu)) {
-			if (cr3 & CR3_PAE_RESERVED_BITS) {
-				kvm_inject_gp(vcpu, 0);
-				return;
-			}
-			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
-				kvm_inject_gp(vcpu, 0);
-				return;
-			}
+			if (cr3 & CR3_PAE_RESERVED_BITS)
+				return 1;
+			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
+				return 1;
 		}
 		/*
 		 * We don't check reserved bits in nonpae mode, because
@@ -546,24 +533,34 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	 * to debug) behavior on the guest side.
 	 */
 	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
+		return 1;
+	vcpu->arch.cr3 = cr3;
+	vcpu->arch.mmu.new_cr3(vcpu);
+	return 0;
+}
+
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+{
+	if (__kvm_set_cr3(vcpu, cr3))
 		kvm_inject_gp(vcpu, 0);
-	else {
-		vcpu->arch.cr3 = cr3;
-		vcpu->arch.mmu.new_cr3(vcpu);
-	}
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
-	if (cr8 & CR8_RESERVED_BITS) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if (cr8 & CR8_RESERVED_BITS)
+		return 1;
 	if (irqchip_in_kernel(vcpu->kvm))
 		kvm_lapic_set_tpr(vcpu, cr8);
 	else
 		vcpu->arch.cr8 = cr8;
+	return 0;
+}
+
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+	if (__kvm_set_cr8(vcpu, cr8))
+		kvm_inject_gp(vcpu, 0);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 
@@ -3681,27 +3678,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
 	return value;
 }
 
-static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
 {
+	int res = 0;
+
 	switch (cr) {
 	case 0:
-		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+		res = __kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
 		break;
 	case 2:
 		vcpu->arch.cr2 = val;
 		break;
 	case 3:
-		kvm_set_cr3(vcpu, val);
+		res = __kvm_set_cr3(vcpu, val);
 		break;
 	case 4:
-		kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+		res = __kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
 		break;
 	case 8:
-		kvm_set_cr8(vcpu, val & 0xfUL);
+		res = __kvm_set_cr8(vcpu, val & 0xfUL);
 		break;
 	default:
 		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+		res = -1;
 	}
+
+	return res;
 }
 
 static int emulator_get_cpl(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 338dbc9781eb5acd0b12809d95d4006135f29767 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:32 +0300
Subject: KVM: x86 emulator: make (get|set)_dr() callback return error if it
 fails

Make (get|set)_dr() callback return error if it fails instead of
injecting exception behind emulator's back.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 11 ++++++---
 arch/x86/kvm/x86.c     | 63 +++++++++++++++++++++++++++++---------------------
 2 files changed, 45 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 061f7d37c9f7..d5979ecc2521 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3151,9 +3151,14 @@ twobyte_insn:
 			goto done;
 		}
 
-		ops->set_dr(c->modrm_reg,c->regs[c->modrm_rm] &
-			    ((ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U),
-			ctxt->vcpu);
+		if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
+				((ctxt->mode == X86EMUL_MODE_PROT64) ?
+				 ~0ULL : ~0U), ctxt->vcpu) < 0) {
+			/* #UD condition is already handled by the code above */
+			kvm_inject_gp(ctxt->vcpu, 0);
+			goto done;
+		}
+
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 	case 0x30:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 64c6e7a31411..44a546b136fc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -573,7 +573,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 {
 	switch (dr) {
 	case 0 ... 3:
@@ -582,29 +582,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 			vcpu->arch.eff_db[dr] = val;
 		break;
 	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-			kvm_queue_exception(vcpu, UD_VECTOR);
-			return 1;
-		}
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+			return 1; /* #UD */
 		/* fall through */
 	case 6:
-		if (val & 0xffffffff00000000ULL) {
-			kvm_inject_gp(vcpu, 0);
-			return 1;
-		}
+		if (val & 0xffffffff00000000ULL)
+			return -1; /* #GP */
 		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
 		break;
 	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-			kvm_queue_exception(vcpu, UD_VECTOR);
-			return 1;
-		}
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+			return 1; /* #UD */
 		/* fall through */
 	default: /* 7 */
-		if (val & 0xffffffff00000000ULL) {
-			kvm_inject_gp(vcpu, 0);
-			return 1;
-		}
+		if (val & 0xffffffff00000000ULL)
+			return -1; /* #GP */
 		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
 			kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
@@ -615,28 +607,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 
 	return 0;
 }
+
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+	int res;
+
+	res = __kvm_set_dr(vcpu, dr, val);
+	if (res > 0)
+		kvm_queue_exception(vcpu, UD_VECTOR);
+	else if (res < 0)
+		kvm_inject_gp(vcpu, 0);
+
+	return res;
+}
 EXPORT_SYMBOL_GPL(kvm_set_dr);
 
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 {
 	switch (dr) {
 	case 0 ... 3:
 		*val = vcpu->arch.db[dr];
 		break;
 	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-			kvm_queue_exception(vcpu, UD_VECTOR);
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 			return 1;
-		}
 		/* fall through */
 	case 6:
 		*val = vcpu->arch.dr6;
 		break;
 	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-			kvm_queue_exception(vcpu, UD_VECTOR);
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 			return 1;
-		}
 		/* fall through */
 	default: /* 7 */
 		*val = vcpu->arch.dr7;
@@ -645,6 +646,15 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 
 	return 0;
 }
+
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+{
+	if (_kvm_get_dr(vcpu, dr, val)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+	return 0;
+}
 EXPORT_SYMBOL_GPL(kvm_get_dr);
 
 static inline u32 bit(int bitno)
@@ -3619,12 +3629,13 @@ int emulate_clts(struct kvm_vcpu *vcpu)
 
 int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
 {
-	return kvm_get_dr(vcpu, dr, dest);
+	return _kvm_get_dr(vcpu, dr, dest);
 }
 
 int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
 {
-	return kvm_set_dr(vcpu, dr, value);
+
+	return __kvm_set_dr(vcpu, dr, value);
 }
 
 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
-- 
cgit v1.2.3


From e680080e653b8c8725ca620bf22a5f8480f40cb5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:33 +0300
Subject: KVM: x86 emulator: fix X86EMUL_RETRY_INSTR and X86EMUL_CMPXCHG_FAILED
 values

Currently X86EMUL_PROPAGATE_FAULT, X86EMUL_RETRY_INSTR and
X86EMUL_CMPXCHG_FAILED have the same value so caller cannot
distinguish why function such as emulator_cmpxchg_emulated()
(which can return both X86EMUL_PROPAGATE_FAULT and
X86EMUL_CMPXCHG_FAILED) failed.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 6c4f4918db5e..0cf4311db0d6 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -51,8 +51,9 @@ struct x86_emulate_ctxt;
 #define X86EMUL_UNHANDLEABLE    1
 /* Terminate emulation but return success to the caller. */
 #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
-#define X86EMUL_RETRY_INSTR     2 /* retry the instruction for some reason */
-#define X86EMUL_CMPXCHG_FAILED  2 /* cmpxchg did not see expected value */
+#define X86EMUL_RETRY_INSTR     3 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED  4 /* cmpxchg did not see expected value */
+
 struct x86_emulate_ops {
 	/*
 	 * read_std: Read bytes of standard (non-emulated/special) memory.
-- 
cgit v1.2.3


From 411c35b7ef02aefb91e166ffeffad0891d955fcb Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:34 +0300
Subject: KVM: fill in run->mmio details in (read|write)_emulated function

Fill in run->mmio details in (read|write)_emulated function just like
pio does. There is no point in filling only vcpu fields there just to
copy them into vcpu->run a little bit later.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 44a546b136fc..b976c4c1fa8f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3386,9 +3386,10 @@ mmio:
 	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
 
 	vcpu->mmio_needed = 1;
-	vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->mmio_is_write = 0;
+	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
 
 	return X86EMUL_UNHANDLEABLE;
 }
@@ -3436,10 +3437,11 @@ mmio:
 		return X86EMUL_CONTINUE;
 
 	vcpu->mmio_needed = 1;
-	vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->mmio_is_write = 1;
-	memcpy(vcpu->mmio_data, val, bytes);
+	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
+	memcpy(vcpu->run->mmio.data, val, bytes);
 
 	return X86EMUL_CONTINUE;
 }
@@ -3850,7 +3852,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 {
 	int r, shadow_mask;
 	struct decode_cache *c;
-	struct kvm_run *run = vcpu->run;
 
 	kvm_clear_exception_queue(vcpu);
 	vcpu->arch.mmio_fault_cr2 = cr2;
@@ -3937,14 +3938,6 @@ restart:
 		return EMULATE_DO_MMIO;
 	}
 
-	if (r || vcpu->mmio_is_write) {
-		run->exit_reason = KVM_EXIT_MMIO;
-		run->mmio.phys_addr = vcpu->mmio_phys_addr;
-		memcpy(run->mmio.data, vcpu->mmio_data, 8);
-		run->mmio.len = vcpu->mmio_size;
-		run->mmio.is_write = vcpu->mmio_is_write;
-	}
-
 	if (r) {
 		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
 			goto done;
-- 
cgit v1.2.3


From c3cd7ffaf57ae6ead5b394cebaeb76164059a57f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:35 +0300
Subject: KVM: x86 emulator: x86_emulate_insn() return -1 only in case of
 emulation failure

Currently emulator returns -1 when emulation failed or IO is needed.
Caller tries to guess whether emulation failed by looking at other
variables. Make it easier for caller to recognise error condition by
always returning -1 in case of failure. For this new emulator
internal return value X86EMUL_IO_NEEDED is introduced. It is used to
distinguish between error condition (which returns X86EMUL_UNHANDLEABLE)
and condition that requires IO exit to userspace to continue emulation.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  1 +
 arch/x86/kvm/x86.c                 | 36 ++++++++++++++++++------------------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0cf4311db0d6..777240d4524e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -53,6 +53,7 @@ struct x86_emulate_ctxt;
 #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
 #define X86EMUL_RETRY_INSTR     3 /* retry the instruction for some reason */
 #define X86EMUL_CMPXCHG_FAILED  4 /* cmpxchg did not see expected value */
+#define X86EMUL_IO_NEEDED       5 /* IO is needed to complete emulation */
 
 struct x86_emulate_ops {
 	/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b976c4c1fa8f..4cb65d82abca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3275,7 +3275,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 		}
 		ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
 		if (ret < 0) {
-			r = X86EMUL_UNHANDLEABLE;
+			r = X86EMUL_IO_NEEDED;
 			goto out;
 		}
 
@@ -3331,7 +3331,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
 		}
 		ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
 		if (ret < 0) {
-			r = X86EMUL_UNHANDLEABLE;
+			r = X86EMUL_IO_NEEDED;
 			goto out;
 		}
 
@@ -3391,7 +3391,7 @@ mmio:
 	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
 	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
 
-	return X86EMUL_UNHANDLEABLE;
+	return X86EMUL_IO_NEEDED;
 }
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -3863,8 +3863,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	 */
 	cache_all_regs(vcpu);
 
-	vcpu->mmio_is_write = 0;
-
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 		int cs_db, cs_l;
 		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
@@ -3938,24 +3936,26 @@ restart:
 		return EMULATE_DO_MMIO;
 	}
 
-	if (r) {
-		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
-			goto done;
-		if (!vcpu->mmio_needed) {
-			++vcpu->stat.insn_emulation_fail;
-			trace_kvm_emulate_insn_failed(vcpu);
-			kvm_report_emulation_failure(vcpu, "mmio");
-			return EMULATE_FAIL;
-		}
+	if (vcpu->mmio_needed) {
+		if (vcpu->mmio_is_write)
+			vcpu->mmio_needed = 0;
 		return EMULATE_DO_MMIO;
 	}
 
-	if (vcpu->mmio_is_write) {
-		vcpu->mmio_needed = 0;
-		return EMULATE_DO_MMIO;
+	if (r) { /* emulation failed */
+		/*
+		 * if emulation was due to access to shadowed page table
+		 * and it failed try to unshadow page and re-entetr the
+		 * guest to let CPU execute the instruction.
+		 */
+		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+			return EMULATE_DONE;
+
+		trace_kvm_emulate_insn_failed(vcpu);
+		kvm_report_emulation_failure(vcpu, "mmio");
+		return EMULATE_FAIL;
 	}
 
-done:
 	if (vcpu->arch.exception.pending)
 		vcpu->arch.emulate_ctxt.restart = false;
 
-- 
cgit v1.2.3


From f181b96d4c769b8915849eb9070c18116fd8d44e Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:36 +0300
Subject: KVM: remove export of emulator_write_emulated()

It is not called directly outside of the file it's defined in anymore.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 5 -----
 arch/x86/kvm/x86.c              | 1 -
 2 files changed, 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 97774ae3c874..2ca1867ed97a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -628,11 +628,6 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
 void fx_init(struct kvm_vcpu *vcpu);
 
-int emulator_write_emulated(unsigned long addr,
-			    const void *val,
-			    unsigned int bytes,
-			    struct kvm_vcpu *vcpu);
-
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       const u8 *new, int bytes,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4cb65d82abca..15a4b754a451 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3465,7 +3465,6 @@ int emulator_write_emulated(unsigned long addr,
 	}
 	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
 }
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
 #define CMPXCHG_TYPE(t, ptr, old, new) \
 	(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
-- 
cgit v1.2.3


From 8fe681e984b6505d4d12125c0776399304803ec7 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:37 +0300
Subject: KVM: do not inject #PF in (read|write)_emulated() callbacks

Return error to x86 emulator instead of injection exception behind its back.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |  3 +++
 arch/x86/kvm/emulate.c             | 12 +++++++++++-
 arch/x86/kvm/x86.c                 | 28 ++++++++++++++--------------
 3 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 777240d4524e..b7e00cb21c6e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -94,6 +94,7 @@ struct x86_emulate_ops {
 	int (*read_emulated)(unsigned long addr,
 			     void *val,
 			     unsigned int bytes,
+			     unsigned int *error,
 			     struct kvm_vcpu *vcpu);
 
 	/*
@@ -106,6 +107,7 @@ struct x86_emulate_ops {
 	int (*write_emulated)(unsigned long addr,
 			      const void *val,
 			      unsigned int bytes,
+			      unsigned int *error,
 			      struct kvm_vcpu *vcpu);
 
 	/*
@@ -120,6 +122,7 @@ struct x86_emulate_ops {
 				const void *old,
 				const void *new,
 				unsigned int bytes,
+				unsigned int *error,
 				struct kvm_vcpu *vcpu);
 
 	int (*pio_in_emulated)(int size, unsigned short port, void *val,
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d5979ecc2521..d7a18a0f80a2 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1277,6 +1277,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 {
 	int rc;
 	struct read_cache *mc = &ctxt->decode.mem_read;
+	u32 err;
 
 	while (size) {
 		int n = min(size, 8u);
@@ -1284,7 +1285,10 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 		if (mc->pos < mc->end)
 			goto read_cached;
 
-		rc = ops->read_emulated(addr, mc->data + mc->end, n, ctxt->vcpu);
+		rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
+					ctxt->vcpu);
+		if (rc == X86EMUL_PROPAGATE_FAULT)
+			kvm_inject_page_fault(ctxt->vcpu, addr, err);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		mc->end += n;
@@ -1789,6 +1793,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 {
 	int rc;
 	struct decode_cache *c = &ctxt->decode;
+	u32 err;
 
 	switch (c->dst.type) {
 	case OP_REG:
@@ -1817,13 +1822,18 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 					&c->dst.orig_val,
 					&c->dst.val,
 					c->dst.bytes,
+					&err,
 					ctxt->vcpu);
 		else
 			rc = ops->write_emulated(
 					(unsigned long)c->dst.ptr,
 					&c->dst.val,
 					c->dst.bytes,
+					&err,
 					ctxt->vcpu);
+		if (rc == X86EMUL_PROPAGATE_FAULT)
+			kvm_inject_page_fault(ctxt->vcpu,
+					      (unsigned long)c->dst.ptr, err);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		break;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 15a4b754a451..51402d8a46fa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3346,10 +3346,10 @@ out:
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
 				  unsigned int bytes,
+				  unsigned int *error_code,
 				  struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
-	u32 error_code;
 
 	if (vcpu->mmio_read_completed) {
 		memcpy(val, vcpu->mmio_data, bytes);
@@ -3359,12 +3359,10 @@ static int emulator_read_emulated(unsigned long addr,
 		return X86EMUL_CONTINUE;
 	}
 
-	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
 
-	if (gpa == UNMAPPED_GVA) {
-		kvm_inject_page_fault(vcpu, addr, error_code);
+	if (gpa == UNMAPPED_GVA)
 		return X86EMUL_PROPAGATE_FAULT;
-	}
 
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3409,17 +3407,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int emulator_write_emulated_onepage(unsigned long addr,
 					   const void *val,
 					   unsigned int bytes,
+					   unsigned int *error_code,
 					   struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
-	u32 error_code;
 
-	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
+	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
 
-	if (gpa == UNMAPPED_GVA) {
-		kvm_inject_page_fault(vcpu, addr, error_code);
+	if (gpa == UNMAPPED_GVA)
 		return X86EMUL_PROPAGATE_FAULT;
-	}
 
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3449,6 +3445,7 @@ mmio:
 int emulator_write_emulated(unsigned long addr,
 			    const void *val,
 			    unsigned int bytes,
+			    unsigned int *error_code,
 			    struct kvm_vcpu *vcpu)
 {
 	/* Crossing a page boundary? */
@@ -3456,14 +3453,16 @@ int emulator_write_emulated(unsigned long addr,
 		int rc, now;
 
 		now = -addr & ~PAGE_MASK;
-		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+		rc = emulator_write_emulated_onepage(addr, val, now, error_code,
+						     vcpu);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		addr += now;
 		val += now;
 		bytes -= now;
 	}
-	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+	return emulator_write_emulated_onepage(addr, val, bytes, error_code,
+					       vcpu);
 }
 
 #define CMPXCHG_TYPE(t, ptr, old, new) \
@@ -3480,6 +3479,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 				     const void *old,
 				     const void *new,
 				     unsigned int bytes,
+				     unsigned int *error_code,
 				     struct kvm_vcpu *vcpu)
 {
 	gpa_t gpa;
@@ -3533,7 +3533,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 emul_write:
 	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
-	return emulator_write_emulated(addr, new, bytes, vcpu);
+	return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
 }
 
 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -4293,7 +4293,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
-	return emulator_write_emulated(rip, instruction, 3, vcpu);
+	return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
 }
 
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
-- 
cgit v1.2.3


From 3457e4192e367fd4e0da5e9f46f9df85fa99cd11 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:38 +0300
Subject: KVM: handle emulation failure case first

If emulation failed return immediately.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 51402d8a46fa..9e5a833f3392 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3924,22 +3924,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
 restart:
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
-	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
-
-	if (r == 0)
-		kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
-
-	if (vcpu->arch.pio.count) {
-		if (!vcpu->arch.pio.in)
-			vcpu->arch.pio.count = 0;
-		return EMULATE_DO_MMIO;
-	}
-
-	if (vcpu->mmio_needed) {
-		if (vcpu->mmio_is_write)
-			vcpu->mmio_needed = 0;
-		return EMULATE_DO_MMIO;
-	}
 
 	if (r) { /* emulation failed */
 		/*
@@ -3955,6 +3939,21 @@ restart:
 		return EMULATE_FAIL;
 	}
 
+	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
+	kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
+
+	if (vcpu->arch.pio.count) {
+		if (!vcpu->arch.pio.in)
+			vcpu->arch.pio.count = 0;
+		return EMULATE_DO_MMIO;
+	}
+
+	if (vcpu->mmio_needed) {
+		if (vcpu->mmio_is_write)
+			vcpu->mmio_needed = 0;
+		return EMULATE_DO_MMIO;
+	}
+
 	if (vcpu->arch.exception.pending)
 		vcpu->arch.emulate_ctxt.restart = false;
 
-- 
cgit v1.2.3


From 95c5588652f7742a21c33d9dcce0e043e057d04f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:39 +0300
Subject: KVM: x86 emulator: advance RIP outside x86 emulator code

Return new RIP as part of instruction emulation result instead of
updating KVM's RIP from x86 emulator code.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 7 ++++---
 arch/x86/kvm/x86.c     | 2 ++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d7a18a0f80a2..437f31bcffea 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2496,8 +2496,9 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 
 	if (rc == X86EMUL_CONTINUE) {
 		memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-		kvm_rip_write(ctxt->vcpu, c->eip);
 		rc = writeback(ctxt, ops);
+		if (rc == X86EMUL_CONTINUE)
+			ctxt->eip = c->eip;
 	}
 
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -2554,7 +2555,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
 		string_done:
 			ctxt->restart = false;
-			kvm_rip_write(ctxt->vcpu, c->eip);
+			ctxt->eip = c->eip;
 			goto done;
 		}
 		/* The second termination condition only applies for REPE
@@ -3032,7 +3033,7 @@ writeback:
 	ctxt->decode.mem_read.end = 0;
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-	kvm_rip_write(ctxt->vcpu, c->eip);
+	ctxt->eip = c->eip;
 	ops->set_rflags(ctxt->vcpu, ctxt->eflags);
 
 done:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9e5a833f3392..8f45cc712dda 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3941,6 +3941,7 @@ restart:
 
 	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
 	kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
+	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
 
 	if (vcpu->arch.pio.count) {
 		if (!vcpu->arch.pio.in)
@@ -4945,6 +4946,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 	if (ret)
 		return EMULATE_FAIL;
 
+	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 	return EMULATE_DONE;
 }
-- 
cgit v1.2.3


From ef050dc0390176ec6888f373edb776587c88be3d Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:40 +0300
Subject: KVM: x86 emulator: set RFLAGS outside x86 emulator code

Removes the need for set_flags() callback.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h | 1 -
 arch/x86/kvm/emulate.c             | 1 -
 arch/x86/kvm/x86.c                 | 7 +------
 3 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7e00cb21c6e..a87d95f09572 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -142,7 +142,6 @@ struct x86_emulate_ops {
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
 	int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
 	int (*cpl)(struct kvm_vcpu *vcpu);
-	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 	int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
 	int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
 	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 437f31bcffea..291e220c69a5 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3034,7 +3034,6 @@ writeback:
 	/* Commit shadow register state. */
 	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
 	ctxt->eip = c->eip;
-	ops->set_rflags(ctxt->vcpu, ctxt->eflags);
 
 done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8f45cc712dda..04ca343ee512 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3806,11 +3806,6 @@ static void emulator_set_segment_selector(u16 sel, int seg,
 	kvm_set_segment(vcpu, &kvm_seg, seg);
 }
 
-static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
-	kvm_x86_ops->set_rflags(vcpu, rflags);
-}
-
 static struct x86_emulate_ops emulate_ops = {
 	.read_std            = kvm_read_guest_virt_system,
 	.write_std           = kvm_write_guest_virt_system,
@@ -3829,7 +3824,6 @@ static struct x86_emulate_ops emulate_ops = {
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
 	.cpl                 = emulator_get_cpl,
-	.set_rflags          = emulator_set_rflags,
 	.get_dr              = emulator_get_dr,
 	.set_dr              = emulator_set_dr,
 	.set_msr             = kvm_set_msr,
@@ -3941,6 +3935,7 @@ restart:
 
 	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
 	kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
 
 	if (vcpu->arch.pio.count) {
-- 
cgit v1.2.3


From bdb475a323858101f4a5ad6a1a04b1dd8885325a Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:41 +0300
Subject: KVM: x86 emulator: use shadowed register in emulate_sysexit()

emulate_sysexit() should use shadowed registers copy instead of
looking into vcpu state directly.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 291e220c69a5..42cb7d71ff55 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2063,8 +2063,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
 	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
-	c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
-	c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+	c->eip = c->regs[VCPU_REGS_RDX];
+	c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
 
 	return X86EMUL_CONTINUE;
 }
-- 
cgit v1.2.3


From 4d2179e1e9cb74b25a8181a506600d96e15504fb Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:42 +0300
Subject: KVM: x86 emulator: handle shadowed registers outside emulator

Emulator shouldn't access vcpu directly.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 15 ---------------
 arch/x86/kvm/x86.c     | 14 ++++++++++++--
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 42cb7d71ff55..97a42e8c00d0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -941,12 +941,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	/* we cannot decode insn before we complete previous rep insn */
 	WARN_ON(ctxt->restart);
 
-	/* Shadow copy of register state. Committed on successful emulation. */
-	memset(c, 0, sizeof(struct decode_cache));
 	c->eip = ctxt->eip;
 	c->fetch.start = c->fetch.end = c->eip;
 	ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
-	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 
 	switch (mode) {
 	case X86EMUL_MODE_REAL:
@@ -2486,16 +2483,13 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	memset(c, 0, sizeof(struct decode_cache));
 	c->eip = ctxt->eip;
-	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 	c->dst.type = OP_NONE;
 
 	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
 				     has_error_code, error_code);
 
 	if (rc == X86EMUL_CONTINUE) {
-		memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
 		rc = writeback(ctxt, ops);
 		if (rc == X86EMUL_CONTINUE)
 			ctxt->eip = c->eip;
@@ -2525,13 +2519,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	ctxt->interruptibility = 0;
 	ctxt->decode.mem_read.pos = 0;
 
-	/* Shadow copy of register state. Committed on successful emulation.
-	 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
-	 * modify them.
-	 */
-
-	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
-
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
 		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
 		goto done;
@@ -3031,8 +3018,6 @@ writeback:
 	 * without decoding
 	 */
 	ctxt->decode.mem_read.end = 0;
-	/* Commit shadow register state. */
-	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
 	ctxt->eip = c->eip;
 
 done:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 04ca343ee512..21d36081a9d9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3844,7 +3844,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 			int emulation_type)
 {
 	int r, shadow_mask;
-	struct decode_cache *c;
+	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
 
 	kvm_clear_exception_queue(vcpu);
 	vcpu->arch.mmio_fault_cr2 = cr2;
@@ -3869,13 +3869,14 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 			? X86EMUL_MODE_VM86 : cs_l
 			? X86EMUL_MODE_PROT64 :	cs_db
 			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+		memset(c, 0, sizeof(struct decode_cache));
+		memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
 
 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 		trace_kvm_emulate_insn_start(vcpu);
 
 		/* Only allow emulation of specific instructions on #UD
 		 * (namely VMMCALL, sysenter, sysexit, syscall)*/
-		c = &vcpu->arch.emulate_ctxt.decode;
 		if (emulation_type & EMULTYPE_TRAP_UD) {
 			if (!c->twobyte)
 				return EMULATE_FAIL;
@@ -3916,6 +3917,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DONE;
 	}
 
+	/* this is needed for vmware backdor interface to work since it
+	   changes registers values  during IO operation */
+	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+
 restart:
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 
@@ -3936,6 +3941,7 @@ restart:
 	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
 	kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
 
 	if (vcpu->arch.pio.count) {
@@ -4919,6 +4925,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		    bool has_error_code, u32 error_code)
 {
+	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
 	int cs_db, cs_l, ret;
 	cache_all_regs(vcpu);
 
@@ -4933,6 +4940,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		? X86EMUL_MODE_VM86 : cs_l
 		? X86EMUL_MODE_PROT64 :	cs_db
 		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+	memset(c, 0, sizeof(struct decode_cache));
+	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
 
 	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
 				   tss_selector, reason, has_error_code,
@@ -4941,6 +4950,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 	if (ret)
 		return EMULATE_FAIL;
 
+	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 	return EMULATE_DONE;
-- 
cgit v1.2.3


From 95cb229530f329ec8002274891793be9c91385f7 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:43 +0300
Subject: KVM: x86 emulator: move interruptibility state tracking out of
 emulator

Emulator shouldn't access vcpu directly.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 19 ++-----------------
 arch/x86/kvm/x86.c     | 20 +++++++++++++++++---
 2 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 97a42e8c00d0..c40b40561dff 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1843,20 +1843,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 	return X86EMUL_CONTINUE;
 }
 
-static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
-{
-	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
-	/*
-	 * an sti; sti; sequence only disable interrupts for the first
-	 * instruction. So, if the last instruction, be it emulated or
-	 * not, left the system with the INT_STI flag enabled, it
-	 * means that the last instruction is an sti. We should not
-	 * leave the flag on in this case. The same goes for mov ss
-	 */
-	if (!(int_shadow & mask))
-		ctxt->interruptibility = mask;
-}
-
 static inline void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 			struct x86_emulate_ops *ops, struct desc_struct *cs,
@@ -2516,7 +2502,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	int rc = X86EMUL_CONTINUE;
 	int saved_dst_type = c->dst.type;
 
-	ctxt->interruptibility = 0;
 	ctxt->decode.mem_read.pos = 0;
 
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
@@ -2789,7 +2774,7 @@ special_insn:
 		}
 
 		if (c->modrm_reg == VCPU_SREG_SS)
-			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
+			ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
 
 		rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
 
@@ -2958,7 +2943,7 @@ special_insn:
 		if (emulator_bad_iopl(ctxt, ops))
 			kvm_inject_gp(ctxt->vcpu, 0);
 		else {
-			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
+			ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
 			ctxt->eflags |= X86_EFLAGS_IF;
 			c->dst.type = OP_NONE;	/* Disable writeback. */
 		}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 21d36081a9d9..91bfe7771f50 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3838,12 +3838,26 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
 	vcpu->arch.regs_dirty = ~0;
 }
 
+static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+{
+	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
+	/*
+	 * an sti; sti; sequence only disable interrupts for the first
+	 * instruction. So, if the last instruction, be it emulated or
+	 * not, left the system with the INT_STI flag enabled, it
+	 * means that the last instruction is an sti. We should not
+	 * leave the flag on in this case. The same goes for mov ss
+	 */
+	if (!(int_shadow & mask))
+		kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+}
+
 int emulate_instruction(struct kvm_vcpu *vcpu,
 			unsigned long cr2,
 			u16 error_code,
 			int emulation_type)
 {
-	int r, shadow_mask;
+	int r;
 	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
 
 	kvm_clear_exception_queue(vcpu);
@@ -3871,6 +3885,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 		memset(c, 0, sizeof(struct decode_cache));
 		memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+		vcpu->arch.emulate_ctxt.interruptibility = 0;
 
 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 		trace_kvm_emulate_insn_start(vcpu);
@@ -3938,8 +3953,7 @@ restart:
 		return EMULATE_FAIL;
 	}
 
-	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
-	kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
+	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
-- 
cgit v1.2.3


From 54b8486f469475d6c8e8aec917b91239a54eb8c8 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 28 Apr 2010 19:15:44 +0300
Subject: KVM: x86 emulator: do not inject exception directly into vcpu

Return exception as a result of instruction emulation and handle
injection in KVM code.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_emulate.h |   6 ++
 arch/x86/kvm/emulate.c             | 124 +++++++++++++++++++++++--------------
 arch/x86/kvm/x86.c                 |  20 +++++-
 3 files changed, 100 insertions(+), 50 deletions(-)

diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index a87d95f09572..51cfd730ac5d 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -216,6 +216,12 @@ struct x86_emulate_ctxt {
 	int interruptibility;
 
 	bool restart; /* restart string instruction after writeback */
+
+	int exception; /* exception that happens during emulation or -1 */
+	u32 error_code; /* error code for exception */
+	bool error_code_valid;
+	unsigned long cr2; /* faulted address in case of #PF */
+
 	/* decode cache */
 	struct decode_cache decode;
 };
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c40b40561dff..b43ac98ef790 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -653,6 +653,37 @@ static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
 	return seg_base(ctxt, ops, VCPU_SREG_SS);
 }
 
+static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+				      u32 error, bool valid)
+{
+	ctxt->exception = vec;
+	ctxt->error_code = error;
+	ctxt->error_code_valid = valid;
+	ctxt->restart = false;
+}
+
+static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+{
+	emulate_exception(ctxt, GP_VECTOR, err, true);
+}
+
+static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+		       int err)
+{
+	ctxt->cr2 = addr;
+	emulate_exception(ctxt, PF_VECTOR, err, true);
+}
+
+static void emulate_ud(struct x86_emulate_ctxt *ctxt)
+{
+	emulate_exception(ctxt, UD_VECTOR, 0, false);
+}
+
+static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+{
+	emulate_exception(ctxt, TS_VECTOR, err, true);
+}
+
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 			      struct x86_emulate_ops *ops,
 			      unsigned long eip, u8 *dest)
@@ -1285,7 +1316,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
 		rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
 					ctxt->vcpu);
 		if (rc == X86EMUL_PROPAGATE_FAULT)
-			kvm_inject_page_fault(ctxt->vcpu, addr, err);
+			emulate_pf(ctxt, addr, err);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 		mc->end += n;
@@ -1366,13 +1397,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
 	if (dt.size < index * 8 + 7) {
-		kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+		emulate_gp(ctxt, selector & 0xfffc);
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 	addr = dt.address + index * 8;
 	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT)
-		kvm_inject_page_fault(ctxt->vcpu, addr, err);
+		emulate_pf(ctxt, addr, err);
 
        return ret;
 }
@@ -1391,14 +1422,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
 	if (dt.size < index * 8 + 7) {
-		kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+		emulate_gp(ctxt, selector & 0xfffc);
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 
 	addr = dt.address + index * 8;
 	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT)
-		kvm_inject_page_fault(ctxt->vcpu, addr, err);
+		emulate_pf(ctxt, addr, err);
 
 	return ret;
 }
@@ -1517,7 +1548,7 @@ load:
 	ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
 	return X86EMUL_CONTINUE;
 exception:
-	kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
+	emulate_exception(ctxt, err_vec, err_code, true);
 	return X86EMUL_PROPAGATE_FAULT;
 }
 
@@ -1578,7 +1609,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 		break;
 	case X86EMUL_MODE_VM86:
 		if (iopl < 3) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 		}
 		change_mask |= EFLG_IF;
@@ -1829,7 +1860,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 					&err,
 					ctxt->vcpu);
 		if (rc == X86EMUL_PROPAGATE_FAULT)
-			kvm_inject_page_fault(ctxt->vcpu,
+			emulate_pf(ctxt,
 					      (unsigned long)c->dst.ptr, err);
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
@@ -1883,7 +1914,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	/* syscall is not available in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
 	    ctxt->mode == X86EMUL_MODE_VM86) {
-		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		emulate_ud(ctxt);
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 
@@ -1937,7 +1968,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
 	/* inject #GP if in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL) {
-		kvm_inject_gp(ctxt->vcpu, 0);
+		emulate_gp(ctxt, 0);
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 
@@ -1945,7 +1976,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	* Therefore, we inject an #UD.
 	*/
 	if (ctxt->mode == X86EMUL_MODE_PROT64) {
-		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		emulate_ud(ctxt);
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 
@@ -1955,13 +1986,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	switch (ctxt->mode) {
 	case X86EMUL_MODE_PROT32:
 		if ((msr_data & 0xfffc) == 0x0) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 		}
 		break;
 	case X86EMUL_MODE_PROT64:
 		if (msr_data == 0x0) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 		}
 		break;
@@ -2004,7 +2035,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	/* inject #GP if in real mode or Virtual 8086 mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
 	    ctxt->mode == X86EMUL_MODE_VM86) {
-		kvm_inject_gp(ctxt->vcpu, 0);
+		emulate_gp(ctxt, 0);
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 
@@ -2022,7 +2053,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	case X86EMUL_MODE_PROT32:
 		cs_sel = (u16)(msr_data + 16);
 		if ((msr_data & 0xfffc) == 0x0) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 		}
 		ss_sel = (u16)(msr_data + 24);
@@ -2030,7 +2061,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	case X86EMUL_MODE_PROT64:
 		cs_sel = (u16)(msr_data + 32);
 		if (msr_data == 0x0) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 		}
 		ss_sel = cs_sel + 8;
@@ -2192,7 +2223,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		emulate_pf(ctxt, old_tss_base, err);
 		return ret;
 	}
 
@@ -2202,7 +2233,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			     &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		emulate_pf(ctxt, old_tss_base, err);
 		return ret;
 	}
 
@@ -2210,7 +2241,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+		emulate_pf(ctxt, new_tss_base, err);
 		return ret;
 	}
 
@@ -2223,7 +2254,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 				     ctxt->vcpu, &err);
 		if (ret == X86EMUL_PROPAGATE_FAULT) {
 			/* FIXME: need to provide precise fault address */
-			kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+			emulate_pf(ctxt, new_tss_base, err);
 			return ret;
 		}
 	}
@@ -2266,7 +2297,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	int ret;
 
 	if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
-		kvm_inject_gp(ctxt->vcpu, 0);
+		emulate_gp(ctxt, 0);
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 	c->eip = tss->eip;
@@ -2334,7 +2365,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		emulate_pf(ctxt, old_tss_base, err);
 		return ret;
 	}
 
@@ -2344,7 +2375,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			     &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		emulate_pf(ctxt, old_tss_base, err);
 		return ret;
 	}
 
@@ -2352,7 +2383,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
-		kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+		emulate_pf(ctxt, new_tss_base, err);
 		return ret;
 	}
 
@@ -2365,7 +2396,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 				     ctxt->vcpu, &err);
 		if (ret == X86EMUL_PROPAGATE_FAULT) {
 			/* FIXME: need to provide precise fault address */
-			kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+			emulate_pf(ctxt, new_tss_base, err);
 			return ret;
 		}
 	}
@@ -2399,7 +2430,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	if (reason != TASK_SWITCH_IRET) {
 		if ((tss_selector & 3) > next_tss_desc.dpl ||
 		    ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 		}
 	}
@@ -2408,8 +2439,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	if (!next_tss_desc.p ||
 	    ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
 	     desc_limit < 0x2b)) {
-		kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
-				      tss_selector & 0xfffc);
+		emulate_ts(ctxt, tss_selector & 0xfffc);
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 
@@ -2505,19 +2535,19 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	ctxt->decode.mem_read.pos = 0;
 
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
-		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		emulate_ud(ctxt);
 		goto done;
 	}
 
 	/* LOCK prefix is allowed only with some instructions */
 	if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
-		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		emulate_ud(ctxt);
 		goto done;
 	}
 
 	/* Privileged instruction can be executed only in CPL=0 */
 	if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
-		kvm_inject_gp(ctxt->vcpu, 0);
+		emulate_gp(ctxt, 0);
 		goto done;
 	}
 
@@ -2679,7 +2709,7 @@ special_insn:
 		c->dst.bytes = min(c->dst.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
 					  c->dst.bytes)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			goto done;
 		}
 		if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
@@ -2691,7 +2721,7 @@ special_insn:
 		c->src.bytes = min(c->src.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
 					  c->src.bytes)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			goto done;
 		}
 		ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
@@ -2754,7 +2784,7 @@ special_insn:
 		goto mov;
 	case 0x8c:  /* mov r/m, sreg */
 		if (c->modrm_reg > VCPU_SREG_GS) {
-			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			emulate_ud(ctxt);
 			goto done;
 		}
 		c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
@@ -2769,7 +2799,7 @@ special_insn:
 
 		if (c->modrm_reg == VCPU_SREG_CS ||
 		    c->modrm_reg > VCPU_SREG_GS) {
-			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			emulate_ud(ctxt);
 			goto done;
 		}
 
@@ -2895,7 +2925,7 @@ special_insn:
 	do_io_in:
 		c->dst.bytes = min(c->dst.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			goto done;
 		}
 		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@@ -2908,7 +2938,7 @@ special_insn:
 	do_io_out:
 		c->dst.bytes = min(c->dst.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			goto done;
 		}
 		ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
@@ -2933,7 +2963,7 @@ special_insn:
 		break;
 	case 0xfa: /* cli */
 		if (emulator_bad_iopl(ctxt, ops))
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 		else {
 			ctxt->eflags &= ~X86_EFLAGS_IF;
 			c->dst.type = OP_NONE;	/* Disable writeback. */
@@ -2941,7 +2971,7 @@ special_insn:
 		break;
 	case 0xfb: /* sti */
 		if (emulator_bad_iopl(ctxt, ops))
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 		else {
 			ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
 			ctxt->eflags |= X86_EFLAGS_IF;
@@ -3069,7 +3099,7 @@ twobyte_insn:
 			c->dst.type = OP_NONE;
 			break;
 		case 5: /* not defined */
-			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			emulate_ud(ctxt);
 			goto done;
 		case 7: /* invlpg*/
 			emulate_invlpg(ctxt->vcpu, c->modrm_ea);
@@ -3102,7 +3132,7 @@ twobyte_insn:
 		case 1:
 		case 5 ... 7:
 		case 9 ... 15:
-			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			emulate_ud(ctxt);
 			goto done;
 		}
 		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3111,7 +3141,7 @@ twobyte_insn:
 	case 0x21: /* mov from dr to reg */
 		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
 		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
-			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			emulate_ud(ctxt);
 			goto done;
 		}
 		ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
@@ -3119,7 +3149,7 @@ twobyte_insn:
 		break;
 	case 0x22: /* mov reg, cr */
 		if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			goto done;
 		}
 		c->dst.type = OP_NONE;
@@ -3127,7 +3157,7 @@ twobyte_insn:
 	case 0x23: /* mov from reg to dr */
 		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
 		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
-			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			emulate_ud(ctxt);
 			goto done;
 		}
 
@@ -3135,7 +3165,7 @@ twobyte_insn:
 				((ctxt->mode == X86EMUL_MODE_PROT64) ?
 				 ~0ULL : ~0U), ctxt->vcpu) < 0) {
 			/* #UD condition is already handled by the code above */
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			goto done;
 		}
 
@@ -3146,7 +3176,7 @@ twobyte_insn:
 		msr_data = (u32)c->regs[VCPU_REGS_RAX]
 			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
 		if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			goto done;
 		}
 		rc = X86EMUL_CONTINUE;
@@ -3155,7 +3185,7 @@ twobyte_insn:
 	case 0x32:
 		/* rdmsr */
 		if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			goto done;
 		} else {
 			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 91bfe7771f50..63c87adcec48 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3852,6 +3852,17 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
 		kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
 }
 
+static void inject_emulated_exception(struct kvm_vcpu *vcpu)
+{
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+	if (ctxt->exception == PF_VECTOR)
+		kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
+	else if (ctxt->error_code_valid)
+		kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
+	else
+		kvm_queue_exception(vcpu, ctxt->exception);
+}
+
 int emulate_instruction(struct kvm_vcpu *vcpu,
 			unsigned long cr2,
 			u16 error_code,
@@ -3886,6 +3897,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		memset(c, 0, sizeof(struct decode_cache));
 		memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
 		vcpu->arch.emulate_ctxt.interruptibility = 0;
+		vcpu->arch.emulate_ctxt.exception = -1;
 
 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 		trace_kvm_emulate_insn_start(vcpu);
@@ -3958,6 +3970,11 @@ restart:
 	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
 	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
 
+	if (vcpu->arch.emulate_ctxt.exception >= 0) {
+		inject_emulated_exception(vcpu);
+		return EMULATE_DONE;
+	}
+
 	if (vcpu->arch.pio.count) {
 		if (!vcpu->arch.pio.in)
 			vcpu->arch.pio.count = 0;
@@ -3970,9 +3987,6 @@ restart:
 		return EMULATE_DO_MMIO;
 	}
 
-	if (vcpu->arch.exception.pending)
-		vcpu->arch.emulate_ctxt.restart = false;
-
 	if (vcpu->arch.emulate_ctxt.restart)
 		goto restart;
 
-- 
cgit v1.2.3


From d94e1dc9af60e3431a586c3edfbe42d8a0d3932b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 3 May 2010 16:54:48 +0300
Subject: KVM: Get rid of KVM_REQ_KICK

KVM_REQ_KICK poisons vcpu->requests by having a bit set during normal
operation.  This causes the fast path check for a clear vcpu->requests
to fail all the time, triggering tons of atomic operations.

Fix by replacing KVM_REQ_KICK with a vcpu->guest_mode atomic.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c       | 17 ++++++++++-------
 include/linux/kvm_host.h |  1 +
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63c87adcec48..fc5611b4007f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4604,13 +4604,15 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (vcpu->fpu_active)
 		kvm_load_guest_fpu(vcpu);
 
-	local_irq_disable();
+	atomic_set(&vcpu->guest_mode, 1);
+	smp_wmb();
 
-	clear_bit(KVM_REQ_KICK, &vcpu->requests);
-	smp_mb__after_clear_bit();
+	local_irq_disable();
 
-	if (vcpu->requests || need_resched() || signal_pending(current)) {
-		set_bit(KVM_REQ_KICK, &vcpu->requests);
+	if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
+	    || need_resched() || signal_pending(current)) {
+		atomic_set(&vcpu->guest_mode, 0);
+		smp_wmb();
 		local_irq_enable();
 		preempt_enable();
 		r = 1;
@@ -4655,7 +4657,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 
-	set_bit(KVM_REQ_KICK, &vcpu->requests);
+	atomic_set(&vcpu->guest_mode, 0);
+	smp_wmb();
 	local_irq_enable();
 
 	++vcpu->stat.exits;
@@ -5580,7 +5583,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 
 	me = get_cpu();
 	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
-		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+		if (atomic_xchg(&vcpu->guest_mode, 0))
 			smp_send_reschedule(cpu);
 	put_cpu();
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a0e019769f5d..2c62319727ef 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -81,6 +81,7 @@ struct kvm_vcpu {
 	int vcpu_id;
 	struct mutex mutex;
 	int   cpu;
+	atomic_t guest_mode;
 	struct kvm_run *run;
 	unsigned long requests;
 	unsigned long guest_debug;
-- 
cgit v1.2.3


From 3f10c846f8f9e6a32bbfeefaf7dda7ff51c7da29 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 5 May 2010 16:04:42 +0200
Subject: KVM: SVM: Dump vmcb contents on failed vmrun

This patch adds a function to dump the vmcb into the kernel
log and calls it after a failed vmrun to ease debugging.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce438e0fdd26..685cffff01f3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2726,6 +2726,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_NPF]				= pf_interception,
 };
 
+void dump_vmcb(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb_control_area *control = &svm->vmcb->control;
+	struct vmcb_save_area *save = &svm->vmcb->save;
+
+	pr_err("VMCB Control Area:\n");
+	pr_err("cr_read:            %04x\n", control->intercept_cr_read);
+	pr_err("cr_write:           %04x\n", control->intercept_cr_write);
+	pr_err("dr_read:            %04x\n", control->intercept_dr_read);
+	pr_err("dr_write:           %04x\n", control->intercept_dr_write);
+	pr_err("exceptions:         %08x\n", control->intercept_exceptions);
+	pr_err("intercepts:         %016llx\n", control->intercept);
+	pr_err("pause filter count: %d\n", control->pause_filter_count);
+	pr_err("iopm_base_pa:       %016llx\n", control->iopm_base_pa);
+	pr_err("msrpm_base_pa:      %016llx\n", control->msrpm_base_pa);
+	pr_err("tsc_offset:         %016llx\n", control->tsc_offset);
+	pr_err("asid:               %d\n", control->asid);
+	pr_err("tlb_ctl:            %d\n", control->tlb_ctl);
+	pr_err("int_ctl:            %08x\n", control->int_ctl);
+	pr_err("int_vector:         %08x\n", control->int_vector);
+	pr_err("int_state:          %08x\n", control->int_state);
+	pr_err("exit_code:          %08x\n", control->exit_code);
+	pr_err("exit_info1:         %016llx\n", control->exit_info_1);
+	pr_err("exit_info2:         %016llx\n", control->exit_info_2);
+	pr_err("exit_int_info:      %08x\n", control->exit_int_info);
+	pr_err("exit_int_info_err:  %08x\n", control->exit_int_info_err);
+	pr_err("nested_ctl:         %lld\n", control->nested_ctl);
+	pr_err("nested_cr3:         %016llx\n", control->nested_cr3);
+	pr_err("event_inj:          %08x\n", control->event_inj);
+	pr_err("event_inj_err:      %08x\n", control->event_inj_err);
+	pr_err("lbr_ctl:            %lld\n", control->lbr_ctl);
+	pr_err("next_rip:           %016llx\n", control->next_rip);
+	pr_err("VMCB State Save Area:\n");
+	pr_err("es:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->es.selector, save->es.attrib,
+		save->es.limit, save->es.base);
+	pr_err("cs:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->cs.selector, save->cs.attrib,
+		save->cs.limit, save->cs.base);
+	pr_err("ss:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->ss.selector, save->ss.attrib,
+		save->ss.limit, save->ss.base);
+	pr_err("ds:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->ds.selector, save->ds.attrib,
+		save->ds.limit, save->ds.base);
+	pr_err("fs:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->fs.selector, save->fs.attrib,
+		save->fs.limit, save->fs.base);
+	pr_err("gs:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->gs.selector, save->gs.attrib,
+		save->gs.limit, save->gs.base);
+	pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
+		save->gdtr.selector, save->gdtr.attrib,
+		save->gdtr.limit, save->gdtr.base);
+	pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
+		save->ldtr.selector, save->ldtr.attrib,
+		save->ldtr.limit, save->ldtr.base);
+	pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
+		save->idtr.selector, save->idtr.attrib,
+		save->idtr.limit, save->idtr.base);
+	pr_err("tr:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->tr.selector, save->tr.attrib,
+		save->tr.limit, save->tr.base);
+	pr_err("cpl:            %d                efer:         %016llx\n",
+		save->cpl, save->efer);
+	pr_err("cr0:            %016llx cr2:          %016llx\n",
+		save->cr0, save->cr2);
+	pr_err("cr3:            %016llx cr4:          %016llx\n",
+		save->cr3, save->cr4);
+	pr_err("dr6:            %016llx dr7:          %016llx\n",
+		save->dr6, save->dr7);
+	pr_err("rip:            %016llx rflags:       %016llx\n",
+		save->rip, save->rflags);
+	pr_err("rsp:            %016llx rax:          %016llx\n",
+		save->rsp, save->rax);
+	pr_err("star:           %016llx lstar:        %016llx\n",
+		save->star, save->lstar);
+	pr_err("cstar:          %016llx sfmask:       %016llx\n",
+		save->cstar, save->sfmask);
+	pr_err("kernel_gs_base: %016llx sysenter_cs:  %016llx\n",
+		save->kernel_gs_base, save->sysenter_cs);
+	pr_err("sysenter_esp:   %016llx sysenter_eip: %016llx\n",
+		save->sysenter_esp, save->sysenter_eip);
+	pr_err("gpat:           %016llx dbgctl:       %016llx\n",
+		save->g_pat, save->dbgctl);
+	pr_err("br_from:        %016llx br_to:        %016llx\n",
+		save->br_from, save->br_to);
+	pr_err("excp_from:      %016llx excp_to:      %016llx\n",
+		save->last_excp_from, save->last_excp_to);
+
+}
+
 static int handle_exit(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -2770,6 +2863,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		kvm_run->fail_entry.hardware_entry_failure_reason
 			= svm->vmcb->control.exit_code;
+		pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
+		dump_vmcb(vcpu);
 		return 0;
 	}
 
-- 
cgit v1.2.3


From eec4b140c924b4c650e9a89e01d223266490e325 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 5 May 2010 16:04:44 +0200
Subject: KVM: SVM: Allow EFER.LMSLE to be set with nested svm

This patch enables setting of efer bit 13 which is allowed
in all SVM capable processors. This is necessary for the
SLES11 version of Xen 4.0 to boot with nested svm.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/msr-index.h | 2 ++
 arch/x86/kvm/svm.c               | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 8c7ae4318629..509a42187dc2 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -20,6 +20,7 @@
 #define _EFER_LMA		10 /* Long mode active (read-only) */
 #define _EFER_NX		11 /* No execute enable */
 #define _EFER_SVME		12 /* Enable virtualization */
+#define _EFER_LMSLE		13 /* Long Mode Segment Limit Enable */
 #define _EFER_FFXSR		14 /* Enable Fast FXSAVE/FXRSTOR */
 
 #define EFER_SCE		(1<<_EFER_SCE)
@@ -27,6 +28,7 @@
 #define EFER_LMA		(1<<_EFER_LMA)
 #define EFER_NX			(1<<_EFER_NX)
 #define EFER_SVME		(1<<_EFER_SVME)
+#define EFER_LMSLE		(1<<_EFER_LMSLE)
 #define EFER_FFXSR		(1<<_EFER_FFXSR)
 
 /* Intel MSRs. Some also available on other CPUs */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 685cffff01f3..41fe0381a1ae 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -640,7 +640,7 @@ static __init int svm_hardware_setup(void)
 
 	if (nested) {
 		printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
-		kvm_enable_efer_bits(EFER_SVME);
+		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
 	}
 
 	for_each_possible_cpu(cpu) {
-- 
cgit v1.2.3


From f3b8c964a9a6cfef6d3ca778648d53947b9fd257 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Wed, 5 May 2010 09:09:21 +0800
Subject: KVM: MMU: mark page table dirty when a pte is actually modified

Sometime cmpxchg_gpte doesn't modify gpte, in such case, don't mark
page table page as dirty.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index c7f27779c998..5c8ac060442f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -177,10 +177,10 @@ walk:
 		if (!(pte & PT_ACCESSED_MASK)) {
 			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
 						       sizeof(pte));
-			mark_page_dirty(vcpu->kvm, table_gfn);
 			if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
 			    index, pte, pte|PT_ACCESSED_MASK))
 				goto walk;
+			mark_page_dirty(vcpu->kvm, table_gfn);
 			pte |= PT_ACCESSED_MASK;
 		}
 
@@ -217,11 +217,11 @@ walk:
 		bool ret;
 
 		trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
-		mark_page_dirty(vcpu->kvm, table_gfn);
 		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
 			    pte|PT_DIRTY_MASK);
 		if (ret)
 			goto walk;
+		mark_page_dirty(vcpu->kvm, table_gfn);
 		pte |= PT_DIRTY_MASK;
 		walker->ptes[walker->level - 1] = pte;
 	}
-- 
cgit v1.2.3


From 518c5a05e89a79e498c95c3e29f29bd236b3c972 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Wed, 5 May 2010 09:58:33 +0800
Subject: KVM: MMU: Fix debug output error in walk_addr()

Fix a debug output error in walk_addr

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 5c8ac060442f..15e379eaf3b0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -229,7 +229,7 @@ walk:
 	walker->pt_access = pt_access;
 	walker->pte_access = pte_access;
 	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
-		 __func__, (u64)pte, pt_access, pte_access);
+		 __func__, (u64)pte, pte_access, pt_access);
 	return 1;
 
 not_present:
-- 
cgit v1.2.3


From 54a4f0239f2e98bc0842818f611a4cf73bb7dd35 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Wed, 5 May 2010 09:03:49 +0800
Subject: KVM: MMU: make kvm_mmu_zap_page() return the number of pages it
 actually freed

Currently, kvm_mmu_zap_page() returning the number of freed children sp.
This might confuse the caller, because caller don't know the actual freed
number. Let's make kvm_mmu_zap_page() return the number of pages it actually
freed.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b666d8d106a9..be981b1f1881 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1504,6 +1504,8 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	if (sp->unsync)
 		kvm_unlink_unsync_page(kvm, sp);
 	if (!sp->root_count) {
+		/* Count self */
+		ret++;
 		hlist_del(&sp->hash_link);
 		kvm_mmu_free_page(kvm, sp);
 	} else {
@@ -1540,7 +1542,6 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 					    struct kvm_mmu_page, link);
 			used_pages -= kvm_mmu_zap_page(kvm, page);
-			used_pages--;
 		}
 		kvm_nr_mmu_pages = used_pages;
 		kvm->arch.n_free_mmu_pages = 0;
@@ -2941,7 +2942,7 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
 
 	page = container_of(kvm->arch.active_mmu_pages.prev,
 			    struct kvm_mmu_page, link);
-	return kvm_mmu_zap_page(kvm, page) + 1;
+	return kvm_mmu_zap_page(kvm, page);
 }
 
 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
-- 
cgit v1.2.3


From 47dbb84f9e04e3261585da384d96b8c63f3376e3 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Apr 2010 12:08:56 +0300
Subject: KVM: Document KVM_SET_IDENTITY_MAP ioctl

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kvm/api.txt | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index a237518e51b9..0f96e5223475 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -892,6 +892,24 @@ arguments.
 This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
 irqchip, the multiprocessing state must be maintained by userspace.
 
+4.39 KVM_SET_IDENTITY_MAP_ADDR
+
+Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR
+Architectures: x86
+Type: vm ioctl
+Parameters: unsigned long identity (in)
+Returns: 0 on success, -1 on error
+
+This ioctl defines the physical address of a one-page region in the guest
+physical address space.  The region must be within the first 4GB of the
+guest physical address space and must not conflict with any memory slot
+or any mmio address.  The guest may malfunction if it accesses this memory
+region.
+
+This ioctl is required on Intel-based hosts.  This is needed on Intel hardware
+because of a quirk in the virtualization implementation (see the internals
+documentation when it pops into existence).
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
-- 
cgit v1.2.3


From 57bc24cfd655c912498983130326b312e0404db1 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 29 Apr 2010 12:12:57 +0300
Subject: KVM: Document KVM_SET_BOOT_CPU_ID

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kvm/api.txt | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 0f96e5223475..159b4efe1b0e 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -910,6 +910,18 @@ This ioctl is required on Intel-based hosts.  This is needed on Intel hardware
 because of a quirk in the virtualization implementation (see the internals
 documentation when it pops into existence).
 
+4.40 KVM_SET_BOOT_CPU_ID
+
+Capability: KVM_CAP_SET_BOOT_CPU_ID
+Architectures: x86, ia64
+Type: vm ioctl
+Parameters: unsigned long vcpu_id
+Returns: 0 on success, -1 on error
+
+Define which vcpu is the Bootstrap Processor (BSP).  Values are the same
+as the vcpu id in KVM_CREATE_VCPU.  If this ioctl is not called, the default
+is vcpu 0.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
-- 
cgit v1.2.3


From 6d77dbfc88e37c9efd5c5dd18445cfe819ae17ea Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 10 May 2010 11:16:56 +0300
Subject: KVM: inject #UD if instruction emulation fails and exit to userspace

Do not kill VM when instruction emulation fails. Inject #UD and report
failure to userspace instead. Userspace may choose to reenter guest if
vcpu is in userspace (cpl == 3) in which case guest OS will kill
offending process and continue running.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/mmu.c              |  5 +----
 arch/x86/kvm/svm.c              | 10 +++-------
 arch/x86/kvm/vmx.c              | 28 ++++-----------------------
 arch/x86/kvm/x86.c              | 43 +++++++++++++++++------------------------
 5 files changed, 26 insertions(+), 61 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2ca1867ed97a..0c06148fa3b1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -576,7 +576,6 @@ enum emulation_result {
 #define EMULTYPE_SKIP		    (1 << 2)
 int emulate_instruction(struct kvm_vcpu *vcpu,
 			unsigned long cr2, u16 error_code, int emulation_type);
-void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index be981b1f1881..4a02dee1f2b5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2814,11 +2814,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
 		return 1;
 	case EMULATE_DO_MMIO:
 		++vcpu->stat.mmio_exits;
-		return 0;
+		/* fall through */
 	case EMULATE_FAIL:
-		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-		vcpu->run->internal.ndata = 0;
 		return 0;
 	default:
 		BUG();
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 41fe0381a1ae..134260c36ce2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1535,7 +1535,7 @@ static int io_interception(struct vcpu_svm *svm)
 	string = (io_info & SVM_IOIO_STR_MASK) != 0;
 	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
 	if (string || in)
-		return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+		return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
 
 	port = io_info >> 16;
 	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -2386,16 +2386,12 @@ static int iret_interception(struct vcpu_svm *svm)
 
 static int invlpg_interception(struct vcpu_svm *svm)
 {
-	if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
-		pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
-	return 1;
+	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
 }
 
 static int emulate_on_interception(struct vcpu_svm *svm)
 {
-	if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
-		pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
-	return 1;
+	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
 }
 
 static int cr8_write_interception(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 598931734251..a82cfa1e2a40 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3070,7 +3070,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
 	++vcpu->stat.io_exits;
 
 	if (string || in)
-		return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+		return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
 
 	port = exit_qualification >> 16;
 	size = (exit_qualification & 7) + 1;
@@ -3327,22 +3327,7 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu)
 
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
-	unsigned long exit_qualification;
-	enum emulation_result er;
-	unsigned long offset;
-
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-	offset = exit_qualification & 0xffful;
-
-	er = emulate_instruction(vcpu, 0, 0, 0);
-
-	if (er !=  EMULATE_DONE) {
-		printk(KERN_ERR
-		       "Fail to handle apic access vmexit! Offset is 0x%lx\n",
-		       offset);
-		return -ENOEXEC;
-	}
-	return 1;
+	return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
 }
 
 static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3554,13 +3539,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 			goto out;
 		}
 
-		if (err != EMULATE_DONE) {
-			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-			vcpu->run->internal.ndata = 0;
-			ret = 0;
-			goto out;
-		}
+		if (err != EMULATE_DONE)
+			return 0;
 
 		if (signal_pending(current))
 			goto out;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fc5611b4007f..ae9d6f3e5d0d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3639,24 +3639,6 @@ int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
 	return __kvm_set_dr(vcpu, dr, value);
 }
 
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
-{
-	u8 opcodes[4];
-	unsigned long rip = kvm_rip_read(vcpu);
-	unsigned long rip_linear;
-
-	if (!printk_ratelimit())
-		return;
-
-	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
-	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
-
-	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
-	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
-}
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
-
 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
 {
 	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
@@ -3863,6 +3845,19 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 		kvm_queue_exception(vcpu, ctxt->exception);
 }
 
+static int handle_emulation_failure(struct kvm_vcpu *vcpu)
+{
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+	++vcpu->stat.insn_emulation_fail;
+	trace_kvm_emulate_insn_failed(vcpu);
+	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+	vcpu->run->internal.ndata = 0;
+	kvm_queue_exception(vcpu, UD_VECTOR);
+	return EMULATE_FAIL;
+}
+
 int emulate_instruction(struct kvm_vcpu *vcpu,
 			unsigned long cr2,
 			u16 error_code,
@@ -3931,11 +3926,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
 		++vcpu->stat.insn_emulation;
 		if (r)  {
-			++vcpu->stat.insn_emulation_fail;
-			trace_kvm_emulate_insn_failed(vcpu);
 			if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
 				return EMULATE_DONE;
-			return EMULATE_FAIL;
+			if (emulation_type & EMULTYPE_SKIP)
+				return EMULATE_FAIL;
+			return handle_emulation_failure(vcpu);
 		}
 	}
 
@@ -3960,9 +3955,7 @@ restart:
 		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
 			return EMULATE_DONE;
 
-		trace_kvm_emulate_insn_failed(vcpu);
-		kvm_report_emulation_failure(vcpu, "mmio");
-		return EMULATE_FAIL;
+		return handle_emulation_failure(vcpu);
 	}
 
 	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
@@ -4798,7 +4791,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-		if (r == EMULATE_DO_MMIO) {
+		if (r != EMULATE_DONE) {
 			r = 0;
 			goto out;
 		}
-- 
cgit v1.2.3


From f0f5933a1626c8df7b0bfd227819c66320fb4f0f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 10 May 2010 12:09:56 +0300
Subject: KVM: MMU: Fix free memory accounting race in mmu_alloc_roots()

We drop the mmu lock between freeing memory and allocating the roots; this
allows some other vcpu to sneak in and allocate memory.

While the race is benign (resulting only in temporary overallocation, not oom)
it is simple and easy to fix by moving the freeing close to the allocation.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4a02dee1f2b5..d7aebafffdfe 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2094,6 +2094,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = 0;
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_free_some_pages(vcpu->kvm);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 				      PT64_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
@@ -2124,6 +2125,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = i << 30;
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_free_some_pages(vcpu->kvm);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
@@ -2496,9 +2498,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 		goto out;
-	spin_lock(&vcpu->kvm->mmu_lock);
-	kvm_mmu_free_some_pages(vcpu);
-	spin_unlock(&vcpu->kvm->mmu_lock);
 	r = mmu_alloc_roots(vcpu);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	mmu_sync_roots(vcpu);
-- 
cgit v1.2.3


From 6859762e8ae32ec258a671faf5c9fef07b25b83f Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Tue, 11 May 2010 14:36:58 +0800
Subject: KVM: update mmu documetation for role.nxe

There's no member "cr4_nxe" in struct kvm_mmu_page_role, it names "nxe" now.
Update mmu document.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/kvm/mmu.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt
index aaed6ab9d7ab..0e872ae30914 100644
--- a/Documentation/kvm/mmu.txt
+++ b/Documentation/kvm/mmu.txt
@@ -161,7 +161,7 @@ Shadow pages contain the following information:
   role.cr4_pae:
     Contains the value of cr4.pae for which the page is valid (e.g. whether
     32-bit or 64-bit gptes are in use).
-  role.cr4_nxe:
+  role.nxe:
     Contains the value of efer.nxe for which the page is valid.
   role.cr0_wp:
     Contains the value of cr0.wp for which the page is valid.
-- 
cgit v1.2.3


From 7725b89414836df492d6222b1d3cacb0ca576d77 Mon Sep 17 00:00:00 2001
From: Dongxiao Xu <dongxiao.xu@intel.com>
Date: Tue, 11 May 2010 18:29:38 +0800
Subject: KVM: VMX: Define new functions to wrapper direct call of asm code

Define vmcs_load() and kvm_cpu_vmxon() to avoid direct call of asm
code. Also move VMXE bit operation out of kvm_cpu_vmxoff().

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a82cfa1e2a40..823288821444 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -453,6 +453,19 @@ static void vmcs_clear(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 }
 
+static void vmcs_load(struct vmcs *vmcs)
+{
+	u64 phys_addr = __pa(vmcs);
+	u8 error;
+
+	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+			: "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+			: "cc", "memory");
+	if (error)
+		printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+		       vmcs, phys_addr);
+}
+
 static void __vcpu_clear(void *arg)
 {
 	struct vcpu_vmx *vmx = arg;
@@ -830,7 +843,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 phys_addr = __pa(vmx->vmcs);
 	u64 tsc_this, delta, new_offset;
 
 	if (vcpu->cpu != cpu) {
@@ -844,15 +856,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	}
 
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-		u8 error;
-
 		per_cpu(current_vmcs, cpu) = vmx->vmcs;
-		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
-			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
-			      : "cc");
-		if (error)
-			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
-			       vmx->vmcs, phys_addr);
+		vmcs_load(vmx->vmcs);
 	}
 
 	if (vcpu->cpu != cpu) {
@@ -1288,6 +1293,13 @@ static __init int vmx_disabled_by_bios(void)
 	/* locked but not enabled */
 }
 
+static void kvm_cpu_vmxon(u64 addr)
+{
+	asm volatile (ASM_VMX_VMXON_RAX
+			: : "a"(&addr), "m"(addr)
+			: "memory", "cc");
+}
+
 static int hardware_enable(void *garbage)
 {
 	int cpu = raw_smp_processor_id();
@@ -1310,9 +1322,7 @@ static int hardware_enable(void *garbage)
 		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
 	}
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
-	asm volatile (ASM_VMX_VMXON_RAX
-		      : : "a"(&phys_addr), "m"(phys_addr)
-		      : "memory", "cc");
+	kvm_cpu_vmxon(phys_addr);
 
 	ept_sync_global();
 
@@ -1336,13 +1346,13 @@ static void vmclear_local_vcpus(void)
 static void kvm_cpu_vmxoff(void)
 {
 	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
-	write_cr4(read_cr4() & ~X86_CR4_VMXE);
 }
 
 static void hardware_disable(void *garbage)
 {
 	vmclear_local_vcpus();
 	kvm_cpu_vmxoff();
+	write_cr4(read_cr4() & ~X86_CR4_VMXE);
 }
 
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
-- 
cgit v1.2.3


From 92fe13be74303a7b80dc3c99e22e12a87d41bd5f Mon Sep 17 00:00:00 2001
From: Dongxiao Xu <dongxiao.xu@intel.com>
Date: Tue, 11 May 2010 18:29:42 +0800
Subject: KVM: VMX: Some minor changes to code structure

Do some preparations for vmm coexistence support.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 823288821444..0d281dbc008f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -845,15 +845,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u64 tsc_this, delta, new_offset;
 
-	if (vcpu->cpu != cpu) {
+	if (vcpu->cpu != cpu)
 		vcpu_clear(vmx);
-		kvm_migrate_timers(vcpu);
-		set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
-		local_irq_disable();
-		list_add(&vmx->local_vcpus_link,
-			 &per_cpu(vcpus_on_cpu, cpu));
-		local_irq_enable();
-	}
 
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
 		per_cpu(current_vmcs, cpu) = vmx->vmcs;
@@ -864,6 +857,13 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		struct desc_ptr dt;
 		unsigned long sysenter_esp;
 
+		kvm_migrate_timers(vcpu);
+		set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
+		local_irq_disable();
+		list_add(&vmx->local_vcpus_link,
+			 &per_cpu(vcpus_on_cpu, cpu));
+		local_irq_enable();
+
 		vcpu->cpu = cpu;
 		/*
 		 * Linux uses per-cpu TSS and GDT, so set these when switching
-- 
cgit v1.2.3


From b923e62e4d48bc5242b32a6ef5ba0f886137668a Mon Sep 17 00:00:00 2001
From: Dongxiao Xu <dongxiao.xu@intel.com>
Date: Tue, 11 May 2010 18:29:45 +0800
Subject: KVM: VMX: VMCLEAR/VMPTRLD usage changes

Originally VMCLEAR/VMPTRLD is called on vcpu migration. To
support hosted VMM coexistance, VMCLEAR is executed on vcpu
schedule out, and VMPTRLD is executed on vcpu schedule in.
This could also eliminate the IPI when doing VMCLEAR.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 0d281dbc008f..9529bff04262 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -63,6 +63,9 @@ module_param_named(unrestricted_guest,
 static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
+static int __read_mostly vmm_exclusive = 1;
+module_param(vmm_exclusive, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
 	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
 #define KVM_GUEST_CR0_MASK						\
@@ -845,7 +848,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u64 tsc_this, delta, new_offset;
 
-	if (vcpu->cpu != cpu)
+	if (vmm_exclusive && vcpu->cpu != cpu)
 		vcpu_clear(vmx);
 
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
@@ -891,6 +894,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	__vmx_load_host_state(to_vmx(vcpu));
+	if (!vmm_exclusive)
+		__vcpu_clear(to_vmx(vcpu));
 }
 
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 4610c9cc6d9c84f7d585583699f04d5f51c83671 Mon Sep 17 00:00:00 2001
From: Dongxiao Xu <dongxiao.xu@intel.com>
Date: Tue, 11 May 2010 18:29:48 +0800
Subject: KVM: VMX: VMXON/VMXOFF usage changes

SDM suggests VMXON should be called before VMPTRLD, and VMXOFF
should be called after doing VMCLEAR.

Therefore in vmm coexistence case, we should firstly call VMXON
before any VMCS operation, and then call VMXOFF after the
operation is done.

Signed-off-by: Dongxiao Xu <dongxiao.xu@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 38 +++++++++++++++++++++++++++++++-------
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9529bff04262..b8aac4e98903 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -176,6 +176,8 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 
 static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
+static void kvm_cpu_vmxon(u64 addr);
+static void kvm_cpu_vmxoff(void);
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -847,8 +849,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u64 tsc_this, delta, new_offset;
+	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 
-	if (vmm_exclusive && vcpu->cpu != cpu)
+	if (!vmm_exclusive)
+		kvm_cpu_vmxon(phys_addr);
+	else if (vcpu->cpu != cpu)
 		vcpu_clear(vmx);
 
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
@@ -894,8 +899,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	__vmx_load_host_state(to_vmx(vcpu));
-	if (!vmm_exclusive)
+	if (!vmm_exclusive) {
 		__vcpu_clear(to_vmx(vcpu));
+		kvm_cpu_vmxoff();
+	}
 }
 
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -1327,9 +1334,11 @@ static int hardware_enable(void *garbage)
 		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
 	}
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
-	kvm_cpu_vmxon(phys_addr);
 
-	ept_sync_global();
+	if (vmm_exclusive) {
+		kvm_cpu_vmxon(phys_addr);
+		ept_sync_global();
+	}
 
 	return 0;
 }
@@ -1355,8 +1364,10 @@ static void kvm_cpu_vmxoff(void)
 
 static void hardware_disable(void *garbage)
 {
-	vmclear_local_vcpus();
-	kvm_cpu_vmxoff();
+	if (vmm_exclusive) {
+		vmclear_local_vcpus();
+		kvm_cpu_vmxoff();
+	}
 	write_cr4(read_cr4() & ~X86_CR4_VMXE);
 }
 
@@ -3991,6 +4002,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 
+static inline void vmcs_init(struct vmcs *vmcs)
+{
+	u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
+
+	if (!vmm_exclusive)
+		kvm_cpu_vmxon(phys_addr);
+
+	vmcs_clear(vmcs);
+
+	if (!vmm_exclusive)
+		kvm_cpu_vmxoff();
+}
+
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 	int err;
@@ -4016,7 +4040,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (!vmx->vmcs)
 		goto free_msrs;
 
-	vmcs_clear(vmx->vmcs);
+	vmcs_init(vmx->vmcs);
 
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
-- 
cgit v1.2.3


From 24955b6c906045382b67f3e6beba7e5df4a4a045 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 12 May 2010 21:00:35 -0300
Subject: KVM: pass correct parameter to kvm_mmu_free_some_pages

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d7aebafffdfe..a455c5eee370 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2094,7 +2094,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = 0;
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
-		kvm_mmu_free_some_pages(vcpu->kvm);
+		kvm_mmu_free_some_pages(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 				      PT64_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
@@ -2125,7 +2125,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = i << 30;
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
-		kvm_mmu_free_some_pages(vcpu->kvm);
+		kvm_mmu_free_some_pages(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
-- 
cgit v1.2.3


From dfb507c41d0d12fc69820abb7f040d31fcf015fe Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Tue, 11 May 2010 22:22:40 +0300
Subject: KVM: x86 emulator: Add test acc, imm instruction (opcodes 0xA8 -
 0xA9)

This adds test acc, imm instruction to the x86 emulator

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b43ac98ef790..35dd57c5a7fd 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -181,7 +181,7 @@ static u32 opcode_table[256] = {
 	ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
 	ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
 	/* 0xA8 - 0xAF */
-	0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
+	DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
 	ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
 	ByteOp | DstDI | String, DstDI | String,
 	/* 0xB0 - 0xB7 */
@@ -2754,6 +2754,7 @@ special_insn:
 		}
 		break;
 	case 0x84 ... 0x85:
+	test:
 		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
 		break;
 	case 0x86 ... 0x87:	/* xchg */
@@ -2852,6 +2853,8 @@ special_insn:
 		c->dst.type = OP_NONE; /* Disable writeback. */
 		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
 		goto cmp;
+	case 0xa8 ... 0xa9:	/* test ax, imm */
+		goto test;
 	case 0xaa ... 0xab:	/* stos */
 		c->dst.val = c->regs[VCPU_REGS_RAX];
 		break;
-- 
cgit v1.2.3


From abc190830f28a5bb678eaccb633de02ed2967d55 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Wed, 12 May 2010 01:39:21 +0300
Subject: KVM: x86 emulator: Add missing decoder flags for sub instruction

This adds missing decoder flags for sub instructions (opcodes 0x2c - 0x2d)

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 35dd57c5a7fd..1b974f80e1e4 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -126,7 +126,7 @@ static u32 opcode_table[256] = {
 	/* 0x28 - 0x2F */
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
+	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
 	/* 0x30 - 0x37 */
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-- 
cgit v1.2.3


From 222b7c52c33bdef721248bfeba992af495800d30 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Wed, 12 May 2010 01:39:22 +0300
Subject: KVM: x86 emulator: Add missing decoder flags for xor instructions

This adds missing decoder flags for xor instructions (opcodes 0x34 - 0x35)

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1b974f80e1e4..7a36eec8bab8 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -130,7 +130,7 @@ static u32 opcode_table[256] = {
 	/* 0x30 - 0x37 */
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
+	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
 	/* 0x38 - 0x3F */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-- 
cgit v1.2.3


From 62ad07551a2ace89e35604d1c55fdae1dd3359a8 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 12 May 2010 16:40:41 +0800
Subject: KVM: x86: Clean up duplicate assignment

mmu.free() already set root_hpa to INVALID_PAGE, no need to do it again in the
destory_kvm_mmu().

kvm_x86_ops->set_cr4() and set_efer() already assign cr4/efer to
vcpu->arch.cr4/efer, no need to do it again later.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 ++---
 arch/x86/kvm/x86.c | 4 +---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a455c5eee370..c075542648cd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2478,10 +2478,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 {
 	ASSERT(vcpu);
-	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+	if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		/* mmu.free() should set root_hpa = INVALID_PAGE */
 		vcpu->arch.mmu.free(vcpu);
-		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-	}
 }
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae9d6f3e5d0d..03039fd86980 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -486,7 +486,7 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		return 1;
 
 	kvm_x86_ops->set_cr4(vcpu, cr4);
-	vcpu->arch.cr4 = cr4;
+
 	kvm_mmu_reset_context(vcpu);
 
 	return 0;
@@ -721,8 +721,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 	kvm_x86_ops->set_efer(vcpu, efer);
 
-	vcpu->arch.efer = efer;
-
 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 	kvm_mmu_reset_context(vcpu);
 
-- 
cgit v1.2.3


From aad827034e419fa8c5ec39e6455266f0b942d856 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 12 May 2010 16:40:42 +0800
Subject: KVM: VMX: Only reset MMU when necessary

Only modifying some bits of CR0/CR4 needs paging mode switch.

Modify EFER.NXE bit would result in reserved bit updates.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03039fd86980..78147f0421a0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -416,6 +416,10 @@ out:
 
 static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
+	unsigned long old_cr0 = kvm_read_cr0(vcpu);
+	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
+				    X86_CR0_CD | X86_CR0_NW;
+
 	cr0 |= X86_CR0_ET;
 
 #ifdef CONFIG_X86_64
@@ -449,7 +453,8 @@ static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 
-	kvm_mmu_reset_context(vcpu);
+	if ((cr0 ^ old_cr0) & update_bits)
+		kvm_mmu_reset_context(vcpu);
 	return 0;
 }
 
@@ -487,7 +492,8 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 
-	kvm_mmu_reset_context(vcpu);
+	if ((cr4 ^ old_cr4) & pdptr_bits)
+		kvm_mmu_reset_context(vcpu);
 
 	return 0;
 }
@@ -693,6 +699,8 @@ static u32 emulated_msrs[] = {
 
 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
+	u64 old_efer = vcpu->arch.efer;
+
 	if (efer & efer_reserved_bits)
 		return 1;
 
@@ -724,6 +732,10 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 	kvm_mmu_reset_context(vcpu);
 
+	/* Update reserved bits */
+	if ((efer ^ old_efer) & EFER_NX)
+		kvm_mmu_reset_context(vcpu);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 48bb09eee4e102544808c00f43bc40a4a2e43e50 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 12 May 2010 09:46:31 -0400
Subject: KVM: remove CAP_SYS_RAWIO requirement from kvm_vm_ioctl_assign_irq

Remove this check in an effort to allow kvm guests to run without
root privileges.  This capability check doesn't seem to add any
security since the device needs to have already been added via the
assign device ioctl and the io actually occurs through the pci
sysfs interface.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/assigned-dev.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 4d10b1e047f4..64672e2d43c3 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -448,9 +448,6 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 	struct kvm_assigned_dev_kernel *match;
 	unsigned long host_irq_type, guest_irq_type;
 
-	if (!capable(CAP_SYS_RAWIO))
-		return -EPERM;
-
 	if (!irqchip_in_kernel(kvm))
 		return r;
 
-- 
cgit v1.2.3


From e8ad9a707496c163312bcdd6aa3b90603d45dc9b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 13 May 2010 10:06:02 +0800
Subject: KVM: MMU: use proper cache object freeing function

Use kmem_cache_free to free objects allocated by kmem_cache_alloc.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c075542648cd..bb48b0ca5f8c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -305,10 +305,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 	return 0;
 }
 
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
+				  struct kmem_cache *cache)
 {
 	while (mc->nobjs)
-		kfree(mc->objects[--mc->nobjs]);
+		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
 }
 
 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -356,10 +357,11 @@ out:
 
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
-	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
-	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
 	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
-	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+				mmu_page_header_cache);
 }
 
 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -380,7 +382,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
 
 static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
 {
-	kfree(pc);
+	kmem_cache_free(pte_chain_cache, pc);
 }
 
 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -391,7 +393,7 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
 
 static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 {
-	kfree(rd);
+	kmem_cache_free(rmap_desc_cache, rd);
 }
 
 /*
@@ -898,7 +900,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	list_del(&sp->link);
 	__free_page(virt_to_page(sp->spt));
 	__free_page(virt_to_page(sp->gfns));
-	kfree(sp);
+	kmem_cache_free(mmu_page_header_cache, sp);
 	++kvm->arch.n_free_mmu_pages;
 }
 
-- 
cgit v1.2.3


From 6d74229f013ed8e4a00d74cfa7a3fa6a2315c467 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 13 May 2010 10:07:00 +0800
Subject: KVM: MMU: remove rmap before clear spte

Remove rmap before clear spte otherwise it will trigger BUG_ON() in
some functions such as rmap_write_protect().

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bb48b0ca5f8c..5c9d6df0113e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1813,6 +1813,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
 			ret = 1;
+			rmap_remove(vcpu->kvm, sptep);
 			spte = shadow_trap_nonpresent_pte;
 			goto set_pte;
 		}
-- 
cgit v1.2.3


From f55c3f419ab1f0a9d66f44ceeefe752975ae4233 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 13 May 2010 10:08:08 +0800
Subject: KVM: MMU: unalias gfn before sp->gfns[] comparison in sync_page

sp->gfns[] contain unaliased gfns, but gpte might contain pointer
to aliased region.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 15e379eaf3b0..22f13797f521 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -586,7 +586,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		unsigned pte_access;
 		pt_element_t gpte;
 		gpa_t pte_gpa;
-		gfn_t gfn = sp->gfns[i];
+		gfn_t gfn;
 
 		if (!is_shadow_present_pte(sp->spt[i]))
 			continue;
@@ -597,8 +597,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 					  sizeof(pt_element_t)))
 			return -EINVAL;
 
-		if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
-		    !(gpte & PT_ACCESSED_MASK)) {
+		gfn = gpte_to_gfn(gpte);
+		if (unalias_gfn(vcpu->kvm, gfn) != sp->gfns[i] ||
+		      !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
 			u64 nonpresent;
 
 			rmap_remove(vcpu->kvm, &sp->spt[i]);
-- 
cgit v1.2.3


From 1683b2416e4c514d30ff5844a06733d0444ee000 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Thu, 13 May 2010 10:09:57 +0800
Subject: KVM: x86: cleanup unused local variable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

fix:
 arch/x86/kvm/x86.c: In function ‘handle_emulation_failure’:
 arch/x86/kvm/x86.c:3844: warning: unused variable ‘ctxt’

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 78147f0421a0..b05321adfd2f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3857,8 +3857,6 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu)
 
 static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 {
-	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
-
 	++vcpu->stat.insn_emulation_fail;
 	trace_kvm_emulate_insn_failed(vcpu);
 	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-- 
cgit v1.2.3


From 2122ff5eab8faec853e43f6de886e8dc8f31e317 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 13 May 2010 11:25:04 +0300
Subject: KVM: move vcpu locking to dispatcher for generic vcpu ioctls

All vcpu ioctls need to be locked, so instead of locking each one specifically
we lock at the generic dispatcher.

This patch only updates generic ioctls and leaves arch specific ioctls alone.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c   | 11 -----------
 arch/powerpc/kvm/book3s.c  | 16 ----------------
 arch/powerpc/kvm/booke.c   | 10 ----------
 arch/powerpc/kvm/powerpc.c |  4 ----
 arch/s390/kvm/kvm-s390.c   | 16 ----------------
 arch/x86/kvm/x86.c         | 40 ++--------------------------------------
 virt/kvm/kvm_main.c        | 15 +++++++++++++++
 7 files changed, 17 insertions(+), 95 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 21b701374f72..91760e80e268 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -725,8 +725,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 	sigset_t sigsaved;
 
-	vcpu_load(vcpu);
-
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -748,7 +746,6 @@ out:
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	vcpu_put(vcpu);
 	return r;
 }
 
@@ -883,8 +880,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
 	int i;
 
-	vcpu_load(vcpu);
-
 	for (i = 0; i < 16; i++) {
 		vpd->vgr[i] = regs->vpd.vgr[i];
 		vpd->vbgr[i] = regs->vpd.vbgr[i];
@@ -931,8 +926,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu);
 	set_bit(KVM_REQ_RESUME, &vcpu->requests);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -1967,9 +1960,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	vcpu_load(vcpu);
 	mp_state->mp_state = vcpu->arch.mp_state;
-	vcpu_put(vcpu);
 	return 0;
 }
 
@@ -2000,10 +1991,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 {
 	int r = 0;
 
-	vcpu_load(vcpu);
 	vcpu->arch.mp_state = mp_state->mp_state;
 	if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
 		r = vcpu_reset(vcpu);
-	vcpu_put(vcpu);
 	return r;
 }
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b998abf1a63d..f6eac2f337d9 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1047,8 +1047,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
 
-	vcpu_load(vcpu);
-
 	regs->pc = kvmppc_get_pc(vcpu);
 	regs->cr = kvmppc_get_cr(vcpu);
 	regs->ctr = kvmppc_get_ctr(vcpu);
@@ -1069,8 +1067,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -1078,8 +1074,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
 
-	vcpu_load(vcpu);
-
 	kvmppc_set_pc(vcpu, regs->pc);
 	kvmppc_set_cr(vcpu, regs->cr);
 	kvmppc_set_ctr(vcpu, regs->ctr);
@@ -1099,8 +1093,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -1110,8 +1102,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	int i;
 
-	vcpu_load(vcpu);
-
 	sregs->pvr = vcpu->arch.pvr;
 
 	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
@@ -1131,8 +1121,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 		}
 	}
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -1142,8 +1130,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	int i;
 
-	vcpu_load(vcpu);
-
 	kvmppc_set_pvr(vcpu, sregs->pvr);
 
 	vcpu3s->sdr1 = sregs->u.s.sdr1;
@@ -1171,8 +1157,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	/* Flush the MMU after messing with the segments */
 	kvmppc_mmu_pte_flush(vcpu, 0, 0);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index a33ab8cc2ccc..b687f430f8c0 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -485,8 +485,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
 
-	vcpu_load(vcpu);
-
 	regs->pc = vcpu->arch.pc;
 	regs->cr = kvmppc_get_cr(vcpu);
 	regs->ctr = vcpu->arch.ctr;
@@ -507,8 +505,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -516,8 +512,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
 
-	vcpu_load(vcpu);
-
 	vcpu->arch.pc = regs->pc;
 	kvmppc_set_cr(vcpu, regs->cr);
 	vcpu->arch.ctr = regs->ctr;
@@ -537,8 +531,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -569,9 +561,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
 	int r;
 
-	vcpu_load(vcpu);
 	r = kvmppc_core_vcpu_translate(vcpu, tr);
-	vcpu_put(vcpu);
 	return r;
 }
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 9b8683f39e05..e0fae7a618ca 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -423,8 +423,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	int r;
 	sigset_t sigsaved;
 
-	vcpu_load(vcpu);
-
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -456,8 +454,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	vcpu_put(vcpu);
-
 	return r;
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index ae3705816878..0d0e0add0201 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -371,55 +371,43 @@ static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-	vcpu_load(vcpu);
 	memcpy(&vcpu->arch.guest_gprs, &regs->gprs, sizeof(regs->gprs));
-	vcpu_put(vcpu);
 	return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-	vcpu_load(vcpu);
 	memcpy(&regs->gprs, &vcpu->arch.guest_gprs, sizeof(regs->gprs));
-	vcpu_put(vcpu);
 	return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
-	vcpu_load(vcpu);
 	memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs));
 	memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
-	vcpu_put(vcpu);
 	return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
-	vcpu_load(vcpu);
 	memcpy(&sregs->acrs, &vcpu->arch.guest_acrs, sizeof(sregs->acrs));
 	memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs));
-	vcpu_put(vcpu);
 	return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	vcpu_load(vcpu);
 	memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
 	vcpu->arch.guest_fpregs.fpc = fpu->fpc;
-	vcpu_put(vcpu);
 	return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	vcpu_load(vcpu);
 	memcpy(&fpu->fprs, &vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs));
 	fpu->fpc = vcpu->arch.guest_fpregs.fpc;
-	vcpu_put(vcpu);
 	return 0;
 }
 
@@ -498,8 +486,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int rc;
 	sigset_t sigsaved;
 
-	vcpu_load(vcpu);
-
 rerun_vcpu:
 	if (vcpu->requests)
 		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -568,8 +554,6 @@ rerun_vcpu:
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	vcpu_put(vcpu);
-
 	vcpu->stat.exit_userspace++;
 	return rc;
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b05321adfd2f..5acd21245fc7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4773,8 +4773,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 	sigset_t sigsaved;
 
-	vcpu_load(vcpu);
-
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -4815,14 +4813,11 @@ out:
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	vcpu_put(vcpu);
 	return r;
 }
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-	vcpu_load(vcpu);
-
 	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
 	regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4845,15 +4840,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->rip = kvm_rip_read(vcpu);
 	regs->rflags = kvm_get_rflags(vcpu);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-	vcpu_load(vcpu);
-
 	kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
 	kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
 	kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -4878,8 +4869,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 	vcpu->arch.exception.pending = false;
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -4898,8 +4887,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 {
 	struct desc_ptr dt;
 
-	vcpu_load(vcpu);
-
 	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
 	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
 	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -4931,26 +4918,20 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 		set_bit(vcpu->arch.interrupt.nr,
 			(unsigned long *)sregs->interrupt_bitmap);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	vcpu_load(vcpu);
 	mp_state->mp_state = vcpu->arch.mp_state;
-	vcpu_put(vcpu);
 	return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	vcpu_load(vcpu);
 	vcpu->arch.mp_state = mp_state->mp_state;
-	vcpu_put(vcpu);
 	return 0;
 }
 
@@ -4996,8 +4977,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	int pending_vec, max_bits;
 	struct desc_ptr dt;
 
-	vcpu_load(vcpu);
-
 	dt.size = sregs->idt.limit;
 	dt.address = sregs->idt.base;
 	kvm_x86_ops->set_idt(vcpu, &dt);
@@ -5057,8 +5036,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	    !is_protmode(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -5068,12 +5045,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	unsigned long rflags;
 	int i, r;
 
-	vcpu_load(vcpu);
-
 	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
 		r = -EBUSY;
 		if (vcpu->arch.exception.pending)
-			goto unlock_out;
+			goto out;
 		if (dbg->control & KVM_GUESTDBG_INJECT_DB)
 			kvm_queue_exception(vcpu, DB_VECTOR);
 		else
@@ -5115,8 +5090,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 
 	r = 0;
 
-unlock_out:
-	vcpu_put(vcpu);
+out:
 
 	return r;
 }
@@ -5152,7 +5126,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	gpa_t gpa;
 	int idx;
 
-	vcpu_load(vcpu);
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -5160,7 +5133,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	tr->valid = gpa != UNMAPPED_GVA;
 	tr->writeable = 1;
 	tr->usermode = 0;
-	vcpu_put(vcpu);
 
 	return 0;
 }
@@ -5169,8 +5141,6 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
 
-	vcpu_load(vcpu);
-
 	memcpy(fpu->fpr, fxsave->st_space, 128);
 	fpu->fcw = fxsave->cwd;
 	fpu->fsw = fxsave->swd;
@@ -5180,8 +5150,6 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	fpu->last_dp = fxsave->rdp;
 	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -5189,8 +5157,6 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
 
-	vcpu_load(vcpu);
-
 	memcpy(fxsave->st_space, fpu->fpr, 128);
 	fxsave->cwd = fpu->fcw;
 	fxsave->swd = fpu->fsw;
@@ -5200,8 +5166,6 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	fxsave->rdp = fpu->last_dp;
 	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 187aa8d984a7..e0fb0988a3fe 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1392,6 +1392,18 @@ static long kvm_vcpu_ioctl(struct file *filp,
 
 	if (vcpu->kvm->mm != current->mm)
 		return -EIO;
+
+#if defined(CONFIG_S390) || defined(CONFIG_PPC)
+	/*
+	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
+	 * so vcpu_load() would break it.
+	 */
+	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
+		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+#endif
+
+
+	vcpu_load(vcpu);
 	switch (ioctl) {
 	case KVM_RUN:
 		r = -EINVAL;
@@ -1566,9 +1578,12 @@ out_free2:
 		break;
 	}
 	default:
+		vcpu_put(vcpu);
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+		vcpu_load(vcpu);
 	}
 out:
+	vcpu_put(vcpu);
 	kfree(fpu);
 	kfree(kvm_sregs);
 	return r;
-- 
cgit v1.2.3


From 526b78ad1a9e66ef240ad7c757988de039e42229 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 13 May 2010 11:53:06 +0300
Subject: KVM: x86: Lock arch specific vcpu ioctls centrally

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 41 ++---------------------------------------
 1 file changed, 2 insertions(+), 39 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5acd21245fc7..999b017011f4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1541,16 +1541,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 {
 	int i, idx;
 
-	vcpu_load(vcpu);
-
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	for (i = 0; i < msrs->nmsrs; ++i)
 		if (do_msr(vcpu, entries[i].index, &entries[i].data))
 			break;
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
-	vcpu_put(vcpu);
-
 	return i;
 }
 
@@ -1798,7 +1794,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	if (copy_from_user(cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
 		goto out_free;
-	vcpu_load(vcpu);
 	for (i = 0; i < cpuid->nent; i++) {
 		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
 		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1816,7 +1811,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	r = 0;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
-	vcpu_put(vcpu);
 
 out_free:
 	vfree(cpuid_entries);
@@ -1837,11 +1831,9 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
 		goto out;
-	vcpu_load(vcpu);
 	vcpu->arch.cpuid_nent = cpuid->nent;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
-	vcpu_put(vcpu);
 	return 0;
 
 out:
@@ -1854,7 +1846,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 {
 	int r;
 
-	vcpu_load(vcpu);
 	r = -E2BIG;
 	if (cpuid->nent < vcpu->arch.cpuid_nent)
 		goto out;
@@ -1866,7 +1857,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 
 out:
 	cpuid->nent = vcpu->arch.cpuid_nent;
-	vcpu_put(vcpu);
 	return r;
 }
 
@@ -2098,9 +2088,7 @@ out:
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
-	vcpu_load(vcpu);
 	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-	vcpu_put(vcpu);
 
 	return 0;
 }
@@ -2108,11 +2096,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
-	vcpu_load(vcpu);
 	memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
 	kvm_apic_post_state_restore(vcpu);
 	update_cr8_intercept(vcpu);
-	vcpu_put(vcpu);
 
 	return 0;
 }
@@ -2124,20 +2110,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 		return -EINVAL;
 	if (irqchip_in_kernel(vcpu->kvm))
 		return -ENXIO;
-	vcpu_load(vcpu);
 
 	kvm_queue_interrupt(vcpu, irq->irq, false);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
 {
-	vcpu_load(vcpu);
 	kvm_inject_nmi(vcpu);
-	vcpu_put(vcpu);
 
 	return 0;
 }
@@ -2157,7 +2138,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	int r;
 	unsigned bank_num = mcg_cap & 0xff, bank;
 
-	vcpu_load(vcpu);
 	r = -EINVAL;
 	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
 		goto out;
@@ -2172,7 +2152,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	for (bank = 0; bank < bank_num; bank++)
 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 out:
-	vcpu_put(vcpu);
 	return r;
 }
 
@@ -2230,8 +2209,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 					       struct kvm_vcpu_events *events)
 {
-	vcpu_load(vcpu);
-
 	events->exception.injected =
 		vcpu->arch.exception.pending &&
 		!kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2256,8 +2233,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
 			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 			 | KVM_VCPUEVENT_VALID_SHADOW);
-
-	vcpu_put(vcpu);
 }
 
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2268,8 +2243,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 			      | KVM_VCPUEVENT_VALID_SHADOW))
 		return -EINVAL;
 
-	vcpu_load(vcpu);
-
 	vcpu->arch.exception.pending = events->exception.injected;
 	vcpu->arch.exception.nr = events->exception.nr;
 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2292,22 +2265,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
 		vcpu->arch.sipi_vector = events->sipi_vector;
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
 					     struct kvm_debugregs *dbgregs)
 {
-	vcpu_load(vcpu);
-
 	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
 	dbgregs->dr6 = vcpu->arch.dr6;
 	dbgregs->dr7 = vcpu->arch.dr7;
 	dbgregs->flags = 0;
-
-	vcpu_put(vcpu);
 }
 
 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2316,14 +2283,10 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 	if (dbgregs->flags)
 		return -EINVAL;
 
-	vcpu_load(vcpu);
-
 	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
 	vcpu->arch.dr6 = dbgregs->dr6;
 	vcpu->arch.dr7 = dbgregs->dr7;
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -2335,6 +2298,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	int r;
 	struct kvm_lapic_state *lapic = NULL;
 
+	vcpu_load(vcpu);
 	switch (ioctl) {
 	case KVM_GET_LAPIC: {
 		r = -EINVAL;
@@ -2481,9 +2445,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		if (copy_from_user(&mce, argp, sizeof mce))
 			goto out;
-		vcpu_load(vcpu);
 		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
-		vcpu_put(vcpu);
 		break;
 	}
 	case KVM_GET_VCPU_EVENTS: {
@@ -2534,6 +2496,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 	}
 out:
+	vcpu_put(vcpu);
 	kfree(lapic);
 	return r;
 }
-- 
cgit v1.2.3


From bc923cc93b7719576f20687e4cb07751601fbbb3 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 13 May 2010 12:21:46 +0300
Subject: KVM: s390: Centrally lock arch specific vcpu ioctls

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 40 +++++++++++++++++-----------------------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 0d0e0add0201..fd169f658bf8 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -363,9 +363,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 
 static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 {
-	vcpu_load(vcpu);
 	kvm_s390_vcpu_initial_reset(vcpu);
-	vcpu_put(vcpu);
 	return 0;
 }
 
@@ -415,14 +413,12 @@ static int kvm_arch_vcpu_ioctl_set_initial_psw(struct kvm_vcpu *vcpu, psw_t psw)
 {
 	int rc = 0;
 
-	vcpu_load(vcpu);
 	if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
 		rc = -EBUSY;
 	else {
 		vcpu->run->psw_mask = psw.mask;
 		vcpu->run->psw_addr = psw.addr;
 	}
-	vcpu_put(vcpu);
 	return rc;
 }
 
@@ -573,7 +569,7 @@ static int __guestcopy(struct kvm_vcpu *vcpu, u64 guestdest, const void *from,
  * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit
  * KVM_S390_STORE_STATUS_PREFIXED: -> prefix
  */
-int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
+static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
 {
 	const unsigned char archmode = 1;
 	int prefix;
@@ -635,45 +631,43 @@ int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
 	return 0;
 }
 
-static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-	int rc;
-
-	vcpu_load(vcpu);
-	rc = __kvm_s390_vcpu_store_status(vcpu, addr);
-	vcpu_put(vcpu);
-	return rc;
-}
-
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
+	long r;
 
-	switch (ioctl) {
-	case KVM_S390_INTERRUPT: {
+	if (ioctl == KVM_S390_INTERRUPT) {
 		struct kvm_s390_interrupt s390int;
 
 		if (copy_from_user(&s390int, argp, sizeof(s390int)))
 			return -EFAULT;
 		return kvm_s390_inject_vcpu(vcpu, &s390int);
 	}
+
+	vcpu_load(vcpu);
+	switch (ioctl) {
 	case KVM_S390_STORE_STATUS:
-		return kvm_s390_vcpu_store_status(vcpu, arg);
+		r = kvm_s390_vcpu_store_status(vcpu, arg);
+		break;
 	case KVM_S390_SET_INITIAL_PSW: {
 		psw_t psw;
 
+		r = -EFAULT;
 		if (copy_from_user(&psw, argp, sizeof(psw)))
-			return -EFAULT;
-		return kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw);
+			break;
+		r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw);
+		break;
 	}
 	case KVM_S390_INITIAL_RESET:
-		return kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+		r = kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+		break;
 	default:
-		;
+		r = -EINVAL;
 	}
-	return -EINVAL;
+	vcpu_put(vcpu);
+	return r;
 }
 
 /* Section: memory related */
-- 
cgit v1.2.3


From 19483d144023f7f4817dedafe26d5fe9ff2e7087 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 13 May 2010 12:30:43 +0300
Subject: KVM: PPC: Centralize locking of arch specific vcpu ioctls

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/powerpc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index e0fae7a618ca..caeed7b31bf9 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -512,15 +512,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	void __user *argp = (void __user *)arg;
 	long r;
 
-	switch (ioctl) {
-	case KVM_INTERRUPT: {
+	if (ioctl == KVM_INTERRUPT) {
 		struct kvm_interrupt irq;
 		r = -EFAULT;
 		if (copy_from_user(&irq, argp, sizeof(irq)))
-			goto out;
+			goto out_nolock;
 		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-		break;
+		goto out_nolock;
 	}
+
+	vcpu_load(vcpu);
+	switch (ioctl) {
 	case KVM_ENABLE_CAP:
 	{
 		struct kvm_enable_cap cap;
@@ -535,6 +537,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 
 out:
+	vcpu_put(vcpu);
+out_nolock:
 	return r;
 }
 
-- 
cgit v1.2.3


From 93736624635235cc5372ffca6d62816d02170724 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 13 May 2010 12:35:17 +0300
Subject: KVM: Consolidate arch specific vcpu ioctl locking

Now that all arch specific ioctls have centralized locking, it is easy to
move it to the central dispatcher.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/powerpc.c | 11 ++++-------
 arch/s390/kvm/kvm-s390.c   | 13 ++++++-------
 arch/x86/kvm/x86.c         |  2 --
 virt/kvm/kvm_main.c        |  2 --
 4 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index caeed7b31bf9..a1d87508892e 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -512,17 +512,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	void __user *argp = (void __user *)arg;
 	long r;
 
-	if (ioctl == KVM_INTERRUPT) {
+	switch (ioctl) {
+	case KVM_INTERRUPT: {
 		struct kvm_interrupt irq;
 		r = -EFAULT;
 		if (copy_from_user(&irq, argp, sizeof(irq)))
-			goto out_nolock;
+			goto out;
 		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-		goto out_nolock;
+		goto out;
 	}
 
-	vcpu_load(vcpu);
-	switch (ioctl) {
 	case KVM_ENABLE_CAP:
 	{
 		struct kvm_enable_cap cap;
@@ -537,8 +536,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 
 out:
-	vcpu_put(vcpu);
-out_nolock:
 	return r;
 }
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index fd169f658bf8..0cb0da7822d5 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -638,16 +638,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	void __user *argp = (void __user *)arg;
 	long r;
 
-	if (ioctl == KVM_S390_INTERRUPT) {
+	switch (ioctl) {
+	case KVM_S390_INTERRUPT: {
 		struct kvm_s390_interrupt s390int;
 
+		r = -EFAULT;
 		if (copy_from_user(&s390int, argp, sizeof(s390int)))
-			return -EFAULT;
-		return kvm_s390_inject_vcpu(vcpu, &s390int);
+			break;
+		r = kvm_s390_inject_vcpu(vcpu, &s390int);
+		break;
 	}
-
-	vcpu_load(vcpu);
-	switch (ioctl) {
 	case KVM_S390_STORE_STATUS:
 		r = kvm_s390_vcpu_store_status(vcpu, arg);
 		break;
@@ -666,7 +666,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	default:
 		r = -EINVAL;
 	}
-	vcpu_put(vcpu);
 	return r;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 999b017011f4..4c2096f30d90 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2298,7 +2298,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	int r;
 	struct kvm_lapic_state *lapic = NULL;
 
-	vcpu_load(vcpu);
 	switch (ioctl) {
 	case KVM_GET_LAPIC: {
 		r = -EINVAL;
@@ -2496,7 +2495,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 	}
 out:
-	vcpu_put(vcpu);
 	kfree(lapic);
 	return r;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e0fb0988a3fe..35532c964710 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1578,9 +1578,7 @@ out_free2:
 		break;
 	}
 	default:
-		vcpu_put(vcpu);
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
-		vcpu_load(vcpu);
 	}
 out:
 	vcpu_put(vcpu);
-- 
cgit v1.2.3


From 5ee481da7b62a992b91f958bf26aaaa92354c170 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 17 May 2010 17:22:23 +0800
Subject: x86: Export FPU API for KVM use

Also add some constants.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/i387.h  | 2 ++
 arch/x86/include/asm/xsave.h | 3 +++
 arch/x86/kernel/i387.c       | 3 ++-
 arch/x86/kernel/process.c    | 1 +
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index c991b3a7b904..815c5b2b9f57 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -482,6 +482,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src)
 	memcpy(dst->state, src->state, xstate_size);
 }
 
+extern void fpu_finit(struct fpu *fpu);
+
 #endif /* __ASSEMBLY__ */
 
 #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 2c4390cae228..29ee4e4c64cf 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -13,6 +13,9 @@
 
 #define FXSAVE_SIZE	512
 
+#define XSTATE_YMM_SIZE 256
+#define XSTATE_YMM_OFFSET (512 + 64)
+
 /*
  * These are the features that the OS can handle currently.
  */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 86cef6b32253..c4444bce8469 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -107,7 +107,7 @@ void __cpuinit fpu_init(void)
 }
 #endif	/* CONFIG_X86_64 */
 
-static void fpu_finit(struct fpu *fpu)
+void fpu_finit(struct fpu *fpu)
 {
 #ifdef CONFIG_X86_32
 	if (!HAVE_HWFP) {
@@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu)
 		fp->fos = 0xffff0000u;
 	}
 }
+EXPORT_SYMBOL_GPL(fpu_finit);
 
 /*
  * The _current_ task is using the FPU for the first time
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e7e35219b32f..ebcfcceccc72 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@ unsigned long idle_nomwait;
 EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
+EXPORT_SYMBOL_GPL(task_xstate_cachep);
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
-- 
cgit v1.2.3


From 7cf30855e02be7a207ffebb8b9350986f2ba83e9 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 17 May 2010 17:08:27 +0800
Subject: KVM: x86: Use unlazy_fpu() for host FPU

We can avoid unnecessary fpu load when userspace process
didn't use FPU frequently.

Derived from Avi's idea.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/x86.c              | 18 ++----------------
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0c06148fa3b1..d93601c52902 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -301,7 +301,6 @@ struct kvm_vcpu_arch {
 		unsigned long mmu_seq;
 	} update_pte;
 
-	struct i387_fxsave_struct host_fx_image;
 	struct i387_fxsave_struct guest_fx_image;
 
 	gva_t mmio_fault_cr2;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4c2096f30d90..54ce77582eda 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -52,6 +52,7 @@
 #include <asm/desc.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
+#include <asm/i387.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
@@ -5134,21 +5135,10 @@ void fx_init(struct kvm_vcpu *vcpu)
 {
 	unsigned after_mxcsr_mask;
 
-	/*
-	 * Touch the fpu the first time in non atomic context as if
-	 * this is the first fpu instruction the exception handler
-	 * will fire before the instruction returns and it'll have to
-	 * allocate ram with GFP_KERNEL.
-	 */
-	if (!used_math())
-		kvm_fx_save(&vcpu->arch.host_fx_image);
-
 	/* Initialize guest FPU by resetting ours and saving into guest's */
 	preempt_disable();
-	kvm_fx_save(&vcpu->arch.host_fx_image);
 	kvm_fx_finit();
 	kvm_fx_save(&vcpu->arch.guest_fx_image);
-	kvm_fx_restore(&vcpu->arch.host_fx_image);
 	preempt_enable();
 
 	vcpu->arch.cr0 |= X86_CR0_ET;
@@ -5165,7 +5155,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 		return;
 
 	vcpu->guest_fpu_loaded = 1;
-	kvm_fx_save(&vcpu->arch.host_fx_image);
+	unlazy_fpu(current);
 	kvm_fx_restore(&vcpu->arch.guest_fx_image);
 	trace_kvm_fpu(1);
 }
@@ -5177,7 +5167,6 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 
 	vcpu->guest_fpu_loaded = 0;
 	kvm_fx_save(&vcpu->arch.guest_fx_image);
-	kvm_fx_restore(&vcpu->arch.host_fx_image);
 	++vcpu->stat.fpu_reload;
 	set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
 	trace_kvm_fpu(0);
@@ -5203,9 +5192,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	int r;
 
-	/* We do fxsave: this must be aligned. */
-	BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
-
 	vcpu->arch.mtrr_state.have_fixed = 1;
 	vcpu_load(vcpu);
 	r = kvm_arch_vcpu_reset(vcpu);
-- 
cgit v1.2.3


From 98918833a3e21ffc5619535955e7a003cb788163 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 17 May 2010 17:08:28 +0800
Subject: KVM: x86: Use FPU API

Convert KVM to use generic FPU API.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 17 +-------------
 arch/x86/kvm/x86.c              | 52 +++++++++++++----------------------------
 2 files changed, 17 insertions(+), 52 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d93601c52902..d08bb4a202de 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -301,7 +301,7 @@ struct kvm_vcpu_arch {
 		unsigned long mmu_seq;
 	} update_pte;
 
-	struct i387_fxsave_struct guest_fx_image;
+	struct fpu guest_fpu;
 
 	gva_t mmio_fault_cr2;
 	struct kvm_pio_request pio;
@@ -708,21 +708,6 @@ static inline unsigned long read_msr(unsigned long msr)
 }
 #endif
 
-static inline void kvm_fx_save(struct i387_fxsave_struct *image)
-{
-	asm("fxsave (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
-{
-	asm("fxrstor (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_finit(void)
-{
-	asm("finit");
-}
-
 static inline u32 get_rdx_init_val(void)
 {
 	return 0x600; /* P6 family */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54ce77582eda..84b1788489de 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -53,6 +53,7 @@
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/i387.h>
+#include <asm/xcr.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
@@ -5057,27 +5058,6 @@ out:
 	return r;
 }
 
-/*
- * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
-	u16	cwd;
-	u16	swd;
-	u16	twd;
-	u16	fop;
-	u64	rip;
-	u64	rdp;
-	u32	mxcsr;
-	u32	mxcsr_mask;
-	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
-	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
-	u32	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
 /*
  * Translate a guest virtual address to a guest physical address.
  */
@@ -5101,7 +5081,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+	struct i387_fxsave_struct *fxsave =
+			&vcpu->arch.guest_fpu.state->fxsave;
 
 	memcpy(fpu->fpr, fxsave->st_space, 128);
 	fpu->fcw = fxsave->cwd;
@@ -5117,7 +5098,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
-	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
+	struct i387_fxsave_struct *fxsave =
+			&vcpu->arch.guest_fpu.state->fxsave;
 
 	memcpy(fxsave->st_space, fpu->fpr, 128);
 	fxsave->cwd = fpu->fcw;
@@ -5133,22 +5115,18 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 
 void fx_init(struct kvm_vcpu *vcpu)
 {
-	unsigned after_mxcsr_mask;
-
-	/* Initialize guest FPU by resetting ours and saving into guest's */
-	preempt_disable();
-	kvm_fx_finit();
-	kvm_fx_save(&vcpu->arch.guest_fx_image);
-	preempt_enable();
+	fpu_alloc(&vcpu->arch.guest_fpu);
+	fpu_finit(&vcpu->arch.guest_fpu);
 
 	vcpu->arch.cr0 |= X86_CR0_ET;
-	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
-	vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
-	memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
-	       0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
 }
 EXPORT_SYMBOL_GPL(fx_init);
 
+static void fx_free(struct kvm_vcpu *vcpu)
+{
+	fpu_free(&vcpu->arch.guest_fpu);
+}
+
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->guest_fpu_loaded)
@@ -5156,7 +5134,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 
 	vcpu->guest_fpu_loaded = 1;
 	unlazy_fpu(current);
-	kvm_fx_restore(&vcpu->arch.guest_fx_image);
+	fpu_restore_checking(&vcpu->arch.guest_fpu);
 	trace_kvm_fpu(1);
 }
 
@@ -5166,7 +5144,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 		return;
 
 	vcpu->guest_fpu_loaded = 0;
-	kvm_fx_save(&vcpu->arch.guest_fx_image);
+	fpu_save_init(&vcpu->arch.guest_fpu);
 	++vcpu->stat.fpu_reload;
 	set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
 	trace_kvm_fpu(0);
@@ -5179,6 +5157,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 		vcpu->arch.time_page = NULL;
 	}
 
+	fx_free(vcpu);
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
@@ -5213,6 +5192,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
 
+	fx_free(vcpu);
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
-- 
cgit v1.2.3


From 1d9dc7e000915b9607b480e34fcb4238b789fbb1 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 15 May 2010 18:51:24 +0800
Subject: KVM: MMU: split kvm_sync_page() function

Split kvm_sync_page() into kvm_sync_page() and kvm_sync_page_transient()
to clarify the code address Avi's suggestion

kvm_sync_page_transient() function only update shadow page but not mark
it sync and not write protect sp->gfn. it will be used by later patch

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5c9d6df0113e..ef5d140a2705 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1199,16 +1199,20 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 
 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			   bool clear_unsync)
 {
 	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 		return 1;
 	}
 
-	if (rmap_write_protect(vcpu->kvm, sp->gfn))
-		kvm_flush_remote_tlbs(vcpu->kvm);
-	kvm_unlink_unsync_page(vcpu->kvm, sp);
+	if (clear_unsync) {
+		if (rmap_write_protect(vcpu->kvm, sp->gfn))
+			kvm_flush_remote_tlbs(vcpu->kvm);
+		kvm_unlink_unsync_page(vcpu->kvm, sp);
+	}
+
 	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
 		kvm_mmu_zap_page(vcpu->kvm, sp);
 		return 1;
@@ -1218,6 +1222,23 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	return 0;
 }
 
+static void mmu_convert_notrap(struct kvm_mmu_page *sp);
+static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
+				   struct kvm_mmu_page *sp)
+{
+	int ret;
+
+	ret = __kvm_sync_page(vcpu, sp, false);
+	if (!ret)
+		mmu_convert_notrap(sp);
+	return ret;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	return __kvm_sync_page(vcpu, sp, true);
+}
+
 struct mmu_page_path {
 	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
 	unsigned int idx[PT64_ROOT_LEVEL-1];
-- 
cgit v1.2.3


From e02aa901b1aa41fb541521800cc2a4774c162485 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 15 May 2010 18:52:34 +0800
Subject: KVM: MMU: don't write-protect if have new mapping to unsync page

Two cases maybe happen in kvm_mmu_get_page() function:

- one case is, the goal sp is already in cache, if the sp is unsync,
  we only need update it to assure this mapping is valid, but not
  mark it sync and not write-protect sp->gfn since it not broke unsync
  rule(one shadow page for a gfn)

- another case is, the goal sp not existed, we need create a new sp
  for gfn, i.e, gfn (may)has another shadow page, to keep unsync rule,
  we should sync(mark sync and write-protect) gfn's unsync shadow page.
  After enabling multiple unsync shadows, we sync those shadow pages
  only when the new sp not allow to become unsync(also for the unsyc
  rule, the new rule is: allow all pte page become unsync)

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ef5d140a2705..064ddfbde108 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1337,7 +1337,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	unsigned index;
 	unsigned quadrant;
 	struct hlist_head *bucket;
-	struct kvm_mmu_page *sp;
+	struct kvm_mmu_page *sp, *unsync_sp = NULL;
 	struct hlist_node *node, *tmp;
 
 	role = vcpu->arch.mmu.base_role;
@@ -1356,20 +1356,30 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
 		if (sp->gfn == gfn) {
 			if (sp->unsync)
-				if (kvm_sync_page(vcpu, sp))
-					continue;
+				unsync_sp = sp;
 
 			if (sp->role.word != role.word)
 				continue;
 
+			if (!direct && unsync_sp &&
+			      kvm_sync_page_transient(vcpu, unsync_sp)) {
+				unsync_sp = NULL;
+				break;
+			}
+
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 			if (sp->unsync_children) {
 				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
 				kvm_mmu_mark_parents_unsync(sp);
-			}
+			} else if (sp->unsync)
+				kvm_mmu_mark_parents_unsync(sp);
+
 			trace_kvm_mmu_get_page(sp, false);
 			return sp;
 		}
+	if (!direct && unsync_sp)
+		kvm_sync_page(vcpu, unsync_sp);
+
 	++vcpu->kvm->stat.mmu_cache_miss;
 	sp = kvm_mmu_alloc_page(vcpu, parent_pte);
 	if (!sp)
-- 
cgit v1.2.3


From f78978aa3a8222f7822f15fba5dbaea990ef0887 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 15 May 2010 18:53:35 +0800
Subject: KVM: MMU: only update unsync page in invlpg path

Only unsync pages need updated at invlpg time since other shadow
pages are write-protected

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 22f13797f521..0671d7a29c3c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -461,6 +461,7 @@ out_unlock:
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	struct kvm_shadow_walk_iterator iterator;
+	struct kvm_mmu_page *sp;
 	gpa_t pte_gpa = -1;
 	int level;
 	u64 *sptep;
@@ -472,10 +473,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 		level = iterator.level;
 		sptep = iterator.sptep;
 
+		sp = page_header(__pa(sptep));
 		if (is_last_spte(*sptep, level)) {
-			struct kvm_mmu_page *sp = page_header(__pa(sptep));
 			int offset, shift;
 
+			if (!sp->unsync)
+				break;
+
 			shift = PAGE_SHIFT -
 				  (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
 			offset = sp->role.quadrant << shift;
@@ -493,7 +497,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			break;
 		}
 
-		if (!is_shadow_present_pte(*sptep))
+		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
 			break;
 	}
 
-- 
cgit v1.2.3


From 9fb2d2b4ff292a01ae30da003d1dc097917b0988 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 23 May 2010 14:28:26 +0300
Subject: KVM: SVM: correctly trace irq injection

On SVM interrupts are injected by svm_set_irq() not svm_inject_irq().
The later is used only to wait for irq window.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 134260c36ce2..f5c2b432078a 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2917,9 +2917,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 {
 	struct vmcb_control_area *control;
 
-	trace_kvm_inj_virq(irq);
-
-	++svm->vcpu.stat.irq_injections;
 	control = &svm->vmcb->control;
 	control->int_vector = irq;
 	control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -2933,6 +2930,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
 
 	BUG_ON(!(gif_set(svm)));
 
+	trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
+	++vcpu->stat.irq_injections;
+
 	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
 		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
 }
-- 
cgit v1.2.3


From 221d059d15f1c8bd070a63fd45cd8d2598af5f99 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 23 May 2010 18:37:00 +0300
Subject: KVM: Update Red Hat copyrights

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c     |  1 +
 arch/x86/kvm/i8254.c       |  1 +
 arch/x86/kvm/i8259.c       |  1 +
 arch/x86/kvm/irq.c         |  1 +
 arch/x86/kvm/lapic.c       |  1 +
 arch/x86/kvm/mmu.c         |  1 +
 arch/x86/kvm/paging_tmpl.h |  1 +
 arch/x86/kvm/svm.c         |  1 +
 arch/x86/kvm/timer.c       | 14 ++++++++++++++
 arch/x86/kvm/vmx.c         |  1 +
 arch/x86/kvm/x86.c         |  1 +
 virt/kvm/assigned-dev.c    |  2 +-
 virt/kvm/coalesced_mmio.c  |  1 +
 virt/kvm/eventfd.c         |  1 +
 virt/kvm/ioapic.c          |  1 +
 virt/kvm/iommu.c           |  2 ++
 virt/kvm/irq_comm.c        |  1 +
 virt/kvm/kvm_main.c        |  1 +
 18 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 7a36eec8bab8..a4c2dcd10326 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,6 +9,7 @@
  * privileged instructions:
  *
  * Copyright (C) 2006 Qumranet
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  *   Avi Kivity <avi@qumranet.com>
  *   Yaniv Kamay <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 0150affad25d..188d82762c1b 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,6 +5,7 @@
  * Copyright (c) 2006 Intel Corporation
  * Copyright (c) 2007 Keir Fraser, XenSource Inc
  * Copyright (c) 2008 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affilates.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 93825ff3338f..2c73f4493148 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,6 +3,7 @@
  *
  * Copyright (c) 2003-2004 Fabrice Bellard
  * Copyright (c) 2007 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affilates.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 96dfbb6ad2a9..0f4e488331c7 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,6 +1,7 @@
 /*
  * irq.c: API for in kernel interrupt controller
  * Copyright (c) 2007, Intel Corporation.
+ * Copyright 2009 Red Hat, Inc. and/or its affilates.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 1eb7a4ae0c9c..d8258a0060f8 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,6 +5,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2007 Novell
  * Copyright (C) 2007 Intel
+ * Copyright 2009 Red Hat, Inc. and/or its affilates.
  *
  * Authors:
  *   Dor Laor <dor.laor@qumranet.com>
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 064ddfbde108..25d3bb2543e2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,6 +7,7 @@
  * MMU support
  *
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 0671d7a29c3c..167f53357eef 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,6 +7,7 @@
  * MMU support
  *
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f5c2b432078a..02ea5cf4b1e5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,6 +4,7 @@
  * AMD SVM support
  *
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 4ddadb1a5ffe..564548fbb3d6 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -1,3 +1,17 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * timer support
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/hrtimer.h>
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b8aac4e98903..9c3ffc5fde44 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,6 +5,7 @@
  * machines without emulation or binary translation.
  *
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 84b1788489de..033b9c207f93 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,6 +6,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2008 Qumranet, Inc.
  * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index 64672e2d43c3..bf9d958d4d5b 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -1,7 +1,7 @@
 /*
  * Kernel-based Virtual Machine - device assignment support
  *
- * Copyright (C) 2006-9 Red Hat, Inc
+ * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 53850177163f..fc8487564d1f 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -2,6 +2,7 @@
  * KVM coalesced MMIO
  *
  * Copyright (c) 2008 Bull S.A.S.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  *  Author: Laurent Vivier <Laurent.Vivier@bull.net>
  *
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index b81f0ebbaaad..66cf65b510b1 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -2,6 +2,7 @@
  * kvm eventfd support - use eventfd objects to signal various KVM events
  *
  * Copyright 2009 Novell.  All Rights Reserved.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Author:
  *	Gregory Haskins <ghaskins@novell.com>
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 3500dee9cf2b..1149c60b198f 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -1,5 +1,6 @@
 /*
  *  Copyright (C) 2001  MandrakeSoft S.A.
+ *  Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  *    MandrakeSoft S.A.
  *    43, rue d'Aboukir
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 96048ee9e39e..673c88a8efe9 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -16,6 +16,8 @@
  *
  * Copyright (C) 2006-2008 Intel Corporation
  * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
  * Author: Allen M. Kay <allen.m.kay@intel.com>
  * Author: Weidong Han <weidong.han@intel.com>
  * Author: Ben-Ami Yassour <benami@il.ibm.com>
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index a0e88809e45e..52f412f16be3 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -17,6 +17,7 @@
  * Authors:
  *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
  *
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  */
 
 #include <linux/kvm_host.h>
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 35532c964710..4a71faa96dfd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -5,6 +5,7 @@
  * machines without emulation or binary translation.
  *
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
-- 
cgit v1.2.3


From 9cf5cf5ad43b293581e5b87678ea5783c06d1a41 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 24 May 2010 15:40:07 +0800
Subject: KVM: MMU: allow more page become unsync at gfn mapping time

In current code, shadow page can become asynchronous only if one
shadow page for a gfn, this rule is too strict, in fact, we can
let all last mapping page(i.e, it's the pte page) become unsync,
and sync them at invlpg or flush tlb time.

This patch allow more page become asynchronous at gfn mapping time

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 82 +++++++++++++++++++++++++-----------------------------
 1 file changed, 38 insertions(+), 44 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 25d3bb2543e2..ba119dae890e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1170,26 +1170,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 	return __mmu_unsync_walk(sp, pvec);
 }
 
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
-{
-	unsigned index;
-	struct hlist_head *bucket;
-	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
-
-	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry(sp, node, bucket, hash_link)
-		if (sp->gfn == gfn && !sp->role.direct
-		    && !sp->role.invalid) {
-			pgprintk("%s: found role %x\n",
-				 __func__, sp->role.word);
-			return sp;
-		}
-	return NULL;
-}
-
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	WARN_ON(!sp->unsync);
@@ -1759,47 +1739,61 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
 
-static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+	trace_kvm_mmu_unsync_page(sp);
+	++vcpu->kvm->stat.mmu_unsync;
+	sp->unsync = 1;
+
+	kvm_mmu_mark_parents_unsync(sp);
+	mmu_convert_notrap(sp);
+}
+
+static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 {
-	unsigned index;
 	struct hlist_head *bucket;
 	struct kvm_mmu_page *s;
 	struct hlist_node *node, *n;
+	unsigned index;
 
-	index = kvm_page_table_hashfn(sp->gfn);
+	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	/* don't unsync if pagetable is shadowed with multiple roles */
+
 	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
-		if (s->gfn != sp->gfn || s->role.direct)
+		if (s->gfn != gfn || s->role.direct || s->unsync ||
+		      s->role.invalid)
 			continue;
-		if (s->role.word != sp->role.word)
-			return 1;
+		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+		__kvm_unsync_page(vcpu, s);
 	}
-	trace_kvm_mmu_unsync_page(sp);
-	++vcpu->kvm->stat.mmu_unsync;
-	sp->unsync = 1;
-
-	kvm_mmu_mark_parents_unsync(sp);
-
-	mmu_convert_notrap(sp);
-	return 0;
 }
 
 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 				  bool can_unsync)
 {
-	struct kvm_mmu_page *shadow;
+	unsigned index;
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *s;
+	struct hlist_node *node, *n;
+	bool need_unsync = false;
+
+	index = kvm_page_table_hashfn(gfn);
+	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+		if (s->gfn != gfn || s->role.direct || s->role.invalid)
+			continue;
 
-	shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-	if (shadow) {
-		if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+		if (s->role.level != PT_PAGE_TABLE_LEVEL)
 			return 1;
-		if (shadow->unsync)
-			return 0;
-		if (can_unsync && oos_shadow)
-			return kvm_unsync_page(vcpu, shadow);
-		return 1;
+
+		if (!need_unsync && !s->unsync) {
+			if (!can_unsync || !oos_shadow)
+				return 1;
+			need_unsync = true;
+		}
 	}
+	if (need_unsync)
+		kvm_unsync_pages(vcpu, gfn);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 9f1a122f970dbef5ba3496587f39df5c1853083f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Mon, 24 May 2010 15:41:33 +0800
Subject: KVM: MMU: allow more page become unsync at getting sp time

Allow more page become asynchronous at getting sp time, if need create new
shadow page for gfn but it not allow unsync(level > 1), we should unsync all
gfn's unsync page

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 47 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ba119dae890e..07673487fd5d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1220,6 +1220,35 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	return __kvm_sync_page(vcpu, sp, true);
 }
 
+/* @gfn should be write-protected at the call site */
+static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+{
+	struct hlist_head *bucket;
+	struct kvm_mmu_page *s;
+	struct hlist_node *node, *n;
+	unsigned index;
+	bool flush = false;
+
+	index = kvm_page_table_hashfn(gfn);
+	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
+	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
+		if (s->gfn != gfn || !s->unsync || s->role.invalid)
+			continue;
+
+		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
+			(vcpu->arch.mmu.sync_page(vcpu, s))) {
+			kvm_mmu_zap_page(vcpu->kvm, s);
+			continue;
+		}
+		kvm_unlink_unsync_page(vcpu->kvm, s);
+		flush = true;
+	}
+
+	if (flush)
+		kvm_mmu_flush_tlb(vcpu);
+}
+
 struct mmu_page_path {
 	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
 	unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1318,8 +1347,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	unsigned index;
 	unsigned quadrant;
 	struct hlist_head *bucket;
-	struct kvm_mmu_page *sp, *unsync_sp = NULL;
+	struct kvm_mmu_page *sp;
 	struct hlist_node *node, *tmp;
+	bool need_sync = false;
 
 	role = vcpu->arch.mmu.base_role;
 	role.level = level;
@@ -1336,17 +1366,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 	hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
 		if (sp->gfn == gfn) {
-			if (sp->unsync)
-				unsync_sp = sp;
+			if (!need_sync && sp->unsync)
+				need_sync = true;
 
 			if (sp->role.word != role.word)
 				continue;
 
-			if (!direct && unsync_sp &&
-			      kvm_sync_page_transient(vcpu, unsync_sp)) {
-				unsync_sp = NULL;
+			if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
 				break;
-			}
 
 			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 			if (sp->unsync_children) {
@@ -1358,9 +1385,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			trace_kvm_mmu_get_page(sp, false);
 			return sp;
 		}
-	if (!direct && unsync_sp)
-		kvm_sync_page(vcpu, unsync_sp);
-
 	++vcpu->kvm->stat.mmu_cache_miss;
 	sp = kvm_mmu_alloc_page(vcpu, parent_pte);
 	if (!sp)
@@ -1371,6 +1395,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	if (!direct) {
 		if (rmap_write_protect(vcpu->kvm, gfn))
 			kvm_flush_remote_tlbs(vcpu->kvm);
+		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+			kvm_sync_pages(vcpu, gfn);
+
 		account_shadowed(vcpu->kvm, gfn);
 	}
 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
-- 
cgit v1.2.3


From c8174f7b35b3018c4c7b3237ed1c792e454fd5c3 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Mon, 24 May 2010 01:01:04 +0300
Subject: KVM: VMX: Add constant for invalid guest state exit reason

For the sake of completeness, this patch adds a symbolic
constant for VMX exit reason 0x21 (invalid guest state).

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/vmx.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9e6779f7cf2d..104cf86a7562 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -257,6 +257,7 @@ enum vmcs_field {
 #define EXIT_REASON_IO_INSTRUCTION      30
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE           32
+#define EXIT_REASON_INVALID_STATE	33
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
 #define EXIT_REASON_MONITOR_INSTRUCTION 39
 #define EXIT_REASON_PAUSE_INSTRUCTION   40
-- 
cgit v1.2.3


From 2032a93d66fa282ba0f2ea9152eeff9511fa9a96 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 26 May 2010 16:49:59 +0800
Subject: KVM: MMU: Don't allocate gfns page for direct mmu pages

When sp->role.direct is set, sp->gfns does not contain any essential
information, leaf sptes reachable from this sp are for a continuous
guest physical memory range (a linear range).
So sp->gfns[i] (if it was set) equals to sp->gfn + i. (PT_PAGE_TABLE_LEVEL)
Obviously, it is not essential information, we can calculate it when need.

It means we don't need sp->gfns when sp->role.direct=1,
Thus we can save one page usage for every kvm_mmu_page.

Note:
  Access to sp->gfns must be wrapped by kvm_mmu_page_get_gfn()
  or kvm_mmu_page_set_gfn().
  It is only exposed in FNAME(sync_page).

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kvm/mmu.txt  |  4 +++-
 arch/x86/kvm/mmu.c         | 38 +++++++++++++++++++++++++++++---------
 arch/x86/kvm/paging_tmpl.h |  3 +++
 3 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt
index 0e872ae30914..2201dcba92a1 100644
--- a/Documentation/kvm/mmu.txt
+++ b/Documentation/kvm/mmu.txt
@@ -180,7 +180,9 @@ Shadow pages contain the following information:
     guest pages as leaves.
   gfns:
     An array of 512 guest frame numbers, one for each present pte.  Used to
-    perform a reverse map from a pte to a gfn.
+    perform a reverse map from a pte to a gfn. When role.direct is set, any
+    element of this array can be calculated from the gfn field when used, in
+    this case, the array of gfns is not allocated. See role.direct and gfn.
   slot_bitmap:
     A bitmap containing one bit per memory slot.  If the page contains a pte
     mapping a page from memory slot n, then bit n of slot_bitmap will be set
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 07673487fd5d..f46b6c9aff27 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -397,6 +397,22 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 	kmem_cache_free(rmap_desc_cache, rd);
 }
 
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+	if (!sp->role.direct)
+		return sp->gfns[index];
+
+	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
+}
+
+static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
+{
+	if (sp->role.direct)
+		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
+	else
+		sp->gfns[index] = gfn;
+}
+
 /*
  * Return the pointer to the largepage write count for a given
  * gfn, handling slots that are not large page aligned.
@@ -547,7 +563,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 		return count;
 	gfn = unalias_gfn(vcpu->kvm, gfn);
 	sp = page_header(__pa(spte));
-	sp->gfns[spte - sp->spt] = gfn;
+	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 	if (!*rmapp) {
 		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -605,6 +621,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	struct kvm_rmap_desc *prev_desc;
 	struct kvm_mmu_page *sp;
 	pfn_t pfn;
+	gfn_t gfn;
 	unsigned long *rmapp;
 	int i;
 
@@ -616,7 +633,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 		kvm_set_pfn_accessed(pfn);
 	if (is_writable_pte(*spte))
 		kvm_set_pfn_dirty(pfn);
-	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
+	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
 	if (!*rmapp) {
 		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
 		BUG();
@@ -900,7 +918,8 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	ASSERT(is_empty_shadow_page(sp->spt));
 	list_del(&sp->link);
 	__free_page(virt_to_page(sp->spt));
-	__free_page(virt_to_page(sp->gfns));
+	if (!sp->role.direct)
+		__free_page(virt_to_page(sp->gfns));
 	kmem_cache_free(mmu_page_header_cache, sp);
 	++kvm->arch.n_free_mmu_pages;
 }
@@ -911,13 +930,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
 }
 
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
-					       u64 *parent_pte)
+					       u64 *parent_pte, int direct)
 {
 	struct kvm_mmu_page *sp;
 
 	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
 	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
-	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+	if (!direct)
+		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+						  PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
@@ -1386,7 +1407,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 			return sp;
 		}
 	++vcpu->kvm->stat.mmu_cache_miss;
-	sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+	sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
 	if (!sp)
 		return sp;
 	sp->gfn = gfn;
@@ -3403,7 +3424,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 
 	if (*sptep & PT_WRITABLE_MASK) {
 		rev_sp = page_header(__pa(sptep));
-		gfn = rev_sp->gfns[sptep - rev_sp->spt];
+		gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
 		if (!gfn_to_memslot(kvm, gfn)) {
 			if (!printk_ratelimit())
@@ -3417,8 +3438,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 			return;
 		}
 
-		rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
-				    rev_sp->role.level);
+		rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
 		if (!*rmapp) {
 			if (!printk_ratelimit())
 				return;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 167f53357eef..2ee7060a80a5 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -582,6 +582,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 	offset = nr_present = 0;
 
+	/* direct kvm_mmu_page can not be unsync. */
+	BUG_ON(sp->role.direct);
+
 	if (PTTYPE == 32)
 		offset = sp->role.quadrant << PT64_LEVEL_BITS;
 
-- 
cgit v1.2.3


From c9fa0b3bef9a0b117b3c3f958ec553c21f609a9f Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 26 May 2010 16:48:25 +0800
Subject: KVM: MMU: Calculate correct base gfn for direct non-DIR level

In Document/kvm/mmu.txt:
  gfn:
    Either the guest page table containing the translations shadowed by this
    page, or the base page frame for linear translations. See role.direct.

But in __direct_map(), the base gfn calculation is incorrect,
it does not calculate correctly when level=3 or 4.

Fix by using PT64_LVL_ADDR_MASK() which accounts for all levels correctly.

Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f46b6c9aff27..c0350be52c91 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2020,7 +2020,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 		}
 
 		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
-			pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+			u64 base_addr = iterator.addr;
+
+			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
+			pseudo_gfn = base_addr >> PAGE_SHIFT;
 			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
 					      iterator.level - 1,
 					      1, ACC_ALL, iterator.sptep);
-- 
cgit v1.2.3


From 3af1817a0d65e8c1317e8d23cfe8a91aa1d4a065 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 26 May 2010 16:48:19 +0800
Subject: KVM: MMU: calculate correct gfn for small host pages backing large
 guest pages

In Documentation/kvm/mmu.txt:
  gfn:
    Either the guest page table containing the translations shadowed by this
    page, or the base page frame for linear translations. See role.direct.

But in function FNAME(fetch)(), sp->gfn is incorrect when one of following
situations occurred:

 1) guest is 32bit paging and the guest PDE maps a 4-MByte page
    (backed by 4k host pages), FNAME(fetch)() miss handling the quadrant.

    And if guest use pse-36, "table_gfn = gpte_to_gfn(gw->ptes[level - delta]);"
    is incorrect.

 2) guest is long mode paging and the guest PDPTE maps a 1-GByte page
    (backed by 4k or 2M host pages).

So we fix it to suit to the document and suit to the code which
requires sp->gfn correct when sp->role.direct=1.

We use the goal mapping gfn(gw->gfn) to calculate the base page frame
for linear translations, it is simple and easy to be understood.

Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
Reported-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2ee7060a80a5..1f7f5dd8306f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -339,10 +339,13 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			direct = 1;
 			if (!is_dirty_gpte(gw->ptes[level - delta]))
 				access &= ~ACC_WRITE_MASK;
-			table_gfn = gpte_to_gfn(gw->ptes[level - delta]);
-			/* advance table_gfn when emulating 1gb pages with 4k */
-			if (delta == 0)
-				table_gfn += PT_INDEX(addr, level);
+			/*
+			 * It is a large guest pages backed by small host pages,
+			 * So we set @direct(@shadow_page->role.direct)=1, and
+			 * set @table_gfn(@shadow_page->gfn)=the base page frame
+			 * for linear translations.
+			 */
+			table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
 			access &= gw->pte_access;
 		} else {
 			direct = 0;
-- 
cgit v1.2.3


From 01c168ac3d6568fed0373d82bd2db2b9339aab16 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Thu, 27 May 2010 16:09:48 +0800
Subject: KVM: MMU: don't check PT_WRITABLE_MASK directly

Since we have is_writable_pte(), make use of it.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c0350be52c91..9f4be0114bce 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2990,7 +2990,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 		pt = sp->spt;
 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
 			/* avoid RMW */
-			if (pt[i] & PT_WRITABLE_MASK)
+			if (is_writable_pte(pt[i]))
 				pt[i] &= ~PT_WRITABLE_MASK;
 	}
 	kvm_flush_remote_tlbs(kvm);
@@ -3425,7 +3425,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	struct kvm_mmu_page *rev_sp;
 	gfn_t gfn;
 
-	if (*sptep & PT_WRITABLE_MASK) {
+	if (is_writable_pte(*sptep)) {
 		rev_sp = page_header(__pa(sptep));
 		gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
@@ -3474,7 +3474,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 
 			if (!(ent & PT_PRESENT_MASK))
 				continue;
-			if (!(ent & PT_WRITABLE_MASK))
+			if (!is_writable_pte(ent))
 				continue;
 			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
 		}
@@ -3508,7 +3508,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 
 		spte = rmap_next(vcpu->kvm, rmapp, NULL);
 		while (spte) {
-			if (*spte & PT_WRITABLE_MASK)
+			if (is_writable_pte(*spte))
 				printk(KERN_ERR "%s: (%s) shadow page has "
 				"writable mappings: gfn %lx role %x\n",
 			       __func__, audit_msg, sp->gfn,
-- 
cgit v1.2.3


From 6dc696d4ddf2181eefee361e1d24a49351aef1f6 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Wed, 26 May 2010 15:09:43 -1000
Subject: KVM: SVM: Fix EFER.LME being stripped

Must set VCPU register to be the guest notion of EFER even if that
setting is not valid on hardware.  This was masked by the set in
set_efer until 7657fd5ace88e8092f5f3a84117e093d7b893f26 broke that.
Fix is simply to set the VCPU register before stripping bits.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 02ea5cf4b1e5..9c68a650f57e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -286,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
 
 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
+	vcpu->arch.efer = efer;
 	if (!npt_enabled && !(efer & EFER_LMA))
 		efer &= ~EFER_LME;
 
 	to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
-	vcpu->arch.efer = efer;
 }
 
 static int is_external_interrupt(u32 info)
-- 
cgit v1.2.3


From 10ab25cd6bf7ee4e5a55d81f203f7dc1a855c27e Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Tue, 25 May 2010 16:01:50 +0200
Subject: KVM: x86: Propagate fpu_alloc errors

Memory allocation may fail. Propagate such errors.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Reviewed-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/svm.c              |  7 ++++++-
 arch/x86/kvm/vmx.c              |  4 +++-
 arch/x86/kvm/x86.c              | 11 +++++++++--
 4 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d08bb4a202de..0cd0f2923af5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -624,7 +624,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
-void fx_init(struct kvm_vcpu *vcpu);
+int fx_init(struct kvm_vcpu *vcpu);
 
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9c68a650f57e..2ae0c3923293 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -904,13 +904,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	svm->asid_generation = 0;
 	init_vmcb(svm);
 
-	fx_init(&svm->vcpu);
+	err = fx_init(&svm->vcpu);
+	if (err)
+		goto free_page4;
+
 	svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
 	if (kvm_vcpu_is_bsp(&svm->vcpu))
 		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
 	return &svm->vcpu;
 
+free_page4:
+	__free_page(hsave_page);
 free_page3:
 	__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
 free_page2:
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9c3ffc5fde44..e71c731433ee 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2659,7 +2659,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		msr |= MSR_IA32_APICBASE_BSP;
 	kvm_set_apic_base(&vmx->vcpu, msr);
 
-	fx_init(&vmx->vcpu);
+	ret = fx_init(&vmx->vcpu);
+	if (ret != 0)
+		goto out;
 
 	seg_setup(VCPU_SREG_CS);
 	/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 033b9c207f93..e6e0d7781af7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5114,12 +5114,19 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	return 0;
 }
 
-void fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu)
 {
-	fpu_alloc(&vcpu->arch.guest_fpu);
+	int err;
+
+	err = fpu_alloc(&vcpu->arch.guest_fpu);
+	if (err)
+		return err;
+
 	fpu_finit(&vcpu->arch.guest_fpu);
 
 	vcpu->arch.cr0 |= X86_CR0_ET;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(fx_init);
 
-- 
cgit v1.2.3


From 8184dd38e22fcaec664c2b98c382b85c26780e26 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 27 May 2010 14:22:51 +0300
Subject: KVM: MMU: Allow spte.w=1 for gpte.w=0 and cr0.wp=0 only in shadow
 mode

When tdp is enabled, the guest's cr0.wp shouldn't have any effect on spte
permissions.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9f4be0114bce..69d40a6e1e68 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1882,7 +1882,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	spte |= (u64)pfn << PAGE_SHIFT;
 
 	if ((pte_access & ACC_WRITE_MASK)
-	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+	    || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
+		&& !user_fault)) {
 
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
-- 
cgit v1.2.3


From ec87fe2afcbcc4f430554980ec3e408bae34229d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 27 May 2010 14:46:04 +0300
Subject: KVM: MMU: Document cr0.wp emulation

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/kvm/mmu.txt | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt
index 2201dcba92a1..1e7ecdd15bba 100644
--- a/Documentation/kvm/mmu.txt
+++ b/Documentation/kvm/mmu.txt
@@ -298,6 +298,25 @@ Host translation updates:
   - look up affected sptes through reverse map
   - drop (or update) translations
 
+Emulating cr0.wp
+================
+
+If tdp is not enabled, the host must keep cr0.wp=1 so page write protection
+works for the guest kernel, not guest guest userspace.  When the guest
+cr0.wp=1, this does not present a problem.  However when the guest cr0.wp=0,
+we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the
+semantics require allowing any guest kernel access plus user read access).
+
+We handle this by mapping the permissions to two possible sptes, depending
+on fault type:
+
+- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access,
+  disallows user access)
+- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel
+  write access)
+
+(user write faults generate a #PF)
+
 Further reading
 ===============
 
-- 
cgit v1.2.3


From 316b95216e277cdb22bd65346f9b9d9e10d0f53d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 27 May 2010 16:44:12 +0300
Subject: KVM: MMU: Document large pages

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/kvm/mmu.txt | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt
index 1e7ecdd15bba..8cb42b957c73 100644
--- a/Documentation/kvm/mmu.txt
+++ b/Documentation/kvm/mmu.txt
@@ -317,6 +317,29 @@ on fault type:
 
 (user write faults generate a #PF)
 
+Large pages
+===========
+
+The mmu supports all combinations of large and small guest and host pages.
+Supported page sizes include 4k, 2M, 4M, and 1G.  4M pages are treated as
+two separate 2M pages, on both guest and host, since the mmu always uses PAE
+paging.
+
+To instantiate a large spte, four constraints must be satisfied:
+
+- the spte must point to a large host page
+- the guest pte must be a large pte of at least equivalent size (if tdp is
+  enabled, there is no guest pte and this condition is satisified)
+- if the spte will be writeable, the large page frame may not overlap any
+  write-protected pages
+- the guest page must be wholly contained by a single memory slot
+
+To check the last two conditions, the mmu maintains a ->write_count set of
+arrays for each memory slot and large page size.  Every write protected page
+causes its write_count to be incremented, thus preventing instantiation of
+a large spte.  The frames at the end of an unaligned memory slot have
+artificically inflated ->write_counts so they can never be instantiated.
+
 Further reading
 ===============
 
-- 
cgit v1.2.3


From b66d80006e415ee083e59c9429911eab78047f8f Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Mon, 31 May 2010 17:11:39 +0800
Subject: KVM: MMU: Don't calculate quadrant if tdp_enabled

There's no need to calculate quadrant if tdp is enabled.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 69d40a6e1e68..d3cd102aee26 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1378,7 +1378,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	if (role.direct)
 		role.cr4_pae = 0;
 	role.access = access;
-	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+	if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		role.quadrant = quadrant;
-- 
cgit v1.2.3


From 5120702e732ed72c7055f511f8dd01de36424569 Mon Sep 17 00:00:00 2001
From: Mohammed Gamal <m.gamal005@gmail.com>
Date: Mon, 31 May 2010 22:40:54 +0300
Subject: KVM: VMX: Properly return error to userspace on vmentry failure

The vmexit handler returns KVM_EXIT_UNKNOWN since there is no handler
for vmentry failures. This intercepts vmentry failures and returns
KVM_FAIL_ENTRY to userspace instead.

Signed-off-by: Mohammed Gamal <m.gamal005@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e71c731433ee..8c9085d44c37 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3665,6 +3665,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	if (enable_ept && is_paging(vcpu))
 		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
 
+	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+		vcpu->run->fail_entry.hardware_entry_failure_reason
+			= exit_reason;
+		return 0;
+	}
+
 	if (unlikely(vmx->fail)) {
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
-- 
cgit v1.2.3


From 49f6be8ea1bd74713c1a48e42db06a3808dfa2cd Mon Sep 17 00:00:00 2001
From: Andreas Schwab <schwab@linux-m68k.org>
Date: Mon, 31 May 2010 21:59:13 +0200
Subject: KVM: PPC: elide struct thread_struct instances from stack

Instead of instantiating a whole thread_struct on the stack use only the
required parts of it.

Signed-off-by: Andreas Schwab <schwab@linux-m68k.org>
Tested-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/powerpc/include/asm/kvm_fpu.h       | 27 +++++----
 arch/powerpc/kernel/ppc_ksyms.c          |  4 --
 arch/powerpc/kvm/book3s.c                | 49 +++++++++--------
 arch/powerpc/kvm/book3s_paired_singles.c | 94 +++++++++++++-------------------
 arch/powerpc/kvm/fpu.S                   | 18 ++++++
 5 files changed, 97 insertions(+), 95 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_fpu.h b/arch/powerpc/include/asm/kvm_fpu.h
index 94f05de9ad04..c3d4f0518a67 100644
--- a/arch/powerpc/include/asm/kvm_fpu.h
+++ b/arch/powerpc/include/asm/kvm_fpu.h
@@ -22,24 +22,24 @@
 
 #include <linux/types.h>
 
-extern void fps_fres(struct thread_struct *t, u32 *dst, u32 *src1);
-extern void fps_frsqrte(struct thread_struct *t, u32 *dst, u32 *src1);
-extern void fps_fsqrts(struct thread_struct *t, u32 *dst, u32 *src1);
+extern void fps_fres(u64 *fpscr, u32 *dst, u32 *src1);
+extern void fps_frsqrte(u64 *fpscr, u32 *dst, u32 *src1);
+extern void fps_fsqrts(u64 *fpscr, u32 *dst, u32 *src1);
 
-extern void fps_fadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fdivs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fmuls(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fdivs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fmuls(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
 
-extern void fps_fmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
 		       u32 *src3);
-extern void fps_fmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
 		       u32 *src3);
-extern void fps_fnmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fnmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
 		        u32 *src3);
-extern void fps_fnmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fnmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
 		        u32 *src3);
-extern void fps_fsel(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fsel(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
 		     u32 *src3);
 
 #define FPD_ONE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
@@ -82,4 +82,7 @@ FPD_THREE_IN(fmadd)
 FPD_THREE_IN(fnmsub)
 FPD_THREE_IN(fnmadd)
 
+extern void kvm_cvt_fd(u32 *from, u64 *to, u64 *fpscr);
+extern void kvm_cvt_df(u64 *from, u32 *to, u64 *fpscr);
+
 #endif
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 3b4dcc82a4c1..ab3e392ac63c 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -101,10 +101,6 @@ EXPORT_SYMBOL(pci_dram_offset);
 EXPORT_SYMBOL(start_thread);
 EXPORT_SYMBOL(kernel_thread);
 
-#ifdef CONFIG_PPC_FPU
-EXPORT_SYMBOL_GPL(cvt_df);
-EXPORT_SYMBOL_GPL(cvt_fd);
-#endif
 EXPORT_SYMBOL(giveup_fpu);
 #ifdef CONFIG_ALTIVEC
 EXPORT_SYMBOL(giveup_altivec);
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index f6eac2f337d9..801d9f3c70ae 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1293,12 +1293,17 @@ extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
 	int ret;
-	struct thread_struct ext_bkp;
+	double fpr[32][TS_FPRWIDTH];
+	unsigned int fpscr;
+	int fpexc_mode;
 #ifdef CONFIG_ALTIVEC
-	bool save_vec = current->thread.used_vr;
+	vector128 vr[32];
+	vector128 vscr;
+	unsigned long uninitialized_var(vrsave);
+	int used_vr;
 #endif
 #ifdef CONFIG_VSX
-	bool save_vsx = current->thread.used_vsr;
+	int used_vsr;
 #endif
 	ulong ext_msr;
 
@@ -1311,27 +1316,27 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	/* Save FPU state in stack */
 	if (current->thread.regs->msr & MSR_FP)
 		giveup_fpu(current);
-	memcpy(ext_bkp.fpr, current->thread.fpr, sizeof(current->thread.fpr));
-	ext_bkp.fpscr = current->thread.fpscr;
-	ext_bkp.fpexc_mode = current->thread.fpexc_mode;
+	memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
+	fpscr = current->thread.fpscr.val;
+	fpexc_mode = current->thread.fpexc_mode;
 
 #ifdef CONFIG_ALTIVEC
 	/* Save Altivec state in stack */
-	if (save_vec) {
+	used_vr = current->thread.used_vr;
+	if (used_vr) {
 		if (current->thread.regs->msr & MSR_VEC)
 			giveup_altivec(current);
-		memcpy(ext_bkp.vr, current->thread.vr, sizeof(ext_bkp.vr));
-		ext_bkp.vscr = current->thread.vscr;
-		ext_bkp.vrsave = current->thread.vrsave;
+		memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
+		vscr = current->thread.vscr;
+		vrsave = current->thread.vrsave;
 	}
-	ext_bkp.used_vr = current->thread.used_vr;
 #endif
 
 #ifdef CONFIG_VSX
 	/* Save VSX state in stack */
-	if (save_vsx && (current->thread.regs->msr & MSR_VSX))
+	used_vsr = current->thread.used_vsr;
+	if (used_vsr && (current->thread.regs->msr & MSR_VSX))
 			__giveup_vsx(current);
-	ext_bkp.used_vsr = current->thread.used_vsr;
 #endif
 
 	/* Remember the MSR with disabled extensions */
@@ -1356,22 +1361,22 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	kvmppc_giveup_ext(vcpu, MSR_VSX);
 
 	/* Restore FPU state from stack */
-	memcpy(current->thread.fpr, ext_bkp.fpr, sizeof(ext_bkp.fpr));
-	current->thread.fpscr = ext_bkp.fpscr;
-	current->thread.fpexc_mode = ext_bkp.fpexc_mode;
+	memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
+	current->thread.fpscr.val = fpscr;
+	current->thread.fpexc_mode = fpexc_mode;
 
 #ifdef CONFIG_ALTIVEC
 	/* Restore Altivec state from stack */
-	if (save_vec && current->thread.used_vr) {
-		memcpy(current->thread.vr, ext_bkp.vr, sizeof(ext_bkp.vr));
-		current->thread.vscr = ext_bkp.vscr;
-		current->thread.vrsave= ext_bkp.vrsave;
+	if (used_vr && current->thread.used_vr) {
+		memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
+		current->thread.vscr = vscr;
+		current->thread.vrsave = vrsave;
 	}
-	current->thread.used_vr = ext_bkp.used_vr;
+	current->thread.used_vr = used_vr;
 #endif
 
 #ifdef CONFIG_VSX
-	current->thread.used_vsr = ext_bkp.used_vsr;
+	current->thread.used_vsr = used_vsr;
 #endif
 
 	return ret;
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
index a9f66abafcb3..474f2e24050a 100644
--- a/arch/powerpc/kvm/book3s_paired_singles.c
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -159,10 +159,7 @@
 
 static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
 {
-	struct thread_struct t;
-
-	t.fpscr.val = vcpu->arch.fpscr;
-	cvt_df((double*)&vcpu->arch.fpr[rt], (float*)&vcpu->arch.qpr[rt], &t);
+	kvm_cvt_df(&vcpu->arch.fpr[rt], &vcpu->arch.qpr[rt], &vcpu->arch.fpscr);
 }
 
 static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
@@ -183,7 +180,6 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				   int rs, ulong addr, int ls_type)
 {
 	int emulated = EMULATE_FAIL;
-	struct thread_struct t;
 	int r;
 	char tmp[8];
 	int len = sizeof(u32);
@@ -191,8 +187,6 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	if (ls_type == FPU_LS_DOUBLE)
 		len = sizeof(u64);
 
-	t.fpscr.val = vcpu->arch.fpscr;
-
 	/* read from memory */
 	r = kvmppc_ld(vcpu, &addr, len, tmp, true);
 	vcpu->arch.paddr_accessed = addr;
@@ -210,7 +204,7 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	/* put in registers */
 	switch (ls_type) {
 	case FPU_LS_SINGLE:
-		cvt_fd((float*)tmp, (double*)&vcpu->arch.fpr[rs], &t);
+		kvm_cvt_fd((u32*)tmp, &vcpu->arch.fpr[rs], &vcpu->arch.fpscr);
 		vcpu->arch.qpr[rs] = *((u32*)tmp);
 		break;
 	case FPU_LS_DOUBLE:
@@ -229,17 +223,14 @@ static int kvmppc_emulate_fpr_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				    int rs, ulong addr, int ls_type)
 {
 	int emulated = EMULATE_FAIL;
-	struct thread_struct t;
 	int r;
 	char tmp[8];
 	u64 val;
 	int len;
 
-	t.fpscr.val = vcpu->arch.fpscr;
-
 	switch (ls_type) {
 	case FPU_LS_SINGLE:
-		cvt_df((double*)&vcpu->arch.fpr[rs], (float*)tmp, &t);
+		kvm_cvt_df(&vcpu->arch.fpr[rs], (u32*)tmp, &vcpu->arch.fpscr);
 		val = *((u32*)tmp);
 		len = sizeof(u32);
 		break;
@@ -278,13 +269,10 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				   int rs, ulong addr, bool w, int i)
 {
 	int emulated = EMULATE_FAIL;
-	struct thread_struct t;
 	int r;
 	float one = 1.0;
 	u32 tmp[2];
 
-	t.fpscr.val = vcpu->arch.fpscr;
-
 	/* read from memory */
 	if (w) {
 		r = kvmppc_ld(vcpu, &addr, sizeof(u32), tmp, true);
@@ -308,7 +296,7 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	emulated = EMULATE_DONE;
 
 	/* put in registers */
-	cvt_fd((float*)&tmp[0], (double*)&vcpu->arch.fpr[rs], &t);
+	kvm_cvt_fd(&tmp[0], &vcpu->arch.fpr[rs], &vcpu->arch.fpscr);
 	vcpu->arch.qpr[rs] = tmp[1];
 
 	dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0],
@@ -322,14 +310,11 @@ static int kvmppc_emulate_psq_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				    int rs, ulong addr, bool w, int i)
 {
 	int emulated = EMULATE_FAIL;
-	struct thread_struct t;
 	int r;
 	u32 tmp[2];
 	int len = w ? sizeof(u32) : sizeof(u64);
 
-	t.fpscr.val = vcpu->arch.fpscr;
-
-	cvt_df((double*)&vcpu->arch.fpr[rs], (float*)&tmp[0], &t);
+	kvm_cvt_df(&vcpu->arch.fpr[rs], &tmp[0], &vcpu->arch.fpscr);
 	tmp[1] = vcpu->arch.qpr[rs];
 
 	r = kvmppc_st(vcpu, &addr, len, tmp, true);
@@ -517,7 +502,7 @@ static int get_d_signext(u32 inst)
 static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 				      int reg_out, int reg_in1, int reg_in2,
 				      int reg_in3, int scalar,
-				      void (*func)(struct thread_struct *t,
+				      void (*func)(u64 *fpscr,
 						 u32 *dst, u32 *src1,
 						 u32 *src2, u32 *src3))
 {
@@ -526,27 +511,25 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 	u32 ps0_out;
 	u32 ps0_in1, ps0_in2, ps0_in3;
 	u32 ps1_in1, ps1_in2, ps1_in3;
-	struct thread_struct t;
-	t.fpscr.val = vcpu->arch.fpscr;
 
 	/* RC */
 	WARN_ON(rc);
 
 	/* PS0 */
-	cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t);
-	cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t);
-	cvt_df((double*)&fpr[reg_in3], (float*)&ps0_in3, &t);
+	kvm_cvt_df(&fpr[reg_in1], &ps0_in1, &vcpu->arch.fpscr);
+	kvm_cvt_df(&fpr[reg_in2], &ps0_in2, &vcpu->arch.fpscr);
+	kvm_cvt_df(&fpr[reg_in3], &ps0_in3, &vcpu->arch.fpscr);
 
 	if (scalar & SCALAR_LOW)
 		ps0_in2 = qpr[reg_in2];
 
-	func(&t, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
+	func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
 
 	dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
 			  ps0_in1, ps0_in2, ps0_in3, ps0_out);
 
 	if (!(scalar & SCALAR_NO_PS0))
-		cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t);
+		kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
 
 	/* PS1 */
 	ps1_in1 = qpr[reg_in1];
@@ -557,7 +540,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 		ps1_in2 = ps0_in2;
 
 	if (!(scalar & SCALAR_NO_PS1))
-		func(&t, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
+		func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
 
 	dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
 			  ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]);
@@ -568,7 +551,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 				    int reg_out, int reg_in1, int reg_in2,
 				    int scalar,
-				    void (*func)(struct thread_struct *t,
+				    void (*func)(u64 *fpscr,
 						 u32 *dst, u32 *src1,
 						 u32 *src2))
 {
@@ -578,27 +561,25 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 	u32 ps0_in1, ps0_in2;
 	u32 ps1_out;
 	u32 ps1_in1, ps1_in2;
-	struct thread_struct t;
-	t.fpscr.val = vcpu->arch.fpscr;
 
 	/* RC */
 	WARN_ON(rc);
 
 	/* PS0 */
-	cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t);
+	kvm_cvt_df(&fpr[reg_in1], &ps0_in1, &vcpu->arch.fpscr);
 
 	if (scalar & SCALAR_LOW)
 		ps0_in2 = qpr[reg_in2];
 	else
-		cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t);
+		kvm_cvt_df(&fpr[reg_in2], &ps0_in2, &vcpu->arch.fpscr);
 
-	func(&t, &ps0_out, &ps0_in1, &ps0_in2);
+	func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2);
 
 	if (!(scalar & SCALAR_NO_PS0)) {
 		dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n",
 				  ps0_in1, ps0_in2, ps0_out);
 
-		cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t);
+		kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
 	}
 
 	/* PS1 */
@@ -608,7 +589,7 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 	if (scalar & SCALAR_HIGH)
 		ps1_in2 = ps0_in2;
 
-	func(&t, &ps1_out, &ps1_in1, &ps1_in2);
+	func(&vcpu->arch.fpscr, &ps1_out, &ps1_in1, &ps1_in2);
 
 	if (!(scalar & SCALAR_NO_PS1)) {
 		qpr[reg_out] = ps1_out;
@@ -622,31 +603,29 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 
 static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,
 				    int reg_out, int reg_in,
-				    void (*func)(struct thread_struct *t,
+				    void (*func)(u64 *t,
 						 u32 *dst, u32 *src1))
 {
 	u32 *qpr = vcpu->arch.qpr;
 	u64 *fpr = vcpu->arch.fpr;
 	u32 ps0_out, ps0_in;
 	u32 ps1_in;
-	struct thread_struct t;
-	t.fpscr.val = vcpu->arch.fpscr;
 
 	/* RC */
 	WARN_ON(rc);
 
 	/* PS0 */
-	cvt_df((double*)&fpr[reg_in], (float*)&ps0_in, &t);
-	func(&t, &ps0_out, &ps0_in);
+	kvm_cvt_df(&fpr[reg_in], &ps0_in, &vcpu->arch.fpscr);
+	func(&vcpu->arch.fpscr, &ps0_out, &ps0_in);
 
 	dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n",
 			  ps0_in, ps0_out);
 
-	cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t);
+	kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
 
 	/* PS1 */
 	ps1_in = qpr[reg_in];
-	func(&t, &qpr[reg_out], &ps1_in);
+	func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in);
 
 	dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n",
 			  ps1_in, qpr[reg_out]);
@@ -672,13 +651,10 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
 	bool rcomp = (inst & 1) ? true : false;
 	u32 cr = kvmppc_get_cr(vcpu);
-	struct thread_struct t;
 #ifdef DEBUG
 	int i;
 #endif
 
-	t.fpscr.val = vcpu->arch.fpscr;
-
 	if (!kvmppc_inst_is_paired_single(vcpu, inst))
 		return EMULATE_FAIL;
 
@@ -695,7 +671,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 #ifdef DEBUG
 	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
 		u32 f;
-		cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t);
+		kvm_cvt_df(&vcpu->arch.fpr[i], &f, &vcpu->arch.fpscr);
 		dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx    QPR[%d] = 0x%x\n",
 			i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]);
 	}
@@ -819,8 +795,9 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			WARN_ON(rcomp);
 			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra];
 			/* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
-			cvt_df((double*)&vcpu->arch.fpr[ax_rb],
-			       (float*)&vcpu->arch.qpr[ax_rd], &t);
+			kvm_cvt_df(&vcpu->arch.fpr[ax_rb],
+				   &vcpu->arch.qpr[ax_rd],
+				   &vcpu->arch.fpscr);
 			break;
 		case OP_4X_PS_MERGE01:
 			WARN_ON(rcomp);
@@ -830,17 +807,20 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		case OP_4X_PS_MERGE10:
 			WARN_ON(rcomp);
 			/* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
-			cvt_fd((float*)&vcpu->arch.qpr[ax_ra],
-			       (double*)&vcpu->arch.fpr[ax_rd], &t);
+			kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
+				   &vcpu->arch.fpr[ax_rd],
+				   &vcpu->arch.fpscr);
 			/* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
-			cvt_df((double*)&vcpu->arch.fpr[ax_rb],
-			       (float*)&vcpu->arch.qpr[ax_rd], &t);
+			kvm_cvt_df(&vcpu->arch.fpr[ax_rb],
+				   &vcpu->arch.qpr[ax_rd],
+				   &vcpu->arch.fpscr);
 			break;
 		case OP_4X_PS_MERGE11:
 			WARN_ON(rcomp);
 			/* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
-			cvt_fd((float*)&vcpu->arch.qpr[ax_ra],
-			       (double*)&vcpu->arch.fpr[ax_rd], &t);
+			kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
+				   &vcpu->arch.fpr[ax_rd],
+				   &vcpu->arch.fpscr);
 			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
 			break;
 		}
@@ -1275,7 +1255,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 #ifdef DEBUG
 	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
 		u32 f;
-		cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t);
+		kvm_cvt_df(&vcpu->arch.fpr[i], &f, &vcpu->arch.fpscr);
 		dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f);
 	}
 #endif
diff --git a/arch/powerpc/kvm/fpu.S b/arch/powerpc/kvm/fpu.S
index 2b340a3eee90..cb34bbe16113 100644
--- a/arch/powerpc/kvm/fpu.S
+++ b/arch/powerpc/kvm/fpu.S
@@ -271,3 +271,21 @@ FPD_THREE_IN(fmsub)
 FPD_THREE_IN(fmadd)
 FPD_THREE_IN(fnmsub)
 FPD_THREE_IN(fnmadd)
+
+_GLOBAL(kvm_cvt_fd)
+	lfd	0,0(r5)			/* load up fpscr value */
+	MTFSF_L(0)
+	lfs	0,0(r3)
+	stfd	0,0(r4)
+	mffs	0
+	stfd	0,0(r5)			/* save new fpscr value */
+	blr
+
+_GLOBAL(kvm_cvt_df)
+	lfd	0,0(r5)			/* load up fpscr value */
+	MTFSF_L(0)
+	lfd	0,0(r3)
+	stfs	0,0(r4)
+	mffs	0
+	stfd	0,0(r5)			/* save new fpscr value */
+	blr
-- 
cgit v1.2.3


From d2d7a61127ced8ce9ec7666991aad1d846932212 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 1 Jun 2010 08:22:48 -0400
Subject: KVM: Add Documentation/kvm/msr.txt

This patch adds a file that documents the usage of KVM-specific
MSRs.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Reviewed-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/kvm/msr.txt | 153 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 153 insertions(+)
 create mode 100644 Documentation/kvm/msr.txt

diff --git a/Documentation/kvm/msr.txt b/Documentation/kvm/msr.txt
new file mode 100644
index 000000000000..8ddcfe84c09a
--- /dev/null
+++ b/Documentation/kvm/msr.txt
@@ -0,0 +1,153 @@
+KVM-specific MSRs.
+Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
+=====================================================
+
+KVM makes use of some custom MSRs to service some requests.
+At present, this facility is only used by kvmclock.
+
+Custom MSRs have a range reserved for them, that goes from
+0x4b564d00 to 0x4b564dff. There are MSRs outside this area,
+but they are deprecated and their use is discouraged.
+
+Custom MSR list
+--------
+
+The current supported Custom MSR list is:
+
+MSR_KVM_WALL_CLOCK_NEW:   0x4b564d00
+
+	data: 4-byte alignment physical address of a memory area which must be
+	in guest RAM. This memory is expected to hold a copy of the following
+	structure:
+
+	struct pvclock_wall_clock {
+		u32   version;
+		u32   sec;
+		u32   nsec;
+	} __attribute__((__packed__));
+
+	whose data will be filled in by the hypervisor. The hypervisor is only
+	guaranteed to update this data at the moment of MSR write.
+	Users that want to reliably query this information more than once have
+	to write more than once to this MSR. Fields have the following meanings:
+
+		version: guest has to check version before and after grabbing
+		time information and check that they are both equal and even.
+		An odd version indicates an in-progress update.
+
+		sec: number of seconds for wallclock.
+
+		nsec: number of nanoseconds for wallclock.
+
+	Note that although MSRs are per-CPU entities, the effect of this
+	particular MSR is global.
+
+	Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid
+	leaf prior to usage.
+
+MSR_KVM_SYSTEM_TIME_NEW:  0x4b564d01
+
+	data: 4-byte aligned physical address of a memory area which must be in
+	guest RAM, plus an enable bit in bit 0. This memory is expected to hold
+	a copy of the following structure:
+
+	struct pvclock_vcpu_time_info {
+		u32   version;
+		u32   pad0;
+		u64   tsc_timestamp;
+		u64   system_time;
+		u32   tsc_to_system_mul;
+		s8    tsc_shift;
+		u8    flags;
+		u8    pad[2];
+	} __attribute__((__packed__)); /* 32 bytes */
+
+	whose data will be filled in by the hypervisor periodically. Only one
+	write, or registration, is needed for each VCPU. The interval between
+	updates of this structure is arbitrary and implementation-dependent.
+	The hypervisor may update this structure at any time it sees fit until
+	anything with bit0 == 0 is written to it.
+
+	Fields have the following meanings:
+
+		version: guest has to check version before and after grabbing
+		time information and check that they are both equal and even.
+		An odd version indicates an in-progress update.
+
+		tsc_timestamp: the tsc value at the current VCPU at the time
+		of the update of this structure. Guests can subtract this value
+		from current tsc to derive a notion of elapsed time since the
+		structure update.
+
+		system_time: a host notion of monotonic time, including sleep
+		time at the time this structure was last updated. Unit is
+		nanoseconds.
+
+		tsc_to_system_mul: a function of the tsc frequency. One has
+		to multiply any tsc-related quantity by this value to get
+		a value in nanoseconds, besides dividing by 2^tsc_shift
+
+		tsc_shift: cycle to nanosecond divider, as a power of two, to
+		allow for shift rights. One has to shift right any tsc-related
+		quantity by this value to get a value in nanoseconds, besides
+		multiplying by tsc_to_system_mul.
+
+		With this information, guests can derive per-CPU time by
+		doing:
+
+			time = (current_tsc - tsc_timestamp)
+			time = (time * tsc_to_system_mul) >> tsc_shift
+			time = time + system_time
+
+		flags: bits in this field indicate extended capabilities
+		coordinated between the guest and the hypervisor. Availability
+		of specific flags has to be checked in 0x40000001 cpuid leaf.
+		Current flags are:
+
+		 flag bit   | cpuid bit    | meaning
+		-------------------------------------------------------------
+			    |	           | time measures taken across
+		     0      |	   24      | multiple cpus are guaranteed to
+			    |		   | be monotonic
+		-------------------------------------------------------------
+
+	Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid
+	leaf prior to usage.
+
+
+MSR_KVM_WALL_CLOCK:  0x11
+
+	data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead.
+
+	This MSR falls outside the reserved KVM range and may be removed in the
+	future. Its usage is deprecated.
+
+	Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid
+	leaf prior to usage.
+
+MSR_KVM_SYSTEM_TIME: 0x12
+
+	data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead.
+
+	This MSR falls outside the reserved KVM range and may be removed in the
+	future. Its usage is deprecated.
+
+	Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid
+	leaf prior to usage.
+
+	The suggested algorithm for detecting kvmclock presence is then:
+
+		if (!kvm_para_available())    /* refer to cpuid.txt */
+			return NON_PRESENT;
+
+		flags = cpuid_eax(0x40000001);
+		if (flags & 3) {
+			msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+			msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+			return PRESENT;
+		} else if (flags & 0) {
+			msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+			msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+			return PRESENT;
+		} else
+			return NON_PRESENT;
-- 
cgit v1.2.3


From 4bc9b9828150747386130ab172f7e868e1a0fc2a Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 2 Jun 2010 14:05:24 +0800
Subject: KVM: VMX: Enforce EPT pagetable level checking

We only support 4 levels EPT pagetable now.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8c9085d44c37..2201e3816205 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -340,6 +340,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
 	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
 }
 
+static inline bool cpu_has_vmx_ept_4levels(void)
+{
+	return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
+}
+
 static inline bool cpu_has_vmx_invept_individual_addr(void)
 {
 	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -1568,7 +1573,8 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_vpid())
 		enable_vpid = 0;
 
-	if (!cpu_has_vmx_ept()) {
+	if (!cpu_has_vmx_ept() ||
+	    !cpu_has_vmx_ept_4levels()) {
 		enable_ept = 0;
 		enable_unrestricted_guest = 0;
 	}
-- 
cgit v1.2.3


From 3bd89007ab4243b36bc30f5fc4aa8b4f75ff8fc2 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 2 Jun 2010 17:01:23 +0800
Subject: KVM: cleanup "*new.rmap" type

The type of '*new.rmap' is not 'struct page *', fix it

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/kvm_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4a71faa96dfd..78ed71ad6aa1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -600,7 +600,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	/* Allocate if a slot is being created */
 #ifndef CONFIG_S390
 	if (npages && !new.rmap) {
-		new.rmap = vmalloc(npages * sizeof(struct page *));
+		new.rmap = vmalloc(npages * sizeof(*new.rmap));
 
 		if (!new.rmap)
 			goto out_free;
-- 
cgit v1.2.3


From 7bee342a9e994cce7122cb187b4f3ded9d871165 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 2 Jun 2010 17:06:03 +0800
Subject: KVM: x86: use linux/uaccess.h instead of asm/uaccess.h

Should use linux/uaccess.h instead of asm/uaccess.h

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e6e0d7781af7..b08c0052e332 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -42,13 +42,13 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/perf_event.h>
+#include <linux/uaccess.h>
 #include <trace/events/kvm.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
 #include <asm/debugreg.h>
-#include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
 #include <asm/mtrr.h>
-- 
cgit v1.2.3


From 518c8aee5ca74fc03273fc6b4893cf456d65d545 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Fri, 4 Jun 2010 08:51:39 +0800
Subject: KVM: VMX: Make sure single type invvpid is supported before issuing
 invvpid instruction

According to SDM, we need check whether single-context INVVPID type is supported
before issuing invvpid instruction.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Reviewed-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/vmx.h | 2 ++
 arch/x86/kvm/vmx.c         | 8 +++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 104cf86a7562..b4e28400c9ff 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -376,6 +376,8 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
 
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT      (1ull << 9) /* (41 - 32) */
+
 #define VMX_EPT_DEFAULT_GAW			3
 #define VMX_EPT_MAX_GAW				0x4
 #define VMX_EPT_MT_EPTE_SHIFT			3
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2201e3816205..945265361885 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -360,6 +360,11 @@ static inline bool cpu_has_vmx_invept_global(void)
 	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
 }
 
+static inline bool cpu_has_vmx_invvpid_single(void)
+{
+	return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
+}
+
 static inline bool cpu_has_vmx_ept(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -504,7 +509,8 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
 	if (vmx->vpid == 0)
 		return;
 
-	__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+	if (cpu_has_vmx_invvpid_single())
+		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
 }
 
 static inline void ept_sync_global(void)
-- 
cgit v1.2.3


From 03116aa57e75b1bbe8b5e04f3cd21cdb6588c4ba Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 4 Jun 2010 21:52:17 +0800
Subject: KVM: MMU: skip invalid sp when unprotect page

In kvm_mmu_unprotect_page(), the invalid sp can be skipped

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d3cd102aee26..3ac51153bc47 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1629,7 +1629,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 	bucket = &kvm->arch.mmu_page_hash[index];
 restart:
 	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
-		if (sp->gfn == gfn && !sp->role.direct) {
+		if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) {
 			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 				 sp->role.word);
 			r = 1;
-- 
cgit v1.2.3


From 7ae680eb2d5f0cb10ca0e6d1ff5ecb145befe8e4 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 4 Jun 2010 21:53:07 +0800
Subject: KVM: MMU: introduce some macros to cleanup hlist traverseing

Introduce for_each_gfn_sp() and for_each_gfn_indirect_valid_sp() to
cleanup hlist traverseing

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 122 +++++++++++++++++++++--------------------------------
 1 file changed, 47 insertions(+), 75 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3ac51153bc47..881ad918455c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1201,6 +1201,17 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 
 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 
+#define for_each_gfn_sp(kvm, sp, gfn, pos, n)				\
+  hlist_for_each_entry_safe(sp, pos, n,					\
+   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
+	if ((sp)->gfn != (gfn)) {} else
+
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos, n)		\
+  hlist_for_each_entry_safe(sp, pos, n,					\
+   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
+		if ((sp)->gfn != (gfn) || (sp)->role.direct ||		\
+			(sp)->role.invalid) {} else
+
 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			   bool clear_unsync)
 {
@@ -1244,16 +1255,12 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 /* @gfn should be write-protected at the call site */
 static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 {
-	struct hlist_head *bucket;
 	struct kvm_mmu_page *s;
 	struct hlist_node *node, *n;
-	unsigned index;
 	bool flush = false;
 
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
-		if (s->gfn != gfn || !s->unsync || s->role.invalid)
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) {
+		if (!s->unsync)
 			continue;
 
 		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
@@ -1365,9 +1372,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     u64 *parent_pte)
 {
 	union kvm_mmu_page_role role;
-	unsigned index;
 	unsigned quadrant;
-	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node, *tmp;
 	bool need_sync = false;
@@ -1383,36 +1388,34 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		role.quadrant = quadrant;
 	}
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
-		if (sp->gfn == gfn) {
-			if (!need_sync && sp->unsync)
-				need_sync = true;
+	for_each_gfn_sp(vcpu->kvm, sp, gfn, node, tmp) {
+		if (!need_sync && sp->unsync)
+			need_sync = true;
 
-			if (sp->role.word != role.word)
-				continue;
+		if (sp->role.word != role.word)
+			continue;
 
-			if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
-				break;
+		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
+			break;
 
-			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
-			if (sp->unsync_children) {
-				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
-				kvm_mmu_mark_parents_unsync(sp);
-			} else if (sp->unsync)
-				kvm_mmu_mark_parents_unsync(sp);
+		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+		if (sp->unsync_children) {
+			set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+			kvm_mmu_mark_parents_unsync(sp);
+		} else if (sp->unsync)
+			kvm_mmu_mark_parents_unsync(sp);
 
-			trace_kvm_mmu_get_page(sp, false);
-			return sp;
-		}
+		trace_kvm_mmu_get_page(sp, false);
+		return sp;
+	}
 	++vcpu->kvm->stat.mmu_cache_miss;
 	sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
 	if (!sp)
 		return sp;
 	sp->gfn = gfn;
 	sp->role = role;
-	hlist_add_head(&sp->hash_link, bucket);
+	hlist_add_head(&sp->hash_link,
+		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
 	if (!direct) {
 		if (rmap_write_protect(vcpu->kvm, gfn))
 			kvm_flush_remote_tlbs(vcpu->kvm);
@@ -1617,46 +1620,34 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
-	unsigned index;
-	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node, *n;
 	int r;
 
 	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
 	r = 0;
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &kvm->arch.mmu_page_hash[index];
 restart:
-	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
-		if (sp->gfn == gfn && !sp->role.direct && !sp->role.invalid) {
-			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
-				 sp->role.word);
-			r = 1;
-			if (kvm_mmu_zap_page(kvm, sp))
-				goto restart;
-		}
+	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, n) {
+		pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
+			 sp->role.word);
+		r = 1;
+		if (kvm_mmu_zap_page(kvm, sp))
+			goto restart;
+	}
 	return r;
 }
 
 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 {
-	unsigned index;
-	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node, *nn;
 
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &kvm->arch.mmu_page_hash[index];
 restart:
-	hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
-		if (sp->gfn == gfn && !sp->role.direct
-		    && !sp->role.invalid) {
-			pgprintk("%s: zap %lx %x\n",
-				 __func__, gfn, sp->role.word);
-			if (kvm_mmu_zap_page(kvm, sp))
-				goto restart;
-		}
+	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) {
+		pgprintk("%s: zap %lx %x\n",
+			 __func__, gfn, sp->role.word);
+		if (kvm_mmu_zap_page(kvm, sp))
+			goto restart;
 	}
 }
 
@@ -1799,17 +1790,11 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 {
-	struct hlist_head *bucket;
 	struct kvm_mmu_page *s;
 	struct hlist_node *node, *n;
-	unsigned index;
-
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 
-	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
-		if (s->gfn != gfn || s->role.direct || s->unsync ||
-		      s->role.invalid)
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) {
+		if (s->unsync)
 			continue;
 		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
 		__kvm_unsync_page(vcpu, s);
@@ -1819,18 +1804,11 @@ static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 				  bool can_unsync)
 {
-	unsigned index;
-	struct hlist_head *bucket;
 	struct kvm_mmu_page *s;
 	struct hlist_node *node, *n;
 	bool need_unsync = false;
 
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
-		if (s->gfn != gfn || s->role.direct || s->role.invalid)
-			continue;
-
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) {
 		if (s->role.level != PT_PAGE_TABLE_LEVEL)
 			return 1;
 
@@ -2703,8 +2681,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node, *n;
-	struct hlist_head *bucket;
-	unsigned index;
 	u64 entry, gentry;
 	u64 *spte;
 	unsigned offset = offset_in_page(gpa);
@@ -2772,13 +2748,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			vcpu->arch.last_pte_updated = NULL;
 		}
 	}
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 
 restart:
-	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
-		if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
-			continue;
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node, n) {
 		pte_size = sp->role.cr4_pae ? 8 : 4;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 		misaligned |= bytes < 4;
-- 
cgit v1.2.3


From 7775834a233478ec855b97e30727248f12eafe76 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 4 Jun 2010 21:53:54 +0800
Subject: KVM: MMU: split the operations of kvm_mmu_zap_page()

Using kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page() to
split kvm_mmu_zap_page() function, then we can:

- traverse hlist safely
- easily to gather remote tlb flush which occurs during page zapped

Those feature can be used in the later patches

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c      | 52 ++++++++++++++++++++++++++++++++++++++++---------
 arch/x86/kvm/mmutrace.h |  2 +-
 2 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 881ad918455c..9b849a70742d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -916,6 +916,7 @@ static int is_empty_shadow_page(u64 *spt)
 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
+	hlist_del(&sp->hash_link);
 	list_del(&sp->link);
 	__free_page(virt_to_page(sp->spt));
 	if (!sp->role.direct)
@@ -1200,6 +1201,10 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 }
 
 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				    struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+				    struct list_head *invalid_list);
 
 #define for_each_gfn_sp(kvm, sp, gfn, pos, n)				\
   hlist_for_each_entry_safe(sp, pos, n,					\
@@ -1530,7 +1535,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 }
 
 static int mmu_zap_unsync_children(struct kvm *kvm,
-				   struct kvm_mmu_page *parent)
+				   struct kvm_mmu_page *parent,
+				   struct list_head *invalid_list)
 {
 	int i, zapped = 0;
 	struct mmu_page_path parents;
@@ -1544,7 +1550,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
 		struct kvm_mmu_page *sp;
 
 		for_each_sp(pages, sp, parents, i) {
-			kvm_mmu_zap_page(kvm, sp);
+			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 			mmu_pages_clear_parents(&parents);
 			zapped++;
 		}
@@ -1554,16 +1560,16 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
 	return zapped;
 }
 
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				    struct list_head *invalid_list)
 {
 	int ret;
 
-	trace_kvm_mmu_zap_page(sp);
+	trace_kvm_mmu_prepare_zap_page(sp);
 	++kvm->stat.mmu_shadow_zapped;
-	ret = mmu_zap_unsync_children(kvm, sp);
+	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
-	kvm_flush_remote_tlbs(kvm);
 	if (!sp->role.invalid && !sp->role.direct)
 		unaccount_shadowed(kvm, sp->gfn);
 	if (sp->unsync)
@@ -1571,17 +1577,45 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	if (!sp->root_count) {
 		/* Count self */
 		ret++;
-		hlist_del(&sp->hash_link);
-		kvm_mmu_free_page(kvm, sp);
+		list_move(&sp->link, invalid_list);
 	} else {
-		sp->role.invalid = 1;
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		kvm_reload_remote_mmus(kvm);
 	}
+
+	sp->role.invalid = 1;
 	kvm_mmu_reset_last_pte_updated(kvm);
 	return ret;
 }
 
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+				    struct list_head *invalid_list)
+{
+	struct kvm_mmu_page *sp;
+
+	if (list_empty(invalid_list))
+		return;
+
+	kvm_flush_remote_tlbs(kvm);
+
+	do {
+		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+		WARN_ON(!sp->role.invalid || sp->root_count);
+		kvm_mmu_free_page(kvm, sp);
+	} while (!list_empty(invalid_list));
+
+}
+
+static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	LIST_HEAD(invalid_list);
+	int ret;
+
+	ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	return ret;
+}
+
 /*
  * Changing the number of mmu pages allocated to the vm
  * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 42f07b1bfbc9..3aab0f0930ef 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
 	TP_ARGS(sp)
 );
 
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 	TP_PROTO(struct kvm_mmu_page *sp),
 
 	TP_ARGS(sp)
-- 
cgit v1.2.3


From 103ad25a86a6ec5418b3dca6a0d2bf2ba01a8318 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 4 Jun 2010 21:54:38 +0800
Subject: KVM: MMU: don't get free page number in the loop

In the later patch, we will modify sp's zapping way like below:

	kvm_mmu_prepare_zap_page A
	kvm_mmu_prepare_zap_page B
	kvm_mmu_prepare_zap_page C
	....
	kvm_mmu_commit_zap_page

[ zaped multiple sps only need to call kvm_mmu_commit_zap_page once ]

In __kvm_mmu_free_some_pages() function, the free page number is
getted form 'vcpu->kvm->arch.n_free_mmu_pages' in loop, it will
hinders us to apply kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page()
since kvm_mmu_prepare_zap_page() not free sp.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9b849a70742d..1aad8e713f78 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2863,13 +2863,16 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
-	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
+	int free_pages;
+
+	free_pages = vcpu->kvm->arch.n_free_mmu_pages;
+	while (free_pages < KVM_REFILL_PAGES &&
 	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
 		struct kvm_mmu_page *sp;
 
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
-		kvm_mmu_zap_page(vcpu->kvm, sp);
+		free_pages += kvm_mmu_zap_page(vcpu->kvm, sp);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
 }
-- 
cgit v1.2.3


From d98ba053656c033180781007241f2c9d54606d56 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 4 Jun 2010 21:55:29 +0800
Subject: KVM: MMU: gather remote tlb flush which occurs during page zapped

Using kvm_mmu_prepare_zap_page() and kvm_mmu_zap_page() instead of
kvm_mmu_zap_page() that can reduce remote tlb flush IPI

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 84 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 53 insertions(+), 31 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1aad8e713f78..44548e346976 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1200,7 +1200,6 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	--kvm->stat.mmu_unsync;
 }
 
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
 static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 				    struct list_head *invalid_list);
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
@@ -1218,10 +1217,10 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 			(sp)->role.invalid) {} else
 
 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
-			   bool clear_unsync)
+			   struct list_head *invalid_list, bool clear_unsync)
 {
 	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
-		kvm_mmu_zap_page(vcpu->kvm, sp);
+		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		return 1;
 	}
 
@@ -1232,7 +1231,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	}
 
 	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
-		kvm_mmu_zap_page(vcpu->kvm, sp);
+		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		return 1;
 	}
 
@@ -1244,17 +1243,22 @@ static void mmu_convert_notrap(struct kvm_mmu_page *sp);
 static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
 				   struct kvm_mmu_page *sp)
 {
+	LIST_HEAD(invalid_list);
 	int ret;
 
-	ret = __kvm_sync_page(vcpu, sp, false);
+	ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
 	if (!ret)
 		mmu_convert_notrap(sp);
+	else
+		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
 	return ret;
 }
 
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			 struct list_head *invalid_list)
 {
-	return __kvm_sync_page(vcpu, sp, true);
+	return __kvm_sync_page(vcpu, sp, invalid_list, true);
 }
 
 /* @gfn should be write-protected at the call site */
@@ -1262,6 +1266,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 {
 	struct kvm_mmu_page *s;
 	struct hlist_node *node, *n;
+	LIST_HEAD(invalid_list);
 	bool flush = false;
 
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) {
@@ -1271,13 +1276,14 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
 		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
 			(vcpu->arch.mmu.sync_page(vcpu, s))) {
-			kvm_mmu_zap_page(vcpu->kvm, s);
+			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
 			continue;
 		}
 		kvm_unlink_unsync_page(vcpu->kvm, s);
 		flush = true;
 	}
 
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 	if (flush)
 		kvm_mmu_flush_tlb(vcpu);
 }
@@ -1348,6 +1354,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 	struct kvm_mmu_page *sp;
 	struct mmu_page_path parents;
 	struct kvm_mmu_pages pages;
+	LIST_HEAD(invalid_list);
 
 	kvm_mmu_pages_init(parent, &parents, &pages);
 	while (mmu_unsync_walk(parent, &pages)) {
@@ -1360,9 +1367,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 			kvm_flush_remote_tlbs(vcpu->kvm);
 
 		for_each_sp(pages, sp, parents, i) {
-			kvm_sync_page(vcpu, sp);
+			kvm_sync_page(vcpu, sp, &invalid_list);
 			mmu_pages_clear_parents(&parents);
 		}
+		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 		cond_resched_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_pages_init(parent, &parents, &pages);
 	}
@@ -1606,16 +1614,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
 }
 
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-	LIST_HEAD(invalid_list);
-	int ret;
-
-	ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-	return ret;
-}
-
 /*
  * Changing the number of mmu pages allocated to the vm
  * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
@@ -1623,6 +1621,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 {
 	int used_pages;
+	LIST_HEAD(invalid_list);
 
 	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
 	used_pages = max(0, used_pages);
@@ -1640,8 +1639,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 					    struct kvm_mmu_page, link);
-			used_pages -= kvm_mmu_zap_page(kvm, page);
+			used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
+							       &invalid_list);
 		}
+		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		kvm_nr_mmu_pages = used_pages;
 		kvm->arch.n_free_mmu_pages = 0;
 	}
@@ -1656,6 +1657,7 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node, *n;
+	LIST_HEAD(invalid_list);
 	int r;
 
 	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
@@ -1665,9 +1667,10 @@ restart:
 		pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 			 sp->role.word);
 		r = 1;
-		if (kvm_mmu_zap_page(kvm, sp))
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
 			goto restart;
 	}
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 	return r;
 }
 
@@ -1675,14 +1678,16 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node, *nn;
+	LIST_HEAD(invalid_list);
 
 restart:
 	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) {
 		pgprintk("%s: zap %lx %x\n",
 			 __func__, gfn, sp->role.word);
-		if (kvm_mmu_zap_page(kvm, sp))
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
 			goto restart;
 	}
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
 
 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
@@ -2123,6 +2128,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
 	struct kvm_mmu_page *sp;
+	LIST_HEAD(invalid_list);
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
@@ -2132,8 +2138,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 
 		sp = page_header(root);
 		--sp->root_count;
-		if (!sp->root_count && sp->role.invalid)
-			kvm_mmu_zap_page(vcpu->kvm, sp);
+		if (!sp->root_count && sp->role.invalid) {
+			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+		}
 		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		return;
@@ -2146,10 +2154,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 			sp = page_header(root);
 			--sp->root_count;
 			if (!sp->root_count && sp->role.invalid)
-				kvm_mmu_zap_page(vcpu->kvm, sp);
+				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+							 &invalid_list);
 		}
 		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
 	}
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 }
@@ -2715,6 +2725,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node, *n;
+	LIST_HEAD(invalid_list);
 	u64 entry, gentry;
 	u64 *spte;
 	unsigned offset = offset_in_page(gpa);
@@ -2801,7 +2812,8 @@ restart:
 			 */
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
-			if (kvm_mmu_zap_page(vcpu->kvm, sp))
+			if (kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+						     &invalid_list))
 				goto restart;
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
@@ -2836,6 +2848,7 @@ restart:
 			++spte;
 		}
 	}
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 	kvm_mmu_audit(vcpu, "post pte write");
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
@@ -2864,6 +2877,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
 	int free_pages;
+	LIST_HEAD(invalid_list);
 
 	free_pages = vcpu->kvm->arch.n_free_mmu_pages;
 	while (free_pages < KVM_REFILL_PAGES &&
@@ -2872,9 +2886,11 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
-		free_pages += kvm_mmu_zap_page(vcpu->kvm, sp);
+		free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+						       &invalid_list);
 		++vcpu->kvm->stat.mmu_recycled;
 	}
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -3009,25 +3025,28 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 void kvm_mmu_zap_all(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(invalid_list);
 
 	spin_lock(&kvm->mmu_lock);
 restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-		if (kvm_mmu_zap_page(kvm, sp))
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
 			goto restart;
 
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 	spin_unlock(&kvm->mmu_lock);
 
 	kvm_flush_remote_tlbs(kvm);
 }
 
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+					       struct list_head *invalid_list)
 {
 	struct kvm_mmu_page *page;
 
 	page = container_of(kvm->arch.active_mmu_pages.prev,
 			    struct kvm_mmu_page, link);
-	return kvm_mmu_zap_page(kvm, page);
+	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
 }
 
 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
@@ -3040,6 +3059,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		int npages, idx, freed_pages;
+		LIST_HEAD(invalid_list);
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
@@ -3047,12 +3067,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 			 kvm->arch.n_free_mmu_pages;
 		cache_count += npages;
 		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
-			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
+			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+							  &invalid_list);
 			cache_count -= freed_pages;
 			kvm_freed = kvm;
 		}
 		nr_to_scan--;
 
+		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
 	}
-- 
cgit v1.2.3


From f41d335a02d5132c14ec0459d3b2790eeb16fb11 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 4 Jun 2010 21:56:11 +0800
Subject: KVM: MMU: traverse sp hlish safely

Now, we can safely to traverse sp hlish

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 51 +++++++++++++++++++++++----------------------------
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 44548e346976..3b75689eda95 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1205,13 +1205,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list);
 
-#define for_each_gfn_sp(kvm, sp, gfn, pos, n)				\
-  hlist_for_each_entry_safe(sp, pos, n,					\
+#define for_each_gfn_sp(kvm, sp, gfn, pos)				\
+  hlist_for_each_entry(sp, pos,						\
    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
 	if ((sp)->gfn != (gfn)) {} else
 
-#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos, n)		\
-  hlist_for_each_entry_safe(sp, pos, n,					\
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)		\
+  hlist_for_each_entry(sp, pos,						\
    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
 		if ((sp)->gfn != (gfn) || (sp)->role.direct ||		\
 			(sp)->role.invalid) {} else
@@ -1265,11 +1265,11 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 {
 	struct kvm_mmu_page *s;
-	struct hlist_node *node, *n;
+	struct hlist_node *node;
 	LIST_HEAD(invalid_list);
 	bool flush = false;
 
-	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) {
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
 		if (!s->unsync)
 			continue;
 
@@ -1387,7 +1387,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	union kvm_mmu_page_role role;
 	unsigned quadrant;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
 	bool need_sync = false;
 
 	role = vcpu->arch.mmu.base_role;
@@ -1401,7 +1401,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		role.quadrant = quadrant;
 	}
-	for_each_gfn_sp(vcpu->kvm, sp, gfn, node, tmp) {
+	for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
 		if (!need_sync && sp->unsync)
 			need_sync = true;
 
@@ -1656,19 +1656,18 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node, *n;
+	struct hlist_node *node;
 	LIST_HEAD(invalid_list);
 	int r;
 
 	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
 	r = 0;
-restart:
-	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, n) {
+
+	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
 		pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
 			 sp->role.word);
 		r = 1;
-		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-			goto restart;
+		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 	}
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 	return r;
@@ -1677,15 +1676,13 @@ restart:
 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node, *nn;
+	struct hlist_node *node;
 	LIST_HEAD(invalid_list);
 
-restart:
-	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node, nn) {
+	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
 		pgprintk("%s: zap %lx %x\n",
 			 __func__, gfn, sp->role.word);
-		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-			goto restart;
+		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 	}
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
@@ -1830,9 +1827,9 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 {
 	struct kvm_mmu_page *s;
-	struct hlist_node *node, *n;
+	struct hlist_node *node;
 
-	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) {
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
 		if (s->unsync)
 			continue;
 		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
@@ -1844,10 +1841,10 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 				  bool can_unsync)
 {
 	struct kvm_mmu_page *s;
-	struct hlist_node *node, *n;
+	struct hlist_node *node;
 	bool need_unsync = false;
 
-	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node, n) {
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
 		if (s->role.level != PT_PAGE_TABLE_LEVEL)
 			return 1;
 
@@ -2724,7 +2721,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node, *n;
+	struct hlist_node *node;
 	LIST_HEAD(invalid_list);
 	u64 entry, gentry;
 	u64 *spte;
@@ -2794,8 +2791,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		}
 	}
 
-restart:
-	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node, n) {
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
 		pte_size = sp->role.cr4_pae ? 8 : 4;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 		misaligned |= bytes < 4;
@@ -2812,9 +2808,8 @@ restart:
 			 */
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
-			if (kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
-						     &invalid_list))
-				goto restart;
+			kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+						     &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 		}
-- 
cgit v1.2.3


From 0671a8e75d8aeb33e15c5152147abb0d2fa0c1e6 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 4 Jun 2010 21:56:59 +0800
Subject: KVM: MMU: reduce remote tlb flush in kvm_mmu_pte_write()

collect remote tlb flush in kvm_mmu_pte_write() path

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3b75689eda95..b285449e82b0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2666,11 +2666,15 @@ static bool need_remote_flush(u64 old, u64 new)
 	return (old & ~new & PT64_PERM_MASK) != 0;
 }
 
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
+				    bool remote_flush, bool local_flush)
 {
-	if (need_remote_flush(old, new))
+	if (zap_page)
+		return;
+
+	if (remote_flush)
 		kvm_flush_remote_tlbs(vcpu->kvm);
-	else
+	else if (local_flush)
 		kvm_mmu_flush_tlb(vcpu);
 }
 
@@ -2735,6 +2739,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int npte;
 	int r;
 	int invlpg_counter;
+	bool remote_flush, local_flush, zap_page;
+
+	zap_page = remote_flush = local_flush = false;
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
@@ -2808,7 +2815,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			 */
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
-			kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
 						     &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
@@ -2833,16 +2840,19 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			if (quadrant != sp->role.quadrant)
 				continue;
 		}
+		local_flush = true;
 		spte = &sp->spt[page_offset / sizeof(*spte)];
 		while (npte--) {
 			entry = *spte;
 			mmu_pte_write_zap_pte(vcpu, sp, spte);
 			if (gentry)
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
-			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+			if (!remote_flush && need_remote_flush(entry, *spte))
+				remote_flush = true;
 			++spte;
 		}
 	}
+	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 	kvm_mmu_audit(vcpu, "post pte write");
 	spin_unlock(&vcpu->kvm->mmu_lock);
-- 
cgit v1.2.3


From b9d762fa79f541ab480cdb733b46fdb0b4471c2d Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Mon, 7 Jun 2010 10:32:29 +0800
Subject: KVM: VMX: Add all-context INVVPID type support

Add all-context INVVPID type support.

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/vmx.h |  1 +
 arch/x86/kvm/vmx.c         | 23 +++++++++++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index b4e28400c9ff..96a5886d384e 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -377,6 +377,7 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
 
 #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT      (1ull << 9) /* (41 - 32) */
+#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT      (1ull << 10) /* (42 - 32) */
 
 #define VMX_EPT_DEFAULT_GAW			3
 #define VMX_EPT_MAX_GAW				0x4
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 945265361885..622d83b0caf3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -365,6 +365,11 @@ static inline bool cpu_has_vmx_invvpid_single(void)
 	return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
 }
 
+static inline bool cpu_has_vmx_invvpid_global(void)
+{
+	return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+}
+
 static inline bool cpu_has_vmx_ept(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -513,6 +518,20 @@ static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
 		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
 }
 
+static inline void vpid_sync_vcpu_global(void)
+{
+	if (cpu_has_vmx_invvpid_global())
+		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
+static inline void vpid_sync_context(struct vcpu_vmx *vmx)
+{
+	if (cpu_has_vmx_invvpid_single())
+		vpid_sync_vcpu_all(vmx);
+	else
+		vpid_sync_vcpu_global();
+}
+
 static inline void ept_sync_global(void)
 {
 	if (cpu_has_vmx_invept_global())
@@ -1800,7 +1819,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 {
-	vpid_sync_vcpu_all(to_vmx(vcpu));
+	vpid_sync_context(to_vmx(vcpu));
 	if (enable_ept)
 		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
 }
@@ -2756,7 +2775,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmx_fpu_activate(&vmx->vcpu);
 	update_exception_bitmap(&vmx->vcpu);
 
-	vpid_sync_vcpu_all(vmx);
+	vpid_sync_context(vmx);
 
 	ret = 0;
 
-- 
cgit v1.2.3


From 1760dd4939a62591e492971858fac8cce1e4539e Mon Sep 17 00:00:00 2001
From: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Date: Mon, 7 Jun 2010 10:33:27 +0800
Subject: KVM: VMX: rename vpid_sync_vcpu_all() to vpid_sync_vcpu_single()

The name "pid_sync_vcpu_all" isn't appropriate since it just affect
a single vpid, so rename it to vpid_sync_vcpu_single().

Signed-off-by: Gui Jianfeng <guijianfeng@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 622d83b0caf3..7d7361750bf2 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -509,7 +509,7 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
 	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
 }
 
-static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
+static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
 {
 	if (vmx->vpid == 0)
 		return;
@@ -527,7 +527,7 @@ static inline void vpid_sync_vcpu_global(void)
 static inline void vpid_sync_context(struct vcpu_vmx *vmx)
 {
 	if (cpu_has_vmx_invvpid_single())
-		vpid_sync_vcpu_all(vmx);
+		vpid_sync_vcpu_single(vmx);
 	else
 		vpid_sync_vcpu_global();
 }
-- 
cgit v1.2.3


From 4b9d3a04519fb508ad3b7ce8a7962929b2614185 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 8 Jun 2010 10:15:51 +0800
Subject: KVM: VMX: fix rcu usage warning in init_rmode()

fix:

[ INFO: suspicious rcu_dereference_check() usage. ]
---------------------------------------------------
include/linux/kvm_host.h:258 invoked rcu_dereference_check() without protection!

other info that might help us debug this:

rcu_scheduler_active = 1, debug_locks = 1
1 lock held by qemu-system-x86/3796:
 #0:  (&vcpu->mutex){+.+.+.}, at: [<ffffffffa0217fd8>] vcpu_load+0x1a/0x66 [kvm]

stack backtrace:
Pid: 3796, comm: qemu-system-x86 Not tainted 2.6.34 #25
Call Trace:
 [<ffffffff81070ed1>] lockdep_rcu_dereference+0x9d/0xa5
 [<ffffffffa0214fdf>] gfn_to_memslot_unaliased+0x65/0xa0 [kvm]
 [<ffffffffa0216139>] gfn_to_hva+0x22/0x4c [kvm]
 [<ffffffffa0216217>] kvm_write_guest_page+0x2a/0x7f [kvm]
 [<ffffffffa0216286>] kvm_clear_guest_page+0x1a/0x1c [kvm]
 [<ffffffffa0278239>] init_rmode+0x3b/0x180 [kvm_intel]
 [<ffffffffa02786ce>] vmx_set_cr0+0x350/0x4d3 [kvm_intel]
 [<ffffffffa02274ff>] kvm_arch_vcpu_ioctl_set_sregs+0x122/0x31a [kvm]
 [<ffffffffa021859c>] kvm_vcpu_ioctl+0x578/0xa3d [kvm]
 [<ffffffff8106624c>] ? cpu_clock+0x2d/0x40
 [<ffffffff810f7d86>] ? fget_light+0x244/0x28e
 [<ffffffff810709b9>] ? trace_hardirqs_off_caller+0x1f/0x10e
 [<ffffffff8110501b>] vfs_ioctl+0x32/0xa6
 [<ffffffff81105597>] do_vfs_ioctl+0x47f/0x4b8
 [<ffffffff813ae654>] ? sub_preempt_count+0xa3/0xb7
 [<ffffffff810f7da8>] ? fget_light+0x266/0x28e
 [<ffffffff810f7c53>] ? fget_light+0x111/0x28e
 [<ffffffff81105617>] sys_ioctl+0x47/0x6a
 [<ffffffff81002c1b>] system_call_fastpath+0x16/0x1b

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7d7361750bf2..01b054c98139 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2659,21 +2659,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 static int init_rmode(struct kvm *kvm)
 {
+	int idx, ret = 0;
+
+	idx = srcu_read_lock(&kvm->srcu);
 	if (!init_rmode_tss(kvm))
-		return 0;
+		goto exit;
 	if (!init_rmode_identity_map(kvm))
-		return 0;
-	return 1;
+		goto exit;
+
+	ret = 1;
+exit:
+	srcu_read_unlock(&kvm->srcu, idx);
+	return ret;
 }
 
 static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u64 msr;
-	int ret, idx;
+	int ret;
 
 	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	if (!init_rmode(vmx->vcpu.kvm)) {
 		ret = -ENOMEM;
 		goto out;
@@ -2783,7 +2789,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmx->emulation_required = 0;
 
 out:
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 4f78fd08e91c52f097d64a42d903b76fe52a3a0f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 8 Jun 2010 20:05:05 +0800
Subject: KVM: MMU: remove unnecessary remote tlb flush

This remote tlb flush is no necessary since we have synced while
sp is zapped

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b285449e82b0..098a0b8616b0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3040,8 +3040,6 @@ restart:
 
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 	spin_unlock(&kvm->mmu_lock);
-
-	kvm_flush_remote_tlbs(kvm);
 }
 
 static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
-- 
cgit v1.2.3


From 5304efde6ae27deeeae79b97af709d4ceecc336e Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 8 Jun 2010 20:05:57 +0800
Subject: KVM: MMU: use wrapper function to flush local tlb

Use kvm_mmu_flush_tlb() function instead of calling
kvm_x86_ops->tlb_flush(vcpu) directly.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 098a0b8616b0..e087f855461d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1984,7 +1984,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		      reset_host_protection)) {
 		if (write_fault)
 			*ptwrite = 1;
-		kvm_x86_ops->tlb_flush(vcpu);
+		kvm_mmu_flush_tlb(vcpu);
 	}
 
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
-- 
cgit v1.2.3


From 3b5d13218667b3ca52efa52cec1d322163bf5465 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Tue, 8 Jun 2010 20:07:01 +0800
Subject: KVM: MMU: delay local tlb flush

delay local tlb flush until enter guest moden, it can reduce vpid flush
frequency and reduce remote tlb flush IPI(if KVM_REQ_TLB_FLUSH bit is
already set, IPI is not sent)

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e087f855461d..4706a936e36f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2364,7 +2364,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.tlb_flush;
-	kvm_x86_ops->tlb_flush(vcpu);
+	set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
 }
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 376d41ff2669617a1ef828466ad07a1be99d24d3 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 10 Jun 2010 13:10:47 +0200
Subject: KVM: Fix KVM_SET_SIGNAL_MASK with arg == NULL

When the user passed in a NULL mask pass this on from the ioctl
handler.

Found by gcc 4.6's new warnings.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 78ed71ad6aa1..84a090644d9d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1547,7 +1547,7 @@ out_free2:
 				goto out;
 			p = &sigset;
 		}
-		r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
 		break;
 	}
 	case KVM_GET_FPU: {
-- 
cgit v1.2.3


From a24e809902339458416900869abdcc51a44bfd48 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Thu, 10 Jun 2010 13:10:55 +0200
Subject: KVM: Fix unused but set warnings

No real bugs in this one.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 1 +
 virt/kvm/assigned-dev.c    | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 1f7f5dd8306f..9308be2d5c02 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -444,6 +444,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	kvm_mmu_free_some_pages(vcpu);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn);
+	(void)sptep;
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
 		 sptep, *sptep, write_pt);
 
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index bf9d958d4d5b..7c98928b09d9 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -58,12 +58,10 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev;
-	struct kvm *kvm;
 	int i;
 
 	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
 				    interrupt_work);
-	kvm = assigned_dev->kvm;
 
 	spin_lock_irq(&assigned_dev->assigned_dev_lock);
 	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-- 
cgit v1.2.3


From f495c6e5e8fdc972162241df5bdff5bcebb4dc33 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 10 Jun 2010 17:21:29 +0300
Subject: KVM: VMX: Fix incorrect rcu deref in rmode_tss_base()

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 01b054c98139..26ba61d6af8c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1688,7 +1688,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
 		gfn_t base_gfn;
 
 		slots = kvm_memslots(kvm);
-		base_gfn = kvm->memslots->memslots[0].base_gfn +
+		base_gfn = slots->memslots[0].base_gfn +
 				 kvm->memslots->memslots[0].npages - 3;
 		return base_gfn << PAGE_SHIFT;
 	}
-- 
cgit v1.2.3


From 2acf923e38fb6a4ce0c57115decbb38d334902ac Mon Sep 17 00:00:00 2001
From: Dexuan Cui <dexuan.cui@intel.com>
Date: Thu, 10 Jun 2010 11:27:12 +0800
Subject: KVM: VMX: Enable XSAVE/XRSTOR for guest

This patch enable guest to use XSAVE/XRSTOR instructions.

We assume that host_xcr0 would use all possible bits that OS supported.

And we loaded xcr0 in the same way we handled fpu - do it as late as we can.

Signed-off-by: Dexuan Cui <dexuan.cui@intel.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   2 +
 arch/x86/include/asm/vmx.h      |   1 +
 arch/x86/kvm/kvm_cache_regs.h   |   6 ++
 arch/x86/kvm/vmx.c              |  13 ++++
 arch/x86/kvm/x86.c              | 130 +++++++++++++++++++++++++++++++++++++---
 include/linux/kvm_host.h        |   2 +-
 6 files changed, 146 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0cd0f2923af5..91631b8b2090 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -302,6 +302,7 @@ struct kvm_vcpu_arch {
 	} update_pte;
 
 	struct fpu guest_fpu;
+	u64 xcr0;
 
 	gva_t mmio_fault_cr2;
 	struct kvm_pio_request pio;
@@ -605,6 +606,7 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 96a5886d384e..9f0cbd987d50 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -267,6 +267,7 @@ enum vmcs_field {
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_WBINVD		54
+#define EXIT_REASON_XSETBV		55
 
 /*
  * Interruption-information format
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index d2a98f8f9af5..6491ac8e755b 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -71,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
 	return kvm_read_cr4_bits(vcpu, ~0UL);
 }
 
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+	return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
+		| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
+}
+
 #endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 26ba61d6af8c..864a1b6d155a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -37,6 +37,8 @@
 #include <asm/vmx.h>
 #include <asm/virtext.h>
 #include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
 
 #include "trace.h"
 
@@ -3390,6 +3392,16 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+static int handle_xsetbv(struct kvm_vcpu *vcpu)
+{
+	u64 new_bv = kvm_read_edx_eax(vcpu);
+	u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
+
+	if (kvm_set_xcr(vcpu, index, new_bv) == 0)
+		skip_emulated_instruction(vcpu);
+	return 1;
+}
+
 static int handle_apic_access(struct kvm_vcpu *vcpu)
 {
 	return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
@@ -3668,6 +3680,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
+	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
 	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
 	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
 	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b08c0052e332..b5e644701cc1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -65,6 +65,7 @@
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
+			  | X86_CR4_OSXSAVE \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -150,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
+u64 __read_mostly host_xcr0;
+
+static inline u32 bit(int bitno)
+{
+	return 1 << (bitno & 31);
+}
+
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
 	unsigned slot;
@@ -474,6 +482,61 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+	u64 xcr0;
+
+	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
+	if (index != XCR_XFEATURE_ENABLED_MASK)
+		return 1;
+	xcr0 = xcr;
+	if (kvm_x86_ops->get_cpl(vcpu) != 0)
+		return 1;
+	if (!(xcr0 & XSTATE_FP))
+		return 1;
+	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+		return 1;
+	if (xcr0 & ~host_xcr0)
+		return 1;
+	vcpu->arch.xcr0 = xcr0;
+	vcpu->guest_xcr0_loaded = 0;
+	return 0;
+}
+
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+	if (__kvm_set_xcr(vcpu, index, xcr)) {
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_xcr);
+
+static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
+}
+
+static void update_cpuid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	if (!best)
+		return;
+
+	/* Update OSXSAVE bit */
+	if (cpu_has_xsave && best->function == 0x1) {
+		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+			best->ecx |= bit(X86_FEATURE_OSXSAVE);
+	}
+}
+
 int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
@@ -482,6 +545,9 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	if (cr4 & CR4_RESERVED_BITS)
 		return 1;
 
+	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
+		return 1;
+
 	if (is_long_mode(vcpu)) {
 		if (!(cr4 & X86_CR4_PAE))
 			return 1;
@@ -498,6 +564,9 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	if ((cr4 ^ old_cr4) & pdptr_bits)
 		kvm_mmu_reset_context(vcpu);
 
+	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+		update_cpuid(vcpu);
+
 	return 0;
 }
 
@@ -666,11 +735,6 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 }
 EXPORT_SYMBOL_GPL(kvm_get_dr);
 
-static inline u32 bit(int bitno)
-{
-	return 1 << (bitno & 31);
-}
-
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
@@ -1814,6 +1878,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	r = 0;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
+	update_cpuid(vcpu);
 
 out_free:
 	vfree(cpuid_entries);
@@ -1837,6 +1902,7 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 	vcpu->arch.cpuid_nent = cpuid->nent;
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
+	update_cpuid(vcpu);
 	return 0;
 
 out:
@@ -1917,7 +1983,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
 		0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-		0 /* Reserved, XSAVE, OSXSAVE */;
+		0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */;
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
@@ -1932,7 +1998,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 	switch (function) {
 	case 0:
-		entry->eax = min(entry->eax, (u32)0xb);
+		entry->eax = min(entry->eax, (u32)0xd);
 		break;
 	case 1:
 		entry->edx &= kvm_supported_word0_x86_features;
@@ -1990,6 +2056,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		break;
 	}
+	case 0xd: {
+		int i;
+
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		for (i = 1; *nent < maxnent; ++i) {
+			if (entry[i - 1].eax == 0 && i != 2)
+				break;
+			do_cpuid_1_ent(&entry[i], function, i);
+			entry[i].flags |=
+			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			++*nent;
+		}
+		break;
+	}
 	case KVM_CPUID_SIGNATURE: {
 		char signature[12] = "KVMKVMKVM\0\0";
 		u32 *sigptr = (u32 *)signature;
@@ -4125,6 +4205,9 @@ int kvm_arch_init(void *opaque)
 
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);
 
+	if (cpu_has_xsave)
+		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
 	return 0;
 
 out:
@@ -4523,6 +4606,25 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
+			!vcpu->guest_xcr0_loaded) {
+		/* kvm_set_xcr() also depends on this */
+		xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+		vcpu->guest_xcr0_loaded = 1;
+	}
+}
+
+static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->guest_xcr0_loaded) {
+		if (vcpu->arch.xcr0 != host_xcr0)
+			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+		vcpu->guest_xcr0_loaded = 0;
+	}
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 	int r;
@@ -4568,6 +4670,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->prepare_guest_switch(vcpu);
 	if (vcpu->fpu_active)
 		kvm_load_guest_fpu(vcpu);
+	kvm_load_guest_xcr0(vcpu);
 
 	atomic_set(&vcpu->guest_mode, 1);
 	smp_wmb();
@@ -5124,6 +5227,11 @@ int fx_init(struct kvm_vcpu *vcpu)
 
 	fpu_finit(&vcpu->arch.guest_fpu);
 
+	/*
+	 * Ensure guest xcr0 is valid for loading
+	 */
+	vcpu->arch.xcr0 = XSTATE_FP;
+
 	vcpu->arch.cr0 |= X86_CR0_ET;
 
 	return 0;
@@ -5140,6 +5248,12 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 	if (vcpu->guest_fpu_loaded)
 		return;
 
+	/*
+	 * Restore all possible states in the guest,
+	 * and assume host would use all available bits.
+	 * Guest xcr0 would be loaded later.
+	 */
+	kvm_put_guest_xcr0(vcpu);
 	vcpu->guest_fpu_loaded = 1;
 	unlazy_fpu(current);
 	fpu_restore_checking(&vcpu->arch.guest_fpu);
@@ -5148,6 +5262,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
+	kvm_put_guest_xcr0(vcpu);
+
 	if (!vcpu->guest_fpu_loaded)
 		return;
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2c62319727ef..2d96555cd4ed 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -88,7 +88,7 @@ struct kvm_vcpu {
 	int srcu_idx;
 
 	int fpu_active;
-	int guest_fpu_loaded;
+	int guest_fpu_loaded, guest_xcr0_loaded;
 	wait_queue_head_t wq;
 	int sigset_active;
 	sigset_t sigset;
-- 
cgit v1.2.3


From 49a9b07edcf4aff159c1f3d3a27e58cf38bc27cd Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 10 Jun 2010 17:02:14 +0300
Subject: KVM: Fix mov cr0 #GP at wrong instruction

On Intel, we call skip_emulated_instruction() even if we injected a #GP,
resulting in the #GP pointing at the wrong address.

Fix by injecting the exception and skipping the instruction at the same place,
so we can do just one or the other.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/svm.c              |  2 +-
 arch/x86/kvm/vmx.c              | 13 +++++++++++--
 arch/x86/kvm/x86.c              | 12 +++---------
 4 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 91631b8b2090..b23708450210 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -597,7 +597,7 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		    bool has_error_code, u32 error_code);
 
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 2ae0c3923293..6d1616d47c54 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -807,7 +807,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
 	 */
 	svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-	kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
+	(void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
 
 	save->cr4 = X86_CR4_PAE;
 	/* rdx = ?? */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 864a1b6d155a..1baf4b2d98ee 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3157,11 +3157,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 	hypercall[2] = 0xc1;
 }
 
+static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+	if (err)
+		kvm_inject_gp(vcpu, 0);
+	else
+		skip_emulated_instruction(vcpu);
+}
+
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification, val;
 	int cr;
 	int reg;
+	int err;
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	cr = exit_qualification & 15;
@@ -3172,8 +3181,8 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		trace_kvm_cr_write(cr, val);
 		switch (cr) {
 		case 0:
-			kvm_set_cr0(vcpu, val);
-			skip_emulated_instruction(vcpu);
+			err = kvm_set_cr0(vcpu, val);
+			complete_insn_gp(vcpu, err);
 			return 1;
 		case 3:
 			kvm_set_cr3(vcpu, val);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b5e644701cc1..05e9b5dde646 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -425,7 +425,7 @@ out:
 	return changed;
 }
 
-static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 	unsigned long old_cr0 = kvm_read_cr0(vcpu);
 	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
@@ -468,17 +468,11 @@ static int __kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 		kvm_mmu_reset_context(vcpu);
 	return 0;
 }
-
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
-{
-	if (__kvm_set_cr0(vcpu, cr0))
-		kvm_inject_gp(vcpu, 0);
-}
 EXPORT_SYMBOL_GPL(kvm_set_cr0);
 
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
-	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
@@ -3732,7 +3726,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
 
 	switch (cr) {
 	case 0:
-		res = __kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
 		break;
 	case 2:
 		vcpu->arch.cr2 = val;
-- 
cgit v1.2.3


From a83b29c6ad6d6497e569edbc29e556a384cebddd Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 10 Jun 2010 17:02:15 +0300
Subject: KVM: Fix mov cr4 #GP at wrong instruction

On Intel, we call skip_emulated_instruction() even if we injected a #GP,
resulting in the #GP pointing at the wrong address.

Fix by injecting the exception and skipping the instruction at the same place,
so we can do just one or the other.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/vmx.c              |  4 ++--
 arch/x86/kvm/x86.c              | 10 ++--------
 3 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b23708450210..ea8c319cdffc 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -599,7 +599,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1baf4b2d98ee..f64d65dc38c6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3189,8 +3189,8 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 			skip_emulated_instruction(vcpu);
 			return 1;
 		case 4:
-			kvm_set_cr4(vcpu, val);
-			skip_emulated_instruction(vcpu);
+			err = kvm_set_cr4(vcpu, val);
+			complete_insn_gp(vcpu, err);
 			return 1;
 		case 8: {
 				u8 cr8_prev = kvm_get_cr8(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 05e9b5dde646..ed3af15d4404 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -531,7 +531,7 @@ static void update_cpuid(struct kvm_vcpu *vcpu)
 	}
 }
 
-int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
 	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
@@ -563,12 +563,6 @@ int __kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 
 	return 0;
 }
-
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
-{
-	if (__kvm_set_cr4(vcpu, cr4))
-		kvm_inject_gp(vcpu, 0);
-}
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
 static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
@@ -3735,7 +3729,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
 		res = __kvm_set_cr3(vcpu, val);
 		break;
 	case 4:
-		res = __kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
 		break;
 	case 8:
 		res = __kvm_set_cr8(vcpu, val & 0xfUL);
-- 
cgit v1.2.3


From 2390218b6aa2eb3784b0a82fa811c19097dc793a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 10 Jun 2010 17:02:16 +0300
Subject: KVM: Fix mov cr3 #GP at wrong instruction

On Intel, we call skip_emulated_instruction() even if we injected a #GP,
resulting in the #GP pointing at the wrong address.

Fix by injecting the exception and skipping the instruction at the same place,
so we can do just one or the other.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu.c              |  2 +-
 arch/x86/kvm/svm.c              |  4 ++--
 arch/x86/kvm/vmx.c              |  4 ++--
 arch/x86/kvm/x86.c              | 10 ++--------
 5 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ea8c319cdffc..c2813d658f3e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -598,7 +598,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		    bool has_error_code, u32 error_code);
 
 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4706a936e36f..aa98fca03ed7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3203,7 +3203,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
 
 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
-	kvm_set_cr3(vcpu, vcpu->arch.cr3);
+	(void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
 	return 1;
 }
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6d1616d47c54..f7a6fdcf8ef3 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1963,7 +1963,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 		svm->vmcb->save.cr3 = hsave->save.cr3;
 		svm->vcpu.arch.cr3 = hsave->save.cr3;
 	} else {
-		kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+		(void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
 	}
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
@@ -2086,7 +2086,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 		svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
 		svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
 	} else
-		kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+		(void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
 
 	/* Guest paging mode is active - reset mmu */
 	kvm_mmu_reset_context(&svm->vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f64d65dc38c6..345a35470511 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3185,8 +3185,8 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 			complete_insn_gp(vcpu, err);
 			return 1;
 		case 3:
-			kvm_set_cr3(vcpu, val);
-			skip_emulated_instruction(vcpu);
+			err = kvm_set_cr3(vcpu, val);
+			complete_insn_gp(vcpu, err);
 			return 1;
 		case 4:
 			err = kvm_set_cr4(vcpu, val);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ed3af15d4404..795999e1ac19 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -565,7 +565,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
-static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 		kvm_mmu_sync_roots(vcpu);
@@ -604,12 +604,6 @@ static int __kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	vcpu->arch.mmu.new_cr3(vcpu);
 	return 0;
 }
-
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
-{
-	if (__kvm_set_cr3(vcpu, cr3))
-		kvm_inject_gp(vcpu, 0);
-}
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 
 int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
@@ -3726,7 +3720,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
 		vcpu->arch.cr2 = val;
 		break;
 	case 3:
-		res = __kvm_set_cr3(vcpu, val);
+		res = kvm_set_cr3(vcpu, val);
 		break;
 	case 4:
 		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
-- 
cgit v1.2.3


From 69b61833f7ce6e61e196c8a724e4d1278b24ac02 Mon Sep 17 00:00:00 2001
From: Denis Kirjanov <dkirjanov@hera.kernel.org>
Date: Fri, 11 Jun 2010 11:23:26 +0000
Subject: KVM: PPC: fix build warning in kvm_arch_vcpu_ioctl_run

Fix compile warning:
  CC [M]  arch/powerpc/kvm/powerpc.o
  arch/powerpc/kvm/powerpc.c: In function 'kvm_arch_vcpu_ioctl_run':
  arch/powerpc/kvm/powerpc.c:290: warning: 'gpr' may be used uninitialized in this function
  arch/powerpc/kvm/powerpc.c:290: note: 'gpr' was declared here

Signed-off-by: Denis Kirjanov <dkirjanov@kernel.org>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/powerpc/kvm/powerpc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index a1d87508892e..b5ebdfbed20b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -287,7 +287,7 @@ static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
                                       struct kvm_run *run)
 {
-	u64 gpr;
+	u64 uninitialized_var(gpr);
 
 	if (run->mmio.len > sizeof(gpr)) {
 		printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
-- 
cgit v1.2.3


From 2d5b5a665508c60577c1088e0405850a965b6795 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Sun, 13 Jun 2010 17:29:39 +0800
Subject: KVM: x86: XSAVE/XRSTOR live migration support

This patch enable save/restore of xsave state.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/kvm/api.txt    |  74 +++++++++++++++++++++++
 arch/x86/include/asm/kvm.h   |  22 +++++++
 arch/x86/include/asm/xsave.h |   7 ++-
 arch/x86/kvm/x86.c           | 139 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/kvm.h          |  12 ++++
 5 files changed, 252 insertions(+), 2 deletions(-)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 159b4efe1b0e..ffba03f55bdf 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -922,6 +922,80 @@ Define which vcpu is the Bootstrap Processor (BSP).  Values are the same
 as the vcpu id in KVM_CREATE_VCPU.  If this ioctl is not called, the default
 is vcpu 0.
 
+4.41 KVM_GET_XSAVE
+
+Capability: KVM_CAP_XSAVE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xsave (out)
+Returns: 0 on success, -1 on error
+
+struct kvm_xsave {
+	__u32 region[1024];
+};
+
+This ioctl would copy current vcpu's xsave struct to the userspace.
+
+4.42 KVM_SET_XSAVE
+
+Capability: KVM_CAP_XSAVE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xsave (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_xsave {
+	__u32 region[1024];
+};
+
+This ioctl would copy userspace's xsave struct to the kernel.
+
+4.43 KVM_GET_XCRS
+
+Capability: KVM_CAP_XCRS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xcrs (out)
+Returns: 0 on success, -1 on error
+
+struct kvm_xcr {
+	__u32 xcr;
+	__u32 reserved;
+	__u64 value;
+};
+
+struct kvm_xcrs {
+	__u32 nr_xcrs;
+	__u32 flags;
+	struct kvm_xcr xcrs[KVM_MAX_XCRS];
+	__u64 padding[16];
+};
+
+This ioctl would copy current vcpu's xcrs to the userspace.
+
+4.44 KVM_SET_XCRS
+
+Capability: KVM_CAP_XCRS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xcrs (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_xcr {
+	__u32 xcr;
+	__u32 reserved;
+	__u64 value;
+};
+
+struct kvm_xcrs {
+	__u32 nr_xcrs;
+	__u32 flags;
+	struct kvm_xcr xcrs[KVM_MAX_XCRS];
+	__u64 padding[16];
+};
+
+This ioctl would set vcpu's xcr to the value userspace specified.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index ff90055c7f0b..4d8dcbdfc120 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -22,6 +22,8 @@
 #define __KVM_HAVE_XEN_HVM
 #define __KVM_HAVE_VCPU_EVENTS
 #define __KVM_HAVE_DEBUGREGS
+#define __KVM_HAVE_XSAVE
+#define __KVM_HAVE_XCRS
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -299,4 +301,24 @@ struct kvm_debugregs {
 	__u64 reserved[9];
 };
 
+/* for KVM_CAP_XSAVE */
+struct kvm_xsave {
+	__u32 region[1024];
+};
+
+#define KVM_MAX_XCRS	16
+
+struct kvm_xcr {
+	__u32 xcr;
+	__u32 reserved;
+	__u64 value;
+};
+
+struct kvm_xcrs {
+	__u32 nr_xcrs;
+	__u32 flags;
+	struct kvm_xcr xcrs[KVM_MAX_XCRS];
+	__u64 padding[16];
+};
+
 #endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 29ee4e4c64cf..32c36668fa7b 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -13,8 +13,11 @@
 
 #define FXSAVE_SIZE	512
 
-#define XSTATE_YMM_SIZE 256
-#define XSTATE_YMM_OFFSET (512 + 64)
+#define XSAVE_HDR_SIZE	    64
+#define XSAVE_HDR_OFFSET    FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE	    256
+#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
 
 /*
  * These are the features that the OS can handle currently.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 795999e1ac19..0c8dc9614e7d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1680,6 +1680,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
+	case KVM_CAP_XSAVE:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -1703,6 +1704,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_MCE:
 		r = KVM_MAX_MCE_BANKS;
 		break;
+	case KVM_CAP_XCRS:
+		r = cpu_has_xsave;
+		break;
 	default:
 		r = 0;
 		break;
@@ -2355,6 +2359,77 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+					 struct kvm_xsave *guest_xsave)
+{
+	if (cpu_has_xsave)
+		memcpy(guest_xsave->region,
+			&vcpu->arch.guest_fpu.state->xsave,
+			sizeof(struct xsave_struct));
+	else {
+		memcpy(guest_xsave->region,
+			&vcpu->arch.guest_fpu.state->fxsave,
+			sizeof(struct i387_fxsave_struct));
+		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
+			XSTATE_FPSSE;
+	}
+}
+
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+					struct kvm_xsave *guest_xsave)
+{
+	u64 xstate_bv =
+		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
+
+	if (cpu_has_xsave)
+		memcpy(&vcpu->arch.guest_fpu.state->xsave,
+			guest_xsave->region, sizeof(struct xsave_struct));
+	else {
+		if (xstate_bv & ~XSTATE_FPSSE)
+			return -EINVAL;
+		memcpy(&vcpu->arch.guest_fpu.state->fxsave,
+			guest_xsave->region, sizeof(struct i387_fxsave_struct));
+	}
+	return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+					struct kvm_xcrs *guest_xcrs)
+{
+	if (!cpu_has_xsave) {
+		guest_xcrs->nr_xcrs = 0;
+		return;
+	}
+
+	guest_xcrs->nr_xcrs = 1;
+	guest_xcrs->flags = 0;
+	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+				       struct kvm_xcrs *guest_xcrs)
+{
+	int i, r = 0;
+
+	if (!cpu_has_xsave)
+		return -EINVAL;
+
+	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+		return -EINVAL;
+
+	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+		/* Only support XCR0 currently */
+		if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
+			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+				guest_xcrs->xcrs[0].value);
+			break;
+		}
+	if (r)
+		r = -EINVAL;
+	return r;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -2556,6 +2631,70 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
 		break;
 	}
+	case KVM_GET_XSAVE: {
+		struct kvm_xsave *xsave;
+
+		xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!xsave)
+			break;
+
+		kvm_vcpu_ioctl_x86_get_xsave(vcpu, xsave);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, xsave, sizeof(struct kvm_xsave)))
+			break;
+		r = 0;
+		break;
+	}
+	case KVM_SET_XSAVE: {
+		struct kvm_xsave *xsave;
+
+		xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!xsave)
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(xsave, argp, sizeof(struct kvm_xsave)))
+			break;
+
+		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, xsave);
+		break;
+	}
+	case KVM_GET_XCRS: {
+		struct kvm_xcrs *xcrs;
+
+		xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!xcrs)
+			break;
+
+		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, xcrs);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, xcrs,
+				 sizeof(struct kvm_xcrs)))
+			break;
+		r = 0;
+		break;
+	}
+	case KVM_SET_XCRS: {
+		struct kvm_xcrs *xcrs;
+
+		xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!xcrs)
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(xcrs, argp,
+				   sizeof(struct kvm_xcrs)))
+			break;
+
+		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, xcrs);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 23ea02253900..6fd40f540a8e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -524,6 +524,12 @@ struct kvm_enable_cap {
 #define KVM_CAP_PPC_OSI 52
 #define KVM_CAP_PPC_UNSET_IRQ 53
 #define KVM_CAP_ENABLE_CAP 54
+#ifdef __KVM_HAVE_XSAVE
+#define KVM_CAP_XSAVE 55
+#endif
+#ifdef __KVM_HAVE_XCRS
+#define KVM_CAP_XCRS 56
+#endif
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -714,6 +720,12 @@ struct kvm_clock_data {
 #define KVM_GET_DEBUGREGS         _IOR(KVMIO,  0xa1, struct kvm_debugregs)
 #define KVM_SET_DEBUGREGS         _IOW(KVMIO,  0xa2, struct kvm_debugregs)
 #define KVM_ENABLE_CAP            _IOW(KVMIO,  0xa3, struct kvm_enable_cap)
+/* Available with KVM_CAP_XSAVE */
+#define KVM_GET_XSAVE		  _IOR(KVMIO,  0xa4, struct kvm_xsave)
+#define KVM_SET_XSAVE		  _IOW(KVMIO,  0xa5, struct kvm_xsave)
+/* Available with KVM_CAP_XCRS */
+#define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
+#define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
-- 
cgit v1.2.3


From ac3cd03cca91d481b41e8236aaa41a7f9fafa62f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 11 Jun 2010 21:28:14 +0800
Subject: KVM: MMU: rename 'page' and 'shadow_page' to 'sp'

Rename 'page' and 'shadow_page' to 'sp' to better fit the context

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 9308be2d5c02..e461f2393d85 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -253,7 +253,7 @@ err:
 	return 0;
 }
 
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			      u64 *spte, const void *pte)
 {
 	pt_element_t gpte;
@@ -264,7 +264,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	gpte = *(const pt_element_t *)pte;
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
 		if (!is_present_gpte(gpte)) {
-			if (page->unsync)
+			if (sp->unsync)
 				new_spte = shadow_trap_nonpresent_pte;
 			else
 				new_spte = shadow_notrap_nonpresent_pte;
@@ -273,7 +273,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 		return;
 	}
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
+	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
 		return;
 	pfn = vcpu->arch.update_pte.pfn;
@@ -286,7 +286,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	 * we call mmu_set_spte() with reset_host_protection = true beacuse that
 	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
 	 */
-	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
+	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
 		     gpte_to_gfn(gpte), pfn, true, true);
 }
@@ -300,7 +300,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 int *ptwrite, pfn_t pfn)
 {
 	unsigned access = gw->pt_access;
-	struct kvm_mmu_page *shadow_page;
+	struct kvm_mmu_page *sp;
 	u64 spte, *sptep = NULL;
 	int direct;
 	gfn_t table_gfn;
@@ -341,9 +341,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 				access &= ~ACC_WRITE_MASK;
 			/*
 			 * It is a large guest pages backed by small host pages,
-			 * So we set @direct(@shadow_page->role.direct)=1, and
-			 * set @table_gfn(@shadow_page->gfn)=the base page frame
-			 * for linear translations.
+			 * So we set @direct(@sp->role.direct)=1, and set
+			 * @table_gfn(@sp->gfn)=the base page frame for linear
+			 * translations.
 			 */
 			table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
 			access &= gw->pte_access;
@@ -351,21 +351,21 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			direct = 0;
 			table_gfn = gw->table_gfn[level - 2];
 		}
-		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+		sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
 					       direct, access, sptep);
 		if (!direct) {
 			r = kvm_read_guest_atomic(vcpu->kvm,
 						  gw->pte_gpa[level - 2],
 						  &curr_pte, sizeof(curr_pte));
 			if (r || curr_pte != gw->ptes[level - 2]) {
-				kvm_mmu_put_page(shadow_page, sptep);
+				kvm_mmu_put_page(sp, sptep);
 				kvm_release_pfn_clean(pfn);
 				sptep = NULL;
 				break;
 			}
 		}
 
-		spte = __pa(shadow_page->spt)
+		spte = __pa(sp->spt)
 			| PT_PRESENT_MASK | PT_ACCESSED_MASK
 			| PT_WRITABLE_MASK | PT_USER_MASK;
 		*sptep = spte;
-- 
cgit v1.2.3


From cb83cad2e7e1cdedb2abb9cef2ac076defa679d4 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 11 Jun 2010 21:29:42 +0800
Subject: KVM: MMU: cleanup for dirty page judgment

Using wrap function to cleanup page dirty judgment

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e461f2393d85..efba353369e7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -287,7 +287,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
 	 */
 	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
-		     gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
+		     is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
 		     gpte_to_gfn(gpte), pfn, true, true);
 }
 
@@ -319,7 +319,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			mmu_set_spte(vcpu, sptep, access,
 				     gw->pte_access & access,
 				     user_fault, write_fault,
-				     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
+				     is_dirty_gpte(gw->ptes[gw->level-1]),
 				     ptwrite, level,
 				     gw->gfn, pfn, false, true);
 			break;
-- 
cgit v1.2.3


From f918b443527e98476c8cc45683152106b9e4bedc Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 11 Jun 2010 21:30:36 +0800
Subject: KVM: MMU: avoid double write protected in sync page path

The sync page is already write protected in mmu_sync_children(), don't
write protected it again

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index aa98fca03ed7..ff333572be75 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1216,6 +1216,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 		if ((sp)->gfn != (gfn) || (sp)->role.direct ||		\
 			(sp)->role.invalid) {} else
 
+/* @sp->gfn should be write-protected at the call site */
 static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			   struct list_head *invalid_list, bool clear_unsync)
 {
@@ -1224,11 +1225,8 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		return 1;
 	}
 
-	if (clear_unsync) {
-		if (rmap_write_protect(vcpu->kvm, sp->gfn))
-			kvm_flush_remote_tlbs(vcpu->kvm);
+	if (clear_unsync)
 		kvm_unlink_unsync_page(vcpu->kvm, sp);
-	}
 
 	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
-- 
cgit v1.2.3


From be71e061d15c0aad4f8c2606f76c57b8a19792fd Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 11 Jun 2010 21:31:38 +0800
Subject: KVM: MMU: don't mark pte notrap if it's just sync transient

If the sync-sp just sync transient, don't mark its pte notrap

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu.c              | 11 ++++-------
 arch/x86/kvm/paging_tmpl.h      |  5 +++--
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c2813d658f3e..2ec2e27a403e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -241,7 +241,7 @@ struct kvm_mmu {
 	void (*prefetch_page)(struct kvm_vcpu *vcpu,
 			      struct kvm_mmu_page *page);
 	int (*sync_page)(struct kvm_vcpu *vcpu,
-			 struct kvm_mmu_page *sp);
+			 struct kvm_mmu_page *sp, bool clear_unsync);
 	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
 	hpa_t root_hpa;
 	int root_level;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ff333572be75..d1e09f3c5614 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1103,7 +1103,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 }
 
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
-			       struct kvm_mmu_page *sp)
+			       struct kvm_mmu_page *sp, bool clear_unsync)
 {
 	return 1;
 }
@@ -1228,7 +1228,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	if (clear_unsync)
 		kvm_unlink_unsync_page(vcpu->kvm, sp);
 
-	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+	if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		return 1;
 	}
@@ -1237,7 +1237,6 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	return 0;
 }
 
-static void mmu_convert_notrap(struct kvm_mmu_page *sp);
 static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
 				   struct kvm_mmu_page *sp)
 {
@@ -1245,9 +1244,7 @@ static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
 	int ret;
 
 	ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
-	if (!ret)
-		mmu_convert_notrap(sp);
-	else
+	if (ret)
 		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 
 	return ret;
@@ -1273,7 +1270,7 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 
 		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
 		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
-			(vcpu->arch.mmu.sync_page(vcpu, s))) {
+			(vcpu->arch.mmu.sync_page(vcpu, s, true))) {
 			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
 			continue;
 		}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index efba353369e7..863920f649fb 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -578,7 +578,8 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
  *   can't change unless all sptes pointing to it are nuked first.
  * - Alias changes zap the entire shadow cache.
  */
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			    bool clear_unsync)
 {
 	int i, offset, nr_present;
 	bool reset_host_protection;
@@ -615,7 +616,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 			u64 nonpresent;
 
 			rmap_remove(vcpu->kvm, &sp->spt[i]);
-			if (is_present_gpte(gpte))
+			if (is_present_gpte(gpte) || !clear_unsync)
 				nonpresent = shadow_trap_nonpresent_pte;
 			else
 				nonpresent = shadow_notrap_nonpresent_pte;
-- 
cgit v1.2.3


From ebdea638df04ae6293a9a5414d98ad843c69e82f Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 11 Jun 2010 21:32:34 +0800
Subject: KVM: MMU: cleanup for __mmu_unsync_walk()

Decrease sp->unsync_children after clear unsync_child_bitmap bit

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d1e09f3c5614..41e801b53064 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1160,9 +1160,11 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 					return -ENOSPC;
 
 				ret = __mmu_unsync_walk(child, pvec);
-				if (!ret)
+				if (!ret) {
 					__clear_bit(i, sp->unsync_child_bitmap);
-				else if (ret > 0)
+					sp->unsync_children--;
+					WARN_ON((int)sp->unsync_children < 0);
+				} else if (ret > 0)
 					nr_unsync_leaf += ret;
 				else
 					return ret;
@@ -1176,8 +1178,6 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 		}
 	}
 
-	if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
-		sp->unsync_children = 0;
 
 	return nr_unsync_leaf;
 }
-- 
cgit v1.2.3


From 7a8f1a74e4193d21e55b35928197486f2c047efb Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 11 Jun 2010 21:34:04 +0800
Subject: KVM: MMU: clear unsync_child_bitmap completely

In current code, some page's unsync_child_bitmap is not cleared completely
in mmu_sync_children(), for example, if two PDPEs shard one PDT, one of
PDPE's unsync_child_bitmap is not cleared.

Currently, it not harm anything just little overload, but it's the prepare
work for the later patch

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 53 +++++++++++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 41e801b53064..ab12be4eb105 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1149,33 +1149,38 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 	int i, ret, nr_unsync_leaf = 0;
 
 	for_each_unsync_children(sp->unsync_child_bitmap, i) {
+		struct kvm_mmu_page *child;
 		u64 ent = sp->spt[i];
 
-		if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
-			struct kvm_mmu_page *child;
-			child = page_header(ent & PT64_BASE_ADDR_MASK);
-
-			if (child->unsync_children) {
-				if (mmu_pages_add(pvec, child, i))
-					return -ENOSPC;
-
-				ret = __mmu_unsync_walk(child, pvec);
-				if (!ret) {
-					__clear_bit(i, sp->unsync_child_bitmap);
-					sp->unsync_children--;
-					WARN_ON((int)sp->unsync_children < 0);
-				} else if (ret > 0)
-					nr_unsync_leaf += ret;
-				else
-					return ret;
-			}
+		if (!is_shadow_present_pte(ent) || is_large_pte(ent))
+			goto clear_child_bitmap;
+
+		child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+		if (child->unsync_children) {
+			if (mmu_pages_add(pvec, child, i))
+				return -ENOSPC;
+
+			ret = __mmu_unsync_walk(child, pvec);
+			if (!ret)
+				goto clear_child_bitmap;
+			else if (ret > 0)
+				nr_unsync_leaf += ret;
+			else
+				return ret;
+		} else if (child->unsync) {
+			nr_unsync_leaf++;
+			if (mmu_pages_add(pvec, child, i))
+				return -ENOSPC;
+		} else
+			 goto clear_child_bitmap;
 
-			if (child->unsync) {
-				nr_unsync_leaf++;
-				if (mmu_pages_add(pvec, child, i))
-					return -ENOSPC;
-			}
-		}
+		continue;
+
+clear_child_bitmap:
+		__clear_bit(i, sp->unsync_child_bitmap);
+		sp->unsync_children--;
+		WARN_ON((int)sp->unsync_children < 0);
 	}
 
 
-- 
cgit v1.2.3


From 1047df1fb682a41eb9885d6b3f2d04d6c8fd3756 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 11 Jun 2010 21:35:15 +0800
Subject: KVM: MMU: don't walk every parent pages while mark unsync

While we mark the parent's unsync_child_bitmap, if the parent is already
unsynced, it no need walk it's parent, it can reduce some unnecessary
workload

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 61 +++++++++++++++---------------------------------------
 1 file changed, 17 insertions(+), 44 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ab12be4eb105..8c2f580956d9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -175,7 +175,7 @@ struct kvm_shadow_walk_iterator {
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
 
-typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
+typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
 
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
@@ -1024,7 +1024,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 	BUG();
 }
 
-
 static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
 {
 	struct kvm_pte_chain *pte_chain;
@@ -1034,63 +1033,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
 
 	if (!sp->multimapped && sp->parent_pte) {
 		parent_sp = page_header(__pa(sp->parent_pte));
-		fn(parent_sp);
-		mmu_parent_walk(parent_sp, fn);
+		fn(parent_sp, sp->parent_pte);
 		return;
 	}
+
 	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
 		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-			if (!pte_chain->parent_ptes[i])
+			u64 *spte = pte_chain->parent_ptes[i];
+
+			if (!spte)
 				break;
-			parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
-			fn(parent_sp);
-			mmu_parent_walk(parent_sp, fn);
+			parent_sp = page_header(__pa(spte));
+			fn(parent_sp, spte);
 		}
 }
 
-static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
 {
-	unsigned int index;
-	struct kvm_mmu_page *sp = page_header(__pa(spte));
-
-	index = spte - sp->spt;
-	if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
-		sp->unsync_children++;
-	WARN_ON(!sp->unsync_children);
+	mmu_parent_walk(sp, mark_unsync);
 }
 
-static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
 {
-	struct kvm_pte_chain *pte_chain;
-	struct hlist_node *node;
-	int i;
+	unsigned int index;
 
-	if (!sp->parent_pte)
+	index = spte - sp->spt;
+	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
 		return;
-
-	if (!sp->multimapped) {
-		kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+	if (sp->unsync_children++)
 		return;
-	}
-
-	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-			if (!pte_chain->parent_ptes[i])
-				break;
-			kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
-		}
-}
-
-static int unsync_walk_fn(struct kvm_mmu_page *sp)
-{
-	kvm_mmu_update_parents_unsync(sp);
-	return 1;
-}
-
-static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
-{
-	mmu_parent_walk(sp, unsync_walk_fn);
-	kvm_mmu_update_parents_unsync(sp);
+	kvm_mmu_mark_parents_unsync(sp);
 }
 
 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
-- 
cgit v1.2.3


From bd371396b38ffc4bd6444b0203f33b99d18cedd0 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Mon, 14 Jun 2010 11:42:15 -1000
Subject: KVM: x86: fix -DDEBUG oops

Fix a slight error with assertion in local APIC code.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/lapic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d8258a0060f8..024f6d1c2996 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -329,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		   "dest_mode 0x%x, short_hand 0x%x\n",
 		   target, source, dest, dest_mode, short_hand);
 
-	ASSERT(!target);
+	ASSERT(target);
 	switch (short_hand) {
 	case APIC_DEST_NOSHORT:
 		if (dest_mode == 0)
-- 
cgit v1.2.3


From c37eda138473f8c843f2b4aa8da252fdfdaaafa3 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 15 Jun 2010 09:03:33 +0800
Subject: KVM: x86 emulator: fix pusha instruction emulation

emulate pusha instruction only writeback the last
EDI register, but the other registers which need
to be writeback is ignored. This patch fixed it.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 133 +++++++++++++++++++++++++++----------------------
 1 file changed, 73 insertions(+), 60 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index a4c2dcd10326..c990db0a3a02 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1553,6 +1553,64 @@ exception:
 	return X86EMUL_PROPAGATE_FAULT;
 }
 
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+			    struct x86_emulate_ops *ops)
+{
+	int rc;
+	struct decode_cache *c = &ctxt->decode;
+	u32 err;
+
+	switch (c->dst.type) {
+	case OP_REG:
+		/* The 4-byte case *is* correct:
+		 * in 64-bit mode we zero-extend.
+		 */
+		switch (c->dst.bytes) {
+		case 1:
+			*(u8 *)c->dst.ptr = (u8)c->dst.val;
+			break;
+		case 2:
+			*(u16 *)c->dst.ptr = (u16)c->dst.val;
+			break;
+		case 4:
+			*c->dst.ptr = (u32)c->dst.val;
+			break;	/* 64b: zero-ext */
+		case 8:
+			*c->dst.ptr = c->dst.val;
+			break;
+		}
+		break;
+	case OP_MEM:
+		if (c->lock_prefix)
+			rc = ops->cmpxchg_emulated(
+					(unsigned long)c->dst.ptr,
+					&c->dst.orig_val,
+					&c->dst.val,
+					c->dst.bytes,
+					&err,
+					ctxt->vcpu);
+		else
+			rc = ops->write_emulated(
+					(unsigned long)c->dst.ptr,
+					&c->dst.val,
+					c->dst.bytes,
+					&err,
+					ctxt->vcpu);
+		if (rc == X86EMUL_PROPAGATE_FAULT)
+			emulate_pf(ctxt,
+					      (unsigned long)c->dst.ptr, err);
+		if (rc != X86EMUL_CONTINUE)
+			return rc;
+		break;
+	case OP_NONE:
+		/* no writeback */
+		break;
+	default:
+		break;
+	}
+	return X86EMUL_CONTINUE;
+}
+
 static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
 				struct x86_emulate_ops *ops)
 {
@@ -1651,11 +1709,12 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
-static void emulate_pusha(struct x86_emulate_ctxt *ctxt,
+static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
 			  struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
 	unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RAX;
 
 	while (reg <= VCPU_REGS_RDI) {
@@ -1663,8 +1722,18 @@ static void emulate_pusha(struct x86_emulate_ctxt *ctxt,
 		(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
 
 		emulate_push(ctxt, ops);
+
+		rc = writeback(ctxt, ops);
+		if (rc != X86EMUL_CONTINUE)
+			return rc;
+
 		++reg;
 	}
+
+	/* Disable writeback. */
+	c->dst.type = OP_NONE;
+
+	return rc;
 }
 
 static int emulate_popa(struct x86_emulate_ctxt *ctxt,
@@ -1817,64 +1886,6 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
 	return rc;
 }
 
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
-			    struct x86_emulate_ops *ops)
-{
-	int rc;
-	struct decode_cache *c = &ctxt->decode;
-	u32 err;
-
-	switch (c->dst.type) {
-	case OP_REG:
-		/* The 4-byte case *is* correct:
-		 * in 64-bit mode we zero-extend.
-		 */
-		switch (c->dst.bytes) {
-		case 1:
-			*(u8 *)c->dst.ptr = (u8)c->dst.val;
-			break;
-		case 2:
-			*(u16 *)c->dst.ptr = (u16)c->dst.val;
-			break;
-		case 4:
-			*c->dst.ptr = (u32)c->dst.val;
-			break;	/* 64b: zero-ext */
-		case 8:
-			*c->dst.ptr = c->dst.val;
-			break;
-		}
-		break;
-	case OP_MEM:
-		if (c->lock_prefix)
-			rc = ops->cmpxchg_emulated(
-					(unsigned long)c->dst.ptr,
-					&c->dst.orig_val,
-					&c->dst.val,
-					c->dst.bytes,
-					&err,
-					ctxt->vcpu);
-		else
-			rc = ops->write_emulated(
-					(unsigned long)c->dst.ptr,
-					&c->dst.val,
-					c->dst.bytes,
-					&err,
-					ctxt->vcpu);
-		if (rc == X86EMUL_PROPAGATE_FAULT)
-			emulate_pf(ctxt,
-					      (unsigned long)c->dst.ptr, err);
-		if (rc != X86EMUL_CONTINUE)
-			return rc;
-		break;
-	case OP_NONE:
-		/* no writeback */
-		break;
-	default:
-		break;
-	}
-	return X86EMUL_CONTINUE;
-}
-
 static inline void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 			struct x86_emulate_ops *ops, struct desc_struct *cs,
@@ -2689,7 +2700,9 @@ special_insn:
 			goto done;
 		break;
 	case 0x60:	/* pusha */
-		emulate_pusha(ctxt, ops);
+		rc = emulate_pusha(ctxt, ops);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
 		break;
 	case 0x61:	/* popa */
 		rc = emulate_popa(ctxt, ops);
-- 
cgit v1.2.3


From 33572ac0ad5ba5016da72e6654e607726568f9c0 Mon Sep 17 00:00:00 2001
From: Chris Lalancette <clalance@redhat.com>
Date: Wed, 16 Jun 2010 17:11:11 -0400
Subject: KVM: x86: Introduce a workqueue to deliver PIT timer interrupts

We really want to "kvm_set_irq" during the hrtimer callback,
but that is risky because that is during interrupt context.
Instead, offload the work to a workqueue, which is a bit safer
and should provide most of the same functionality.

Signed-off-by: Chris Lalancette <clalance@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/i8254.c | 141 +++++++++++++++++++++++++++++++--------------------
 arch/x86/kvm/i8254.h |   4 +-
 arch/x86/kvm/irq.c   |   1 -
 3 files changed, 88 insertions(+), 58 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 188d82762c1b..467cc47fb733 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -34,6 +34,7 @@
 
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
+#include <linux/workqueue.h>
 
 #include "irq.h"
 #include "i8254.h"
@@ -244,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
 						 irq_ack_notifier);
-	raw_spin_lock(&ps->inject_lock);
-	if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+	int value;
+
+	spin_lock(&ps->inject_lock);
+	value = atomic_dec_return(&ps->pit_timer.pending);
+	if (value < 0)
+		/* spurious acks can be generated if, for example, the
+		 * PIC is being reset.  Handle it gracefully here
+		 */
 		atomic_inc(&ps->pit_timer.pending);
+	else if (value > 0)
+		/* in this case, we had multiple outstanding pit interrupts
+		 * that we needed to inject.  Reinject
+		 */
+		queue_work(ps->pit->wq, &ps->pit->expired);
 	ps->irq_ack = 1;
-	raw_spin_unlock(&ps->inject_lock);
+	spin_unlock(&ps->inject_lock);
 }
 
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -264,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
-static void destroy_pit_timer(struct kvm_timer *pt)
+static void destroy_pit_timer(struct kvm_pit *pit)
 {
-	pr_debug("execute del timer!\n");
-	hrtimer_cancel(&pt->timer);
+	hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+	cancel_work_sync(&pit->expired);
 }
 
 static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -281,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = {
 	.is_periodic = kpit_is_periodic,
 };
 
+static void pit_do_work(struct work_struct *work)
+{
+	struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
+	struct kvm *kvm = pit->kvm;
+	struct kvm_vcpu *vcpu;
+	int i;
+	struct kvm_kpit_state *ps = &pit->pit_state;
+	int inject = 0;
+
+	/* Try to inject pending interrupts when
+	 * last one has been acked.
+	 */
+	spin_lock(&ps->inject_lock);
+	if (ps->irq_ack) {
+		ps->irq_ack = 0;
+		inject = 1;
+	}
+	spin_unlock(&ps->inject_lock);
+	if (inject) {
+		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
+		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
+
+		/*
+		 * Provides NMI watchdog support via Virtual Wire mode.
+		 * The route is: PIT -> PIC -> LVT0 in NMI mode.
+		 *
+		 * Note: Our Virtual Wire implementation is simplified, only
+		 * propagating PIT interrupts to all VCPUs when they have set
+		 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+		 * VCPU0, and only if its LVT0 is in EXTINT mode.
+		 */
+		if (kvm->arch.vapics_in_nmi_mode > 0)
+			kvm_for_each_vcpu(i, vcpu, kvm)
+				kvm_apic_nmi_wd_deliver(vcpu);
+	}
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+	struct kvm_pit *pt = ktimer->kvm->arch.vpit;
+
+	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
+		atomic_inc(&ktimer->pending);
+		queue_work(pt->wq, &pt->expired);
+	}
+
+	if (ktimer->t_ops->is_periodic(ktimer)) {
+		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+		return HRTIMER_RESTART;
+	} else
+		return HRTIMER_NORESTART;
+}
+
 static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 {
 	struct kvm_timer *pt = &ps->pit_timer;
@@ -292,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 
 	/* TODO The new value only affected after the retriggered */
 	hrtimer_cancel(&pt->timer);
+	cancel_work_sync(&ps->pit->expired);
 	pt->period = interval;
 	ps->is_periodic = is_period;
 
-	pt->timer.function = kvm_timer_fn;
+	pt->timer.function = pit_timer_fn;
 	pt->t_ops = &kpit_ops;
 	pt->kvm = ps->pit->kvm;
-	pt->vcpu = pt->kvm->bsp_vcpu;
 
 	atomic_set(&pt->pending, 0);
 	ps->irq_ack = 1;
@@ -347,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 		}
 		break;
 	default:
-		destroy_pit_timer(&ps->pit_timer);
+		destroy_pit_timer(kvm->arch.vpit);
 	}
 }
 
@@ -626,7 +692,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 
 	mutex_init(&pit->pit_state.lock);
 	mutex_lock(&pit->pit_state.lock);
-	raw_spin_lock_init(&pit->pit_state.inject_lock);
+	spin_lock_init(&pit->pit_state.inject_lock);
+
+	pit->wq = create_singlethread_workqueue("kvm-pit-wq");
+	if (!pit->wq) {
+		kfree(pit);
+		return NULL;
+	}
+	INIT_WORK(&pit->expired, pit_do_work);
 
 	kvm->arch.vpit = pit;
 	pit->kvm = kvm;
@@ -685,54 +758,10 @@ void kvm_free_pit(struct kvm *kvm)
 		mutex_lock(&kvm->arch.vpit->pit_state.lock);
 		timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
 		hrtimer_cancel(timer);
+		cancel_work_sync(&kvm->arch.vpit->expired);
 		kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
 		mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+		destroy_workqueue(kvm->arch.vpit->wq);
 		kfree(kvm->arch.vpit);
 	}
 }
-
-static void __inject_pit_timer_intr(struct kvm *kvm)
-{
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
-	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-
-	/*
-	 * Provides NMI watchdog support via Virtual Wire mode.
-	 * The route is: PIT -> PIC -> LVT0 in NMI mode.
-	 *
-	 * Note: Our Virtual Wire implementation is simplified, only
-	 * propagating PIT interrupts to all VCPUs when they have set
-	 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
-	 * VCPU0, and only if its LVT0 is in EXTINT mode.
-	 */
-	if (kvm->arch.vapics_in_nmi_mode > 0)
-		kvm_for_each_vcpu(i, vcpu, kvm)
-			kvm_apic_nmi_wd_deliver(vcpu);
-}
-
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_kpit_state *ps;
-
-	if (pit) {
-		int inject = 0;
-		ps = &pit->pit_state;
-
-		/* Try to inject pending interrupts when
-		 * last one has been acked.
-		 */
-		raw_spin_lock(&ps->inject_lock);
-		if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
-			ps->irq_ack = 0;
-			inject = 1;
-		}
-		raw_spin_unlock(&ps->inject_lock);
-		if (inject)
-			__inject_pit_timer_intr(kvm);
-	}
-}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 900d6b0ba7c2..46d08ca0b48f 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
 	u32    speaker_data_on;
 	struct mutex lock;
 	struct kvm_pit *pit;
-	raw_spinlock_t inject_lock;
+	spinlock_t inject_lock;
 	unsigned long irq_ack;
 	struct kvm_irq_ack_notifier irq_ack_notifier;
 };
@@ -40,6 +40,8 @@ struct kvm_pit {
 	struct kvm_kpit_state pit_state;
 	int irq_source_id;
 	struct kvm_irq_mask_notifier mask_notifier;
+	struct workqueue_struct *wq;
+	struct work_struct expired;
 };
 
 #define KVM_PIT_BASE_ADDRESS	    0x40
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 0f4e488331c7..2095a049835e 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	kvm_inject_apic_timer_irqs(vcpu);
-	kvm_inject_pit_timer_irqs(vcpu);
 	/* TODO: PIT, RTC etc. */
 }
 EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
-- 
cgit v1.2.3


From e7dca5c0eba63e4ba8e3586c4b37863fd7fadb5a Mon Sep 17 00:00:00 2001
From: Chris Lalancette <clalance@redhat.com>
Date: Wed, 16 Jun 2010 17:11:12 -0400
Subject: KVM: x86: Allow any LAPIC to accept PIC interrupts

If the guest wants to accept timer interrupts on a CPU other
than the BSP, we need to remove this gate.

Signed-off-by: Chris Lalancette <clalance@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/lapic.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 024f6d1c2996..49573c78c24b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1107,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 	u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
 	int r = 0;
 
-	if (kvm_vcpu_is_bsp(vcpu)) {
-		if (!apic_hw_enabled(vcpu->arch.apic))
-			r = 1;
-		if ((lvt0 & APIC_LVT_MASKED) == 0 &&
-		    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
-			r = 1;
-	}
+	if (!apic_hw_enabled(vcpu->arch.apic))
+		r = 1;
+	if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+	    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+		r = 1;
 	return r;
 }
 
-- 
cgit v1.2.3


From aefd18f01ee848448d834c80e601ccff61515811 Mon Sep 17 00:00:00 2001
From: Chris Lalancette <clalance@redhat.com>
Date: Wed, 16 Jun 2010 17:11:13 -0400
Subject: KVM: x86: In DM_LOWEST, only deliver interrupts to vcpus with enabled
 LAPIC's

Otherwise we might try to deliver a timer interrupt to a cpu that
can't possibly handle it.

Signed-off-by: Chris Lalancette <clalance@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/irq_comm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 52f412f16be3..06cf61e729d2 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -100,7 +100,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 			if (r < 0)
 				r = 0;
 			r += kvm_apic_set_irq(vcpu, irq);
-		} else {
+		} else if (kvm_lapic_enabled(vcpu)) {
 			if (!lowest)
 				lowest = vcpu;
 			else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
-- 
cgit v1.2.3


From 21bbe18b2d53e0941cbd1621400ba7b4028972cb Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Thu, 17 Jun 2010 16:49:22 +0800
Subject: KVM: Fix typos in Documentation/kvm/mmu.txt

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/kvm/mmu.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/kvm/mmu.txt b/Documentation/kvm/mmu.txt
index 8cb42b957c73..142cc5136650 100644
--- a/Documentation/kvm/mmu.txt
+++ b/Documentation/kvm/mmu.txt
@@ -77,10 +77,10 @@ Memory
 
 Guest memory (gpa) is part of the user address space of the process that is
 using kvm.  Userspace defines the translation between guest addresses and user
-addresses (gpa->hva); note that two gpas may alias to the same gva, but not
+addresses (gpa->hva); note that two gpas may alias to the same hva, but not
 vice versa.
 
-These gvas may be backed using any method available to the host: anonymous
+These hvas may be backed using any method available to the host: anonymous
 memory, file backed memory, and device memory.  Memory might be paged by the
 host at any time.
 
-- 
cgit v1.2.3


From 6045be5dea7f7905433585e524f884b711e3796a Mon Sep 17 00:00:00 2001
From: Asias He <asias.hejun@gmail.com>
Date: Sat, 19 Jun 2010 16:52:12 +0800
Subject: KVM: PPC: fix uninitialized variable warning in
 kvm_ppc_core_deliver_interrupts

Fixes:
arch/powerpc/kvm/booke.c: In function 'kvmppc_core_deliver_interrupts':
arch/powerpc/kvm/booke.c:147: warning: 'msr_mask' may be used uninitialized in this function

Signed-off-by: Asias He <asias.hejun@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/booke.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index b687f430f8c0..8d4e35f5372c 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -144,7 +144,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
                                         unsigned int priority)
 {
 	int allowed = 0;
-	ulong msr_mask;
+	ulong uninitialized_var(msr_mask);
 	bool update_esr = false, update_dear = false;
 
 	switch (priority) {
-- 
cgit v1.2.3


From 7d5993d63f2bac75b89e171a7098044ec4bc701f Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Thu, 17 Jun 2010 17:33:55 +0800
Subject: KVM: x86 emulator: fix group3 instruction decoding

Group 3 instruction with ModRM reg field as 001 is
defined as test instruction under AMD arch, and
emulate_grp3() is ready for emulate it, so fix the
decoding.

static inline int emulate_grp3(...)
{
	...
	switch (c->modrm_reg) {
	case 0 ... 1:   /* test */
		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
	...
}

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c990db0a3a02..abb8cec420a2 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -336,11 +336,11 @@ static u32 group_table[] = {
 	[Group1A*8] =
 	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
 	[Group3_Byte*8] =
-	ByteOp | SrcImm | DstMem | ModRM, 0,
+	ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
 	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
 	0, 0, 0, 0,
 	[Group3*8] =
-	DstMem | SrcImm | ModRM, 0,
+	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
 	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 	0, 0, 0, 0,
 	[Group4*8] =
-- 
cgit v1.2.3


From a1a005f36e0defea7c5490772c318c6af2261d31 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 20 Jun 2010 15:47:34 +0300
Subject: KVM: Fix xsave and xcr save/restore memory leak

We allocate temporary kernel buffers for these structures, but never free them.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0c8dc9614e7d..d918cb15e5b5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2437,6 +2437,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	void __user *argp = (void __user *)arg;
 	int r;
 	struct kvm_lapic_state *lapic = NULL;
+	struct kvm_xsave *xsave = NULL;
+	struct kvm_xcrs *xcrs = NULL;
 
 	switch (ioctl) {
 	case KVM_GET_LAPIC: {
@@ -2632,8 +2634,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_GET_XSAVE: {
-		struct kvm_xsave *xsave;
-
 		xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
 		r = -ENOMEM;
 		if (!xsave)
@@ -2648,8 +2648,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_XSAVE: {
-		struct kvm_xsave *xsave;
-
 		xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
 		r = -ENOMEM;
 		if (!xsave)
@@ -2663,8 +2661,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_GET_XCRS: {
-		struct kvm_xcrs *xcrs;
-
 		xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
 		r = -ENOMEM;
 		if (!xcrs)
@@ -2680,8 +2676,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_SET_XCRS: {
-		struct kvm_xcrs *xcrs;
-
 		xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
 		r = -ENOMEM;
 		if (!xcrs)
@@ -2700,6 +2694,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	}
 out:
 	kfree(lapic);
+	kfree(xsave);
+	kfree(xcrs);
 	return r;
 }
 
-- 
cgit v1.2.3


From d1ac91d8a2f00dc6a3954f7e8971339b0893edc4 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 20 Jun 2010 15:54:43 +0300
Subject: KVM: Consolidate load/save temporary buffer allocation and freeing

Instead of three temporary variables and three free calls, have one temporary
variable (with four names) and one free call.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 62 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d918cb15e5b5..8e60b6c9c0b0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2436,25 +2436,29 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	int r;
-	struct kvm_lapic_state *lapic = NULL;
-	struct kvm_xsave *xsave = NULL;
-	struct kvm_xcrs *xcrs = NULL;
+	union {
+		struct kvm_lapic_state *lapic;
+		struct kvm_xsave *xsave;
+		struct kvm_xcrs *xcrs;
+		void *buffer;
+	} u;
 
+	u.buffer = NULL;
 	switch (ioctl) {
 	case KVM_GET_LAPIC: {
 		r = -EINVAL;
 		if (!vcpu->arch.apic)
 			goto out;
-		lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+		u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
 		r = -ENOMEM;
-		if (!lapic)
+		if (!u.lapic)
 			goto out;
-		r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
+		r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
 		if (r)
 			goto out;
 		r = -EFAULT;
-		if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
+		if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
 			goto out;
 		r = 0;
 		break;
@@ -2463,14 +2467,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (!vcpu->arch.apic)
 			goto out;
-		lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+		u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 		r = -ENOMEM;
-		if (!lapic)
+		if (!u.lapic)
 			goto out;
 		r = -EFAULT;
-		if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
+		if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
 			goto out;
-		r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
+		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
 		if (r)
 			goto out;
 		r = 0;
@@ -2634,68 +2638,66 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_GET_XSAVE: {
-		xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
 		r = -ENOMEM;
-		if (!xsave)
+		if (!u.xsave)
 			break;
 
-		kvm_vcpu_ioctl_x86_get_xsave(vcpu, xsave);
+		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
 
 		r = -EFAULT;
-		if (copy_to_user(argp, xsave, sizeof(struct kvm_xsave)))
+		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
 			break;
 		r = 0;
 		break;
 	}
 	case KVM_SET_XSAVE: {
-		xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
 		r = -ENOMEM;
-		if (!xsave)
+		if (!u.xsave)
 			break;
 
 		r = -EFAULT;
-		if (copy_from_user(xsave, argp, sizeof(struct kvm_xsave)))
+		if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
 			break;
 
-		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, xsave);
+		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
 		break;
 	}
 	case KVM_GET_XCRS: {
-		xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
 		r = -ENOMEM;
-		if (!xcrs)
+		if (!u.xcrs)
 			break;
 
-		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, xcrs);
+		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
 
 		r = -EFAULT;
-		if (copy_to_user(argp, xcrs,
+		if (copy_to_user(argp, u.xcrs,
 				 sizeof(struct kvm_xcrs)))
 			break;
 		r = 0;
 		break;
 	}
 	case KVM_SET_XCRS: {
-		xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
 		r = -ENOMEM;
-		if (!xcrs)
+		if (!u.xcrs)
 			break;
 
 		r = -EFAULT;
-		if (copy_from_user(xcrs, argp,
+		if (copy_from_user(u.xcrs, argp,
 				   sizeof(struct kvm_xcrs)))
 			break;
 
-		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, xcrs);
+		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
 		break;
 	}
 	default:
 		r = -EINVAL;
 	}
 out:
-	kfree(lapic);
-	kfree(xsave);
-	kfree(xcrs);
+	kfree(u.buffer);
 	return r;
 }
 
-- 
cgit v1.2.3


From 971eb77f87bfd352a56a24651922d0383db76219 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Sat, 12 Jun 2010 08:54:13 +0200
Subject: KVM: s390: Fix build failure due to centralized vcpu locking patches

This patch fixes
ERROR: "__kvm_s390_vcpu_store_status" [arch/s390/kvm/kvm.ko] undefined!

triggered by
commit 3268c56840dcee78c3e928336550f4e1861504c4 (kvm.git)
Author: Avi Kivity <avi@redhat.com>
Date:   Thu May 13 12:21:46 2010 +0300
    KVM: s390: Centrally lock arch specific vcpu ioctls

Reported-by: Sachin Sant <sachinp@in.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/s390/kvm/intercept.c | 2 +-
 arch/s390/kvm/kvm-s390.c  | 2 +-
 arch/s390/kvm/kvm-s390.h  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 3ddc30895e31..f7b6df45d8be 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -135,7 +135,7 @@ static int handle_stop(struct kvm_vcpu *vcpu)
 	spin_lock_bh(&vcpu->arch.local_int.lock);
 	if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) {
 		vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP;
-		rc = __kvm_s390_vcpu_store_status(vcpu,
+		rc = kvm_s390_vcpu_store_status(vcpu,
 						  KVM_S390_STORE_STATUS_NOADDR);
 		if (rc >= 0)
 			rc = -EOPNOTSUPP;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 0cb0da7822d5..0079b8c2a9a6 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -569,7 +569,7 @@ static int __guestcopy(struct kvm_vcpu *vcpu, u64 guestdest, const void *from,
  * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit
  * KVM_S390_STORE_STATUS_PREFIXED: -> prefix
  */
-static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
+int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
 {
 	const unsigned char archmode = 1;
 	int prefix;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index cfa9d1777457..a7b7586626db 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -92,7 +92,7 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
 
 /* implemented in kvm-s390.c */
-int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu,
+int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu,
 				 unsigned long addr);
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
-- 
cgit v1.2.3


From fc34531db3cf8c422f2ff7cf4ef507a3ca672cd2 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 17 Jun 2010 23:16:20 +0200
Subject: KVM: s390: Don't exit SIE on SIGP sense running

Newer (guest) kernels use sigp sense running in their spinlock
implementation to check if the other cpu is running before yielding
the processor. This revealed some wrong guest settings, causing
unnecessary exits for every sigp sense running.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/s390/include/asm/kvm_host.h | 2 +-
 arch/s390/kvm/kvm-s390.c         | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 27605b62b980..b95710a1f5dd 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -26,7 +26,7 @@
 
 struct sca_entry {
 	atomic_t scn;
-	__u64	reserved;
+	__u32	reserved;
 	__u64	sda;
 	__u64	reserved2[2];
 } __attribute__((packed));
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 0079b8c2a9a6..08a3b35d30be 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -207,6 +207,7 @@ out_nokvm:
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
+	clear_bit(63 - vcpu->vcpu_id, (unsigned long *) &vcpu->kvm->arch.sca->mcn);
 	if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda ==
 		(__u64) vcpu->arch.sie_block)
 		vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
@@ -296,7 +297,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH);
 	set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests);
-	vcpu->arch.sie_block->ecb   = 2;
+	vcpu->arch.sie_block->ecb   = 6;
 	vcpu->arch.sie_block->eca   = 0xC1002001U;
 	vcpu->arch.sie_block->fac   = (int) (long) facilities;
 	hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
@@ -329,6 +330,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 		kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
 	vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
 	vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
+	set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn);
 
 	spin_lock_init(&vcpu->arch.local_int.lock);
 	INIT_LIST_HEAD(&vcpu->arch.local_int.list);
-- 
cgit v1.2.3


From a1f4d39500ad8ed61825eff061debff42386ab5b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 21 Jun 2010 11:44:20 +0300
Subject: KVM: Remove memory alias support

As advertised in feature-removal-schedule.txt.  Equivalent support is provided
by overlapping memory regions.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/feature-removal-schedule.txt |  11 ---
 Documentation/kvm/api.txt                  |  12 +--
 arch/ia64/kvm/kvm-ia64.c                   |   5 --
 arch/powerpc/kvm/powerpc.c                 |   5 --
 arch/s390/kvm/kvm-s390.c                   |   5 --
 arch/x86/include/asm/kvm_host.h            |  21 -----
 arch/x86/kvm/mmu.c                         |  17 +---
 arch/x86/kvm/paging_tmpl.h                 |   3 +-
 arch/x86/kvm/x86.c                         | 125 -----------------------------
 arch/x86/kvm/x86.h                         |   7 --
 include/linux/kvm.h                        |   1 +
 include/linux/kvm_host.h                   |   6 --
 virt/kvm/kvm_main.c                        |  18 +----
 13 files changed, 11 insertions(+), 225 deletions(-)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 1571c0c83dba..ad1e90dd2780 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -538,17 +538,6 @@ Who:	Jan Kiszka <jan.kiszka@web.de>
 
 ----------------------------
 
-What:	KVM memory aliases support
-When:	July 2010
-Why:	Memory aliasing support is used for speeding up guest vga access
-	through the vga windows.
-
-	Modern userspace no longer uses this feature, so it's just bitrotted
-	code and can be removed with no impact.
-Who:	Avi Kivity <avi@redhat.com>
-
-----------------------------
-
 What:	xtime, wall_to_monotonic
 When:	2.6.36+
 Files:	kernel/time/timekeeping.c include/linux/time.h
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index ffba03f55bdf..7e415943a11e 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -226,17 +226,7 @@ Type: vm ioctl
 Parameters: struct kvm_memory_alias (in)
 Returns: 0 (success), -1 (error)
 
-struct kvm_memory_alias {
-	__u32 slot;  /* this has a different namespace than memory slots */
-	__u32 flags;
-	__u64 guest_phys_addr;
-	__u64 memory_size;
-	__u64 target_phys_addr;
-};
-
-Defines a guest physical address space region as an alias to another
-region.  Useful for aliased address, for example the VGA low memory
-window. Should not be used with userspace memory.
+This ioctl is obsolete and has been removed.
 
 4.9 KVM_RUN
 
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 91760e80e268..bd510beb43af 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1946,11 +1946,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 	return vcpu->arch.timer_fired;
 }
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn;
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) ||
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index b5ebdfbed20b..72a4ad86ee91 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -36,11 +36,6 @@
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn;
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
 	return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 08a3b35d30be..4fe68650535c 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -723,11 +723,6 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 {
 }
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn;
-}
-
 static int __init kvm_s390_init(void)
 {
 	int ret;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2ec2e27a403e..a57cdeacc4d2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -69,8 +69,6 @@
 
 #define IOPL_SHIFT 12
 
-#define KVM_ALIAS_SLOTS 4
-
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_MIN_ALLOC_MMU_PAGES 64
 #define KVM_MMU_HASH_SHIFT 10
@@ -362,24 +360,7 @@ struct kvm_vcpu_arch {
 	u64 hv_vapic;
 };
 
-struct kvm_mem_alias {
-	gfn_t base_gfn;
-	unsigned long npages;
-	gfn_t target_gfn;
-#define KVM_ALIAS_INVALID     1UL
-	unsigned long flags;
-};
-
-#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
-
-struct kvm_mem_aliases {
-	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
-	int naliases;
-};
-
 struct kvm_arch {
-	struct kvm_mem_aliases *aliases;
-
 	unsigned int n_free_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_alloc_mmu_pages;
@@ -655,8 +636,6 @@ void kvm_disable_tdp(void);
 int complete_pio(struct kvm_vcpu *vcpu);
 bool kvm_check_iopl(struct kvm_vcpu *vcpu);
 
-struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
-
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
 	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8c2f580956d9..c5501bc10106 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -434,9 +434,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 	int *write_count;
 	int i;
 
-	gfn = unalias_gfn(kvm, gfn);
-
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		write_count   = slot_largepage_idx(gfn, slot, i);
@@ -450,8 +448,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 	int *write_count;
 	int i;
 
-	gfn = unalias_gfn(kvm, gfn);
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		write_count   = slot_largepage_idx(gfn, slot, i);
@@ -467,8 +464,7 @@ static int has_wrprotected_page(struct kvm *kvm,
 	struct kvm_memory_slot *slot;
 	int *largepage_idx;
 
-	gfn = unalias_gfn(kvm, gfn);
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	if (slot) {
 		largepage_idx = slot_largepage_idx(gfn, slot, level);
 		return *largepage_idx;
@@ -521,7 +517,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 
 /*
  * Take gfn and return the reverse mapping to it.
- * Note: gfn must be unaliased before this function get called
  */
 
 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -561,7 +556,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
 	if (!is_rmap_spte(*spte))
 		return count;
-	gfn = unalias_gfn(vcpu->kvm, gfn);
 	sp = page_header(__pa(spte));
 	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
@@ -698,7 +692,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	u64 *spte;
 	int i, write_protected = 0;
 
-	gfn = unalias_gfn(kvm, gfn);
 	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
 
 	spte = rmap_next(kvm, rmapp, NULL);
@@ -885,7 +878,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
 	sp = page_header(__pa(spte));
 
-	gfn = unalias_gfn(vcpu->kvm, gfn);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
 	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -3510,8 +3502,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 		if (sp->unsync)
 			continue;
 
-		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
-		slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
+		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
 		rmapp = &slot->rmap[gfn - slot->base_gfn];
 
 		spte = rmap_next(vcpu->kvm, rmapp, NULL);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 863920f649fb..a21a86ef9e20 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -576,7 +576,6 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
  * Using the cached information from sp->gfns is safe because:
  * - The spte has a reference to the struct page, so the pfn for a given gfn
  *   can't change unless all sptes pointing to it are nuked first.
- * - Alias changes zap the entire shadow cache.
  */
 static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			    bool clear_unsync)
@@ -611,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			return -EINVAL;
 
 		gfn = gpte_to_gfn(gpte);
-		if (unalias_gfn(vcpu->kvm, gfn) != sp->gfns[i] ||
+		if (gfn != sp->gfns[i] ||
 		      !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
 			u64 nonpresent;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8e60b6c9c0b0..62596d373a49 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2740,115 +2740,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
 	return kvm->arch.n_alloc_mmu_pages;
 }
 
-gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
-{
-	int i;
-	struct kvm_mem_alias *alias;
-	struct kvm_mem_aliases *aliases;
-
-	aliases = kvm_aliases(kvm);
-
-	for (i = 0; i < aliases->naliases; ++i) {
-		alias = &aliases->aliases[i];
-		if (alias->flags & KVM_ALIAS_INVALID)
-			continue;
-		if (gfn >= alias->base_gfn
-		    && gfn < alias->base_gfn + alias->npages)
-			return alias->target_gfn + gfn - alias->base_gfn;
-	}
-	return gfn;
-}
-
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	int i;
-	struct kvm_mem_alias *alias;
-	struct kvm_mem_aliases *aliases;
-
-	aliases = kvm_aliases(kvm);
-
-	for (i = 0; i < aliases->naliases; ++i) {
-		alias = &aliases->aliases[i];
-		if (gfn >= alias->base_gfn
-		    && gfn < alias->base_gfn + alias->npages)
-			return alias->target_gfn + gfn - alias->base_gfn;
-	}
-	return gfn;
-}
-
-/*
- * Set a new alias region.  Aliases map a portion of physical memory into
- * another portion.  This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
-					 struct kvm_memory_alias *alias)
-{
-	int r, n;
-	struct kvm_mem_alias *p;
-	struct kvm_mem_aliases *aliases, *old_aliases;
-
-	r = -EINVAL;
-	/* General sanity checks */
-	if (alias->memory_size & (PAGE_SIZE - 1))
-		goto out;
-	if (alias->guest_phys_addr & (PAGE_SIZE - 1))
-		goto out;
-	if (alias->slot >= KVM_ALIAS_SLOTS)
-		goto out;
-	if (alias->guest_phys_addr + alias->memory_size
-	    < alias->guest_phys_addr)
-		goto out;
-	if (alias->target_phys_addr + alias->memory_size
-	    < alias->target_phys_addr)
-		goto out;
-
-	r = -ENOMEM;
-	aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-	if (!aliases)
-		goto out;
-
-	mutex_lock(&kvm->slots_lock);
-
-	/* invalidate any gfn reference in case of deletion/shrinking */
-	memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-	aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
-	old_aliases = kvm->arch.aliases;
-	rcu_assign_pointer(kvm->arch.aliases, aliases);
-	synchronize_srcu_expedited(&kvm->srcu);
-	kvm_mmu_zap_all(kvm);
-	kfree(old_aliases);
-
-	r = -ENOMEM;
-	aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-	if (!aliases)
-		goto out_unlock;
-
-	memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-
-	p = &aliases->aliases[alias->slot];
-	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
-	p->npages = alias->memory_size >> PAGE_SHIFT;
-	p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
-	p->flags &= ~(KVM_ALIAS_INVALID);
-
-	for (n = KVM_ALIAS_SLOTS; n > 0; --n)
-		if (aliases->aliases[n - 1].npages)
-			break;
-	aliases->naliases = n;
-
-	old_aliases = kvm->arch.aliases;
-	rcu_assign_pointer(kvm->arch.aliases, aliases);
-	synchronize_srcu_expedited(&kvm->srcu);
-	kfree(old_aliases);
-	r = 0;
-
-out_unlock:
-	mutex_unlock(&kvm->slots_lock);
-out:
-	return r;
-}
-
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 {
 	int r;
@@ -3056,7 +2947,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	union {
 		struct kvm_pit_state ps;
 		struct kvm_pit_state2 ps2;
-		struct kvm_memory_alias alias;
 		struct kvm_pit_config pit_config;
 	} u;
 
@@ -3101,14 +2991,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_GET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
 		break;
-	case KVM_SET_MEMORY_ALIAS:
-		r = -EFAULT;
-		if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
-			goto out;
-		r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
-		if (r)
-			goto out;
-		break;
 	case KVM_CREATE_IRQCHIP: {
 		struct kvm_pic *vpic;
 
@@ -5559,12 +5441,6 @@ struct  kvm *kvm_arch_create_vm(void)
 	if (!kvm)
 		return ERR_PTR(-ENOMEM);
 
-	kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-	if (!kvm->arch.aliases) {
-		kfree(kvm);
-		return ERR_PTR(-ENOMEM);
-	}
-
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
@@ -5622,7 +5498,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	if (kvm->arch.ept_identity_pagetable)
 		put_page(kvm->arch.ept_identity_pagetable);
 	cleanup_srcu_struct(&kvm->srcu);
-	kfree(kvm->arch.aliases);
 	kfree(kvm);
 }
 
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index f4b54458285b..b7a404722d2b 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
 }
 
-static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
-{
-	return rcu_dereference_check(kvm->arch.aliases,
-			srcu_read_lock_held(&kvm->srcu)
-			|| lockdep_is_held(&kvm->slots_lock));
-}
-
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 6fd40f540a8e..636fc381c897 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -619,6 +619,7 @@ struct kvm_clock_data {
  */
 #define KVM_CREATE_VCPU           _IO(KVMIO,   0x41)
 #define KVM_GET_DIRTY_LOG         _IOW(KVMIO,  0x42, struct kvm_dirty_log)
+/* KVM_SET_MEMORY_ALIAS is obsolete: */
 #define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO,  0x43, struct kvm_memory_alias)
 #define KVM_SET_NR_MMU_PAGES      _IO(KVMIO,   0x44)
 #define KVM_GET_NR_MMU_PAGES      _IO(KVMIO,   0x45)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2d96555cd4ed..240e460777bc 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -286,8 +286,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 				int user_alloc);
 void kvm_disable_largepages(void);
 void kvm_arch_flush_shadow(struct kvm *kvm);
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
-gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn);
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
@@ -564,10 +562,6 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
 }
 #endif
 
-#ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION
-#define unalias_gfn_instantiation unalias_gfn
-#endif
-
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 
 #define KVM_MAX_IRQ_ROUTES 1024
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 84a090644d9d..65417e3d8462 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -841,7 +841,7 @@ int kvm_is_error_hva(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 
-struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
 	struct kvm_memslots *slots = kvm_memslots(kvm);
@@ -855,20 +855,13 @@ struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
 	}
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);
-
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
-{
-	gfn = unalias_gfn(kvm, gfn);
-	return gfn_to_memslot_unaliased(kvm, gfn);
-}
+EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 
-	gfn = unalias_gfn_instantiation(kvm, gfn);
 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
 
@@ -913,7 +906,6 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	struct kvm_memory_slot *memslot = NULL;
 
-	gfn = unalias_gfn(kvm, gfn);
 	for (i = 0; i < slots->nmemslots; ++i) {
 		memslot = &slots->memslots[i];
 
@@ -934,8 +926,7 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *slot;
 
-	gfn = unalias_gfn_instantiation(kvm, gfn);
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return bad_hva();
 	return gfn_to_hva_memslot(slot, gfn);
@@ -1202,8 +1193,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 {
 	struct kvm_memory_slot *memslot;
 
-	gfn = unalias_gfn(kvm, gfn);
-	memslot = gfn_to_memslot_unaliased(kvm, gfn);
+	memslot = gfn_to_memslot(kvm, gfn);
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 
-- 
cgit v1.2.3


From b74a07beed0e64bfba413dcb70dd6749c57f43dc Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 21 Jun 2010 11:48:05 +0300
Subject: KVM: Remove kernel-allocated memory regions

Equivalent (and better) functionality is provided by user-allocated memory
regions.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 Documentation/feature-removal-schedule.txt | 10 ----------
 Documentation/kvm/api.txt                  | 24 +-----------------------
 arch/x86/kvm/x86.c                         | 16 ----------------
 3 files changed, 1 insertion(+), 49 deletions(-)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index ad1e90dd2780..123ec0e23738 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -548,16 +548,6 @@ Who:	John Stultz <johnstul@us.ibm.com>
 
 ----------------------------
 
-What:	KVM kernel-allocated memory slots
-When:	July 2010
-Why:	Since 2.6.25, kvm supports user-allocated memory slots, which are
-	much more flexible than kernel-allocated slots.  All current userspace
-	supports the newer interface and this code can be removed with no
-	impact.
-Who:	Avi Kivity <avi@redhat.com>
-
-----------------------------
-
 What:	KVM paravirt mmu host support
 When:	January 2011
 Why:	The paravirt mmu host support is slower than non-paravirt mmu, both
diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 7e415943a11e..d9b00f15fbe6 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -160,29 +160,7 @@ Type: vm ioctl
 Parameters: struct kvm_memory_region (in)
 Returns: 0 on success, -1 on error
 
-struct kvm_memory_region {
-	__u32 slot;
-	__u32 flags;
-	__u64 guest_phys_addr;
-	__u64 memory_size; /* bytes */
-};
-
-/* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES  1UL
-
-This ioctl allows the user to create or modify a guest physical memory
-slot.  When changing an existing slot, it may be moved in the guest
-physical memory space, or its flags may be modified.  It may not be
-resized.  Slots may not overlap.
-
-The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
-instructs kvm to keep track of writes to memory within the slot.  See
-the KVM_GET_DIRTY_LOG ioctl.
-
-It is recommended to use the KVM_SET_USER_MEMORY_REGION ioctl instead
-of this API, if available.  This newer API allows placing guest memory
-at specified locations in the host address space, yielding better
-control and easy access.
+This ioctl is obsolete and has been removed.
 
 4.6 KVM_CREATE_VCPU
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 62596d373a49..9be6e4e5e8ee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2967,22 +2967,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 		break;
 	}
-	case KVM_SET_MEMORY_REGION: {
-		struct kvm_memory_region kvm_mem;
-		struct kvm_userspace_memory_region kvm_userspace_mem;
-
-		r = -EFAULT;
-		if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
-			goto out;
-		kvm_userspace_mem.slot = kvm_mem.slot;
-		kvm_userspace_mem.flags = kvm_mem.flags;
-		kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
-		kvm_userspace_mem.memory_size = kvm_mem.memory_size;
-		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
-		if (r)
-			goto out;
-		break;
-	}
 	case KVM_SET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
 		if (r)
-- 
cgit v1.2.3


From 073d46133ab0b42154f6b8429f4f66dbe2760bda Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 3 May 2010 17:34:34 +0300
Subject: KVM: i8259: reduce excessive abstraction for pic_irq_request()

Part of the i8259 code pretends it isn't part of kvm, but we know better.
Reduce excessive abstraction, eliminating callbacks and void pointers.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8259.c | 17 +++++++----------
 arch/x86/kvm/irq.h   |  4 ----
 2 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 2c73f4493148..caf6e1b3d957 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -34,6 +34,8 @@
 #include <linux/kvm_host.h>
 #include "trace.h"
 
+static void pic_irq_request(struct kvm *kvm, int level);
+
 static void pic_lock(struct kvm_pic *s)
 	__acquires(&s->lock)
 {
@@ -175,9 +177,9 @@ static void pic_update_irq(struct kvm_pic *s)
 	}
 	irq = pic_get_irq(&s->pics[0]);
 	if (irq >= 0)
-		s->irq_request(s->irq_request_opaque, 1);
+		pic_irq_request(s->kvm, 1);
 	else
-		s->irq_request(s->irq_request_opaque, 0);
+		pic_irq_request(s->kvm, 0);
 }
 
 void kvm_pic_update_irq(struct kvm_pic *s)
@@ -262,8 +264,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
 	int irq;
-	struct kvm *kvm = s->pics_state->irq_request_opaque;
-	struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
+	struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
 	u8 irr = s->irr, isr = s->imr;
 
 	s->last_irr = 0;
@@ -302,8 +303,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 			/*
 			 * deassert a pending interrupt
 			 */
-			s->pics_state->irq_request(s->pics_state->
-						   irq_request_opaque, 0);
+			pic_irq_request(s->pics_state->kvm, 0);
 			s->init_state = 1;
 			s->init4 = val & 1;
 			if (val & 0x02)
@@ -519,9 +519,8 @@ static int picdev_read(struct kvm_io_device *this,
 /*
  * callback when PIC0 irq status changed
  */
-static void pic_irq_request(void *opaque, int level)
+static void pic_irq_request(struct kvm *kvm, int level)
 {
-	struct kvm *kvm = opaque;
 	struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
 	struct kvm_pic *s = pic_irqchip(kvm);
 	int irq = pic_get_irq(&s->pics[0]);
@@ -550,8 +549,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 	s->kvm = kvm;
 	s->pics[0].elcr_mask = 0xf8;
 	s->pics[1].elcr_mask = 0xde;
-	s->irq_request = pic_irq_request;
-	s->irq_request_opaque = kvm;
 	s->pics[0].pics_state = s;
 	s->pics[1].pics_state = s;
 
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index cd1f362f413d..ffed06871c5c 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -38,8 +38,6 @@
 struct kvm;
 struct kvm_vcpu;
 
-typedef void irq_request_func(void *opaque, int level);
-
 struct kvm_kpic_state {
 	u8 last_irr;	/* edge detection */
 	u8 irr;		/* interrupt request register */
@@ -67,8 +65,6 @@ struct kvm_pic {
 	unsigned pending_acks;
 	struct kvm *kvm;
 	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
-	irq_request_func *irq_request;
-	void *irq_request_opaque;
 	int output;		/* intr from master PIC */
 	struct kvm_io_device dev;
 	void (*ack_notifier)(void *opaque, int irq);
-- 
cgit v1.2.3


From 36633f32ba4c238403d19584754b30fe469d6dcb Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 3 May 2010 17:38:06 +0300
Subject: KVM: i8259: simplify pic_irq_request() calling sequence

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8259.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index caf6e1b3d957..bc10f0bd3819 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -176,10 +176,7 @@ static void pic_update_irq(struct kvm_pic *s)
 		pic_set_irq1(&s->pics[0], 2, 0);
 	}
 	irq = pic_get_irq(&s->pics[0]);
-	if (irq >= 0)
-		pic_irq_request(s->kvm, 1);
-	else
-		pic_irq_request(s->kvm, 0);
+	pic_irq_request(s->kvm, irq >= 0);
 }
 
 void kvm_pic_update_irq(struct kvm_pic *s)
-- 
cgit v1.2.3


From a8eeb04a44dd6dc4c8158953d9bae48849c9a188 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 10 May 2010 12:34:53 +0300
Subject: KVM: Add mini-API for vcpu->requests

Makes it a little more readable and hackable.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/lapic.c     |  2 +-
 arch/x86/kvm/mmu.c       |  6 +++---
 arch/x86/kvm/svm.c       |  2 +-
 arch/x86/kvm/timer.c     |  2 +-
 arch/x86/kvm/vmx.c       |  2 +-
 arch/x86/kvm/x86.c       | 27 +++++++++++++--------------
 include/linux/kvm_host.h | 15 +++++++++++++++
 virt/kvm/kvm_main.c      |  4 ++--
 8 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 49573c78c24b..77d8c0f4817d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -534,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
 	struct kvm_vcpu *vcpu = apic->vcpu;
 	struct kvm_run *run = vcpu->run;
 
-	set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+	kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
 	run->tpr_access.rip = kvm_rip_read(vcpu);
 	run->tpr_access.is_write = write;
 }
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c5501bc10106..690a7fc58c17 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1378,7 +1378,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 
 		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
 		if (sp->unsync_children) {
-			set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
+			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
 			kvm_mmu_mark_parents_unsync(sp);
 		} else if (sp->unsync)
 			kvm_mmu_mark_parents_unsync(sp);
@@ -2131,7 +2131,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
 	int ret = 0;
 
 	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
-		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		ret = 1;
 	}
 
@@ -2329,7 +2329,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.tlb_flush;
-	set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
+	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f7a6fdcf8ef3..587b99d37d44 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1494,7 +1494,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
 		 */
 		pr_err("KVM: Guest triggered AMD Erratum 383\n");
 
-		set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
 
 		return;
 	}
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index 564548fbb3d6..e16a0dbe74d8 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -32,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
 	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
 		atomic_inc(&ktimer->pending);
 		/* FIXME: this code should not know anything about vcpus */
-		set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
 	}
 
 	if (waitqueue_active(q))
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 345a35470511..661c6e199b4a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -899,7 +899,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		unsigned long sysenter_esp;
 
 		kvm_migrate_timers(vcpu);
-		set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
+		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 		local_irq_disable();
 		list_add(&vmx->local_vcpus_link,
 			 &per_cpu(vcpus_on_cpu, cpu));
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9be6e4e5e8ee..7ef44107a14a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -296,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 	prev_nr = vcpu->arch.exception.nr;
 	if (prev_nr == DF_VECTOR) {
 		/* triple fault -> shutdown */
-		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		return;
 	}
 	class1 = exception_class(prev_nr);
@@ -948,7 +948,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)
 
 	if (!vcpu->time_page)
 		return 0;
-	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+	kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
 	return 1;
 }
 
@@ -2253,7 +2253,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 			printk(KERN_DEBUG "kvm: set_mce: "
 			       "injects mce exception while "
 			       "previous one is in progress!\n");
-			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 			return 0;
 		}
 		if (banks[1] & MCI_STATUS_VAL)
@@ -4617,7 +4617,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		vcpu->run->request_interrupt_window;
 
 	if (vcpu->requests)
-		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
 			kvm_mmu_unload(vcpu);
 
 	r = kvm_mmu_reload(vcpu);
@@ -4625,26 +4625,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		goto out;
 
 	if (vcpu->requests) {
-		if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
-		if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
 			kvm_write_guest_time(vcpu);
-		if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
 			kvm_mmu_sync_roots(vcpu);
-		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
 			kvm_x86_ops->tlb_flush(vcpu);
-		if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
-				       &vcpu->requests)) {
+		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
 			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
 			r = 0;
 			goto out;
 		}
-		if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
+		if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
 			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
 			r = 0;
 			goto out;
 		}
-		if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
+		if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
 			vcpu->fpu_active = 0;
 			kvm_x86_ops->fpu_deactivate(vcpu);
 		}
@@ -4773,7 +4772,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 			kvm_vcpu_block(vcpu);
 			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-			if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+			if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
 			{
 				switch(vcpu->arch.mp_state) {
 				case KVM_MP_STATE_HALTED:
@@ -5255,7 +5254,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 	vcpu->guest_fpu_loaded = 0;
 	fpu_save_init(&vcpu->arch.guest_fpu);
 	++vcpu->stat.fpu_reload;
-	set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
 	trace_kvm_fpu(0);
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 240e460777bc..c8a9d628898e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -624,5 +624,20 @@ static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 
 #endif
 
+static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
+{
+	set_bit(req, &vcpu->requests);
+}
+
+static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu)
+{
+	return test_and_set_bit(req, &vcpu->requests);
+}
+
+static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
+{
+	return test_and_clear_bit(req, &vcpu->requests);
+}
+
 #endif
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 65417e3d8462..5bd2f34ba576 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -145,7 +145,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 	raw_spin_lock(&kvm->requests_lock);
 	me = smp_processor_id();
 	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (test_and_set_bit(req, &vcpu->requests))
+		if (kvm_make_check_request(req, vcpu))
 			continue;
 		cpu = vcpu->cpu;
 		if (cpus != NULL && cpu != -1 && cpu != me)
@@ -1212,7 +1212,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
 		if (kvm_arch_vcpu_runnable(vcpu)) {
-			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
+			kvm_make_request(KVM_REQ_UNHALT, vcpu);
 			break;
 		}
 		if (kvm_cpu_has_pending_timer(vcpu))
-- 
cgit v1.2.3


From 0719837c0832a7b305e42327caa7d330462360ea Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 10 May 2010 13:08:26 +0300
Subject: KVM: Reduce atomic operations on vcpu->requests

Usually the vcpu->requests bitmap is sparse, so a test_and_clear_bit() for
each request generates a large number of unneeded atomics if a bit is set.

Replace with a separate test/clear sequence.  This is safe since there is
no clear_bit() outside the vcpu thread.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c8a9d628898e..e820eb579108 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -636,7 +636,12 @@ static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu)
 
 static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
 {
-	return test_and_clear_bit(req, &vcpu->requests);
+	if (test_bit(req, &vcpu->requests)) {
+		clear_bit(req, &vcpu->requests);
+		return true;
+	} else {
+		return false;
+	}
 }
 
 #endif
-- 
cgit v1.2.3


From e36d96f7cfaa71870c407131eb4fbd38ea285c01 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 21 Jun 2010 10:56:36 +0300
Subject: KVM: Keep slot ID in memory slot structure

May be used for distinguishing between internal and user slots, or for sorting
slots in size order.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 1 +
 virt/kvm/kvm_main.c      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e820eb579108..e796326f3646 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -124,6 +124,7 @@ struct kvm_memory_slot {
 	} *lpage_info[KVM_NR_PAGE_SIZES - 1];
 	unsigned long userspace_addr;
 	int user_alloc;
+	int id;
 };
 
 static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5bd2f34ba576..74f731920945 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -570,6 +570,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	new = old = *memslot;
 
+	new.id = mem->slot;
 	new.base_gfn = base_gfn;
 	new.npages = npages;
 	new.flags = mem->flags;
-- 
cgit v1.2.3


From 7ac77099ce88a0c31b75acd0ec5ef3da4415a6d8 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 21 Jun 2010 10:57:45 +0300
Subject: KVM: Prevent internal slots from being COWed

If a process with a memory slot is COWed, the page will change its address
(despite having an elevated reference count).  This breaks internal memory
slots which have their physical addresses loaded into vmcs registers (see
the APIC access memory slot).

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7ef44107a14a..68be38e233f5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5491,6 +5491,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				int user_alloc)
 {
 	int npages = memslot->npages;
+	int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+	/* Prevent internal slot pages from being moved by fork()/COW. */
+	if (memslot->id >= KVM_MEMORY_SLOTS)
+		map_flags = MAP_SHARED | MAP_ANONYMOUS;
 
 	/*To keep backward compatibility with older userspace,
 	 *x86 needs to hanlde !user_alloc case.
@@ -5503,7 +5508,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 			userspace_addr = do_mmap(NULL, 0,
 						 npages * PAGE_SIZE,
 						 PROT_READ | PROT_WRITE,
-						 MAP_PRIVATE | MAP_ANONYMOUS,
+						 map_flags,
 						 0);
 			up_write(&current->mm->mmap_sem);
 
-- 
cgit v1.2.3


From 6c3f6041172b78d5532c6bf3680d304e92ec2e66 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Tue, 22 Jun 2010 13:49:21 +0800
Subject: KVM: x86: Enable AVX for guest

Enable Intel(R) Advanced Vector Extension(AVX) for guest.

The detection of AVX feature includes OSXSAVE bit testing. When OSXSAVE bit is
not set, even if AVX is supported, the AVX instruction would result in UD as
well. So we're safe to expose AVX bits to guest directly.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 68be38e233f5..d39d6b25d3e5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1963,13 +1963,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
 	/* cpuid 1.ecx */
 	const u32 kvm_supported_word4_x86_features =
-		F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
 		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
 		0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-		0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */;
+		0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
-- 
cgit v1.2.3


From bbeb34062fbad287c949a945a516a0c15b179993 Mon Sep 17 00:00:00 2001
From: Huang Ying <ying.huang@intel.com>
Date: Tue, 22 Jun 2010 14:23:11 +0800
Subject: KVM: Fix a race condition for usage of is_hwpoison_address()

is_hwpoison_address accesses the page table, so the caller must hold
current->mm->mmap_sem in read mode. So fix its usage in hva_to_pfn of
kvm accordingly.

Comment is_hwpoison_address to remind other users.

Reported-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 mm/memory-failure.c | 3 +++
 virt/kvm/kvm_main.c | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 378b0f61fd3c..6b44e52cacaa 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1298,6 +1298,9 @@ done:
 	return ret;
 }
 
+/*
+ * The caller must hold current->mm->mmap_sem in read mode.
+ */
 int is_hwpoison_address(unsigned long addr)
 {
 	pgd_t *pgdp;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 74f731920945..ec2e3c6ac7ed 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -947,12 +947,13 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
 	if (unlikely(npages != 1)) {
 		struct vm_area_struct *vma;
 
+		down_read(&current->mm->mmap_sem);
 		if (is_hwpoison_address(addr)) {
+			up_read(&current->mm->mmap_sem);
 			get_page(hwpoison_page);
 			return page_to_pfn(hwpoison_page);
 		}
 
-		down_read(&current->mm->mmap_sem);
 		vma = find_vma(current->mm, addr);
 
 		if (vma == NULL || addr < vma->vm_start ||
-- 
cgit v1.2.3


From a576f7a29481438db0fa6f3d9ed6939019c441d3 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 21 Jun 2010 15:25:40 +0200
Subject: KVM: PPC: Remove obsolete kvmppc_mmu_find_pte

Initially we had to search for pte entries to invalidate them. Since
the logic has improved since then, we can just get rid of the search
function.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_book3s.h |  1 -
 arch/powerpc/kvm/book3s_32_mmu_host.c | 20 --------------------
 arch/powerpc/kvm/book3s_64_mmu_host.c | 20 --------------------
 3 files changed, 41 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 6f74d93725a0..4e995593e479 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -115,7 +115,6 @@ extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
-extern struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data);
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 0bb66005338f..81ca9dba4098 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -150,26 +150,6 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 	}
 }
 
-struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
-{
-	int i;
-	u64 guest_vp;
-
-	guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
-	for (i=0; i<vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if (pte->pte.vpage == guest_vp)
-			return &pte->pte;
-	}
-
-	return NULL;
-}
-
 static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index e4b5744977f6..eb1d26b64714 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -132,26 +132,6 @@ void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
 	}
 }
 
-struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
-{
-	int i;
-	u64 guest_vp;
-
-	guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
-	for (i=0; i<vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if (pte->pte.vpage == guest_vp)
-			return &pte->pte;
-	}
-
-	return NULL;
-}
-
 static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
-- 
cgit v1.2.3


From 3b249157c03154109d31ec95a3c80203adcf8b70 Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 21 Jun 2010 15:25:19 +0200
Subject: KVM: PPC: Use kernel hash function

The linux kernel already provides a hash function. Let's reuse that
instead of reinventing the wheel!

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/book3s_32_mmu_host.c | 10 ++--------
 arch/powerpc/kvm/book3s_64_mmu_host.c | 11 ++---------
 2 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 81ca9dba4098..904f5ac78f54 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -19,6 +19,7 @@
  */
 
 #include <linux/kvm_host.h>
+#include <linux/hash.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -162,14 +163,7 @@ static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
  * a hash, so we don't waste cycles on looping */
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 {
-	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+	return hash_64(gvsid, SID_MAP_BITS);
 }
 
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index eb1d26b64714..4ccdde152c37 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -20,6 +20,7 @@
  */
 
 #include <linux/kvm_host.h>
+#include <linux/hash.h>
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
@@ -144,17 +145,9 @@ static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
  * a hash, so we don't waste cycles on looping */
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 {
-	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+	return hash_64(gvsid, SID_MAP_BITS);
 }
 
-
 static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
 {
 	struct kvmppc_sid_map *map;
-- 
cgit v1.2.3


From 4d29bdbf12f35f7e7816d67e99c150630684604f Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Mon, 21 Jun 2010 15:24:55 +0200
Subject: KVM: PPC: Make BAT only guest segments work

When a guest sets its SR entry to invalid, we may still find a
corresponding entry in a BAT. So we need to make sure we're not
faulting on invalid SR entries, but instead just claim them to be
BAT resolved.

This resolves breakage experienced when using libogc based guests.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/book3s_32_mmu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index 0b10503c8a4a..3292d76101d2 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -354,10 +354,10 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 		*vsid = VSID_REAL_DR | gvsid;
 		break;
 	case MSR_DR|MSR_IR:
-		if (!sr->valid)
-			return -1;
-
-		*vsid = sr->vsid;
+		if (sr->valid)
+			*vsid = sr->vsid;
+		else
+			*vsid = VSID_BAT | gvsid;
 		break;
 	default:
 		BUG();
-- 
cgit v1.2.3


From 4482b06c0459469c101a1da3f4b24594da557f99 Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 23 Jun 2010 14:59:29 +0900
Subject: KVM: ia64: fix dirty_log_lock spin_lock section not to include
 get_dirty_log()

kvm_get_dirty_log() calls copy_to_user(). So we need to narrow the
dirty_log_lock spin_lock section not to include this.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index bd510beb43af..d85b5d2d52d1 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1817,10 +1817,12 @@ static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
 	n = kvm_dirty_bitmap_bytes(memslot);
 	base = memslot->base_gfn / BITS_PER_LONG;
 
+	spin_lock(&kvm->arch.dirty_log_lock);
 	for (i = 0; i < n/sizeof(long); ++i) {
 		memslot->dirty_bitmap[i] = dirty_bitmap[base + i];
 		dirty_bitmap[base + i] = 0;
 	}
+	spin_unlock(&kvm->arch.dirty_log_lock);
 	r = 0;
 out:
 	return r;
@@ -1835,7 +1837,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	int is_dirty = 0;
 
 	mutex_lock(&kvm->slots_lock);
-	spin_lock(&kvm->arch.dirty_log_lock);
 
 	r = kvm_ia64_sync_dirty_log(kvm, log);
 	if (r)
@@ -1855,7 +1856,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	r = 0;
 out:
 	mutex_unlock(&kvm->slots_lock);
-	spin_unlock(&kvm->arch.dirty_log_lock);
 	return r;
 }
 
-- 
cgit v1.2.3


From 979586e0b5809c46ebe5d784794a9f6d859b665e Mon Sep 17 00:00:00 2001
From: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Date: Wed, 23 Jun 2010 15:00:29 +0900
Subject: KVM: ia64: cleanup kvm_ia64_sync_dirty_log()

kvm_ia64_sync_dirty_log() is a helper function for kvm_vm_ioctl_get_dirty_log()
which copies ia64's arch specific dirty bitmap to general one in memslot.
So doing sanity checks in this function is unnatural. We move these checks
outside of this and change the prototype appropriately.

Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 30 +++++++++++-------------------
 1 file changed, 11 insertions(+), 19 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d85b5d2d52d1..5cb58655cd5f 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1795,25 +1795,15 @@ void kvm_arch_exit(void)
 	kvm_vmm_info = NULL;
 }
 
-static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
-		struct kvm_dirty_log *log)
+static void kvm_ia64_sync_dirty_log(struct kvm *kvm,
+				    struct kvm_memory_slot *memslot)
 {
-	struct kvm_memory_slot *memslot;
-	int r, i;
+	int i;
 	long base;
 	unsigned long n;
 	unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base +
 			offsetof(struct kvm_vm_data, kvm_mem_dirty_log));
 
-	r = -EINVAL;
-	if (log->slot >= KVM_MEMORY_SLOTS)
-		goto out;
-
-	memslot = &kvm->memslots->memslots[log->slot];
-	r = -ENOENT;
-	if (!memslot->dirty_bitmap)
-		goto out;
-
 	n = kvm_dirty_bitmap_bytes(memslot);
 	base = memslot->base_gfn / BITS_PER_LONG;
 
@@ -1823,9 +1813,6 @@ static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
 		dirty_bitmap[base + i] = 0;
 	}
 	spin_unlock(&kvm->arch.dirty_log_lock);
-	r = 0;
-out:
-	return r;
 }
 
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
@@ -1838,10 +1825,16 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 	mutex_lock(&kvm->slots_lock);
 
-	r = kvm_ia64_sync_dirty_log(kvm, log);
-	if (r)
+	r = -EINVAL;
+	if (log->slot >= KVM_MEMORY_SLOTS)
+		goto out;
+
+	memslot = &kvm->memslots->memslots[log->slot];
+	r = -ENOENT;
+	if (!memslot->dirty_bitmap)
 		goto out;
 
+	kvm_ia64_sync_dirty_log(kvm, memslot);
 	r = kvm_get_dirty_log(kvm, log, &is_dirty);
 	if (r)
 		goto out;
@@ -1849,7 +1842,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
 		kvm_flush_remote_tlbs(kvm);
-		memslot = &kvm->memslots->memslots[log->slot];
 		n = kvm_dirty_bitmap_bytes(memslot);
 		memset(memslot->dirty_bitmap, 0, n);
 	}
-- 
cgit v1.2.3


From 529df65e394e30a78f2633b575fd81fa5b973e30 Mon Sep 17 00:00:00 2001
From: Chris Lalancette <clalance@redhat.com>
Date: Mon, 21 Jun 2010 11:29:40 -0400
Subject: KVM: Search the LAPIC's for one that will accept a PIC interrupt

Older versions of 32-bit linux have a "Checking 'hlt' instruction"
test where they repeatedly call the 'hlt' instruction, and then
expect a timer interrupt to kick the CPU out of halt.  This happens
before any LAPIC or IOAPIC setup happens, which means that all of
the APIC's are in virtual wire mode at this point.  Unfortunately,
the current implementation of virtual wire mode is hardcoded to
only kick the BSP, so if a crash+kexec occurs on a different
vcpu, it will never get kicked.

This patch makes pic_unlock() do the equivalent of
kvm_irq_delivery_to_apic() for the IOAPIC code.  That is, it runs
through all of the vcpus looking for one that is in virtual wire
mode.  In the normal case where LAPICs and IOAPICs are configured,
this won't be used at all.  In the bootstrap phase of a modern
OS, before the LAPICs and IOAPICs are configured, this will have
exactly the same behavior as today; VCPU0 is always looked at
first, so it will always get out of the loop after the first
iteration.  This will only go through the loop more than once
during a kexec/kdump, in which case it will only do it a few times
until the kexec'ed kernel programs the LAPIC and IOAPIC.

Signed-off-by: Chris Lalancette <clalance@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8259.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index bc10f0bd3819..819b748a33f9 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -46,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s)
 	__releases(&s->lock)
 {
 	bool wakeup = s->wakeup_needed;
-	struct kvm_vcpu *vcpu;
+	struct kvm_vcpu *vcpu, *found = NULL;
+	int i;
 
 	s->wakeup_needed = false;
 
 	raw_spin_unlock(&s->lock);
 
 	if (wakeup) {
-		vcpu = s->kvm->bsp_vcpu;
-		if (vcpu)
-			kvm_vcpu_kick(vcpu);
+		kvm_for_each_vcpu(i, vcpu, s->kvm) {
+			if (kvm_apic_accept_pic_intr(vcpu)) {
+				found = vcpu;
+				break;
+			}
+		}
+
+		if (!found)
+			found = s->kvm->bsp_vcpu;
+
+		kvm_vcpu_kick(found);
 	}
 }
 
-- 
cgit v1.2.3


From 3e0075094734de122e4cb09f930fa853a3c59f09 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 23 Jun 2010 14:26:18 +0300
Subject: KVM: Simplify vcpu_enter_guest() mmu reload logic slightly

No need to reload the mmu in between two different vcpu->requests checks.

kvm_mmu_reload() may trigger KVM_REQ_TRIPLE_FAULT, but that will be caught
during atomic guest entry later.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d39d6b25d3e5..27322d341232 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4616,15 +4616,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
 
-	if (vcpu->requests)
+	if (vcpu->requests) {
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
 			kvm_mmu_unload(vcpu);
-
-	r = kvm_mmu_reload(vcpu);
-	if (unlikely(r))
-		goto out;
-
-	if (vcpu->requests) {
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
 		if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
@@ -4649,6 +4643,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		}
 	}
 
+	r = kvm_mmu_reload(vcpu);
+	if (unlikely(r))
+		goto out;
+
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
-- 
cgit v1.2.3


From cf3e3d3e19868ca01da163200bbfc687523df0fc Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 24 Jun 2010 14:10:09 +0300
Subject: KVM: Document KVM specific review items

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/kvm/review-checklist.txt | 38 ++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 Documentation/kvm/review-checklist.txt

diff --git a/Documentation/kvm/review-checklist.txt b/Documentation/kvm/review-checklist.txt
new file mode 100644
index 000000000000..730475ae1b8d
--- /dev/null
+++ b/Documentation/kvm/review-checklist.txt
@@ -0,0 +1,38 @@
+Review checklist for kvm patches
+================================
+
+1.  The patch must follow Documentation/CodingStyle and
+    Documentation/SubmittingPatches.
+
+2.  Patches should be against kvm.git master branch.
+
+3.  If the patch introduces or modifies a new userspace API:
+    - the API must be documented in Documentation/kvm/api.txt
+    - the API must be discoverable using KVM_CHECK_EXTENSION
+
+4.  New state must include support for save/restore.
+
+5.  New features must default to off (userspace should explicitly request them).
+    Performance improvements can and should default to on.
+
+6.  New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2
+
+7.  Emulator changes should be accompanied by unit tests for qemu-kvm.git
+    kvm/test directory.
+
+8.  Changes should be vendor neutral when possible.  Changes to common code
+    are better than duplicating changes to vendor code.
+
+9.  Similarly, prefer changes to arch independent code than to arch dependent
+    code.
+
+10. User/kernel interfaces and guest/host interfaces must be 64-bit clean
+    (all variables and sizes naturally aligned on 64-bit; use specific types
+    only - u64 rather than ulong).
+
+11. New guest visible features must either be documented in a hardware manual
+    or be accompanied by documentation.
+
+12. Features must be robust against reset and kexec - for example, shared
+    host/guest memory must be unshared to prevent the host from writing to
+    guest memory that the guest has not reserved for this purpose.
-- 
cgit v1.2.3


From f5f48ee15c2ee3e44cf429e34b16c6fa9b900246 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 30 Jun 2010 12:25:15 +0800
Subject: KVM: VMX: Execute WBINVD to keep data consistency with assigned
 devices

Some guest device driver may leverage the "Non-Snoop" I/O, and explicitly
WBINVD or CLFLUSH to a RAM space. Since migration may occur before WBINVD or
CLFLUSH, we need to maintain data consistency either by:
1: flushing cache (wbinvd) when the guest is scheduled out if there is no
wbinvd exit, or
2: execute wbinvd on all dirty physical CPUs when guest wbinvd exits.

Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 ++++++
 arch/x86/kvm/emulate.c          |  5 ++++-
 arch/x86/kvm/svm.c              |  7 +++++++
 arch/x86/kvm/vmx.c              | 10 +++++++++-
 arch/x86/kvm/x86.c              | 41 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a57cdeacc4d2..2bda62485c4c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -15,6 +15,7 @@
 #include <linux/mm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/tracepoint.h>
+#include <linux/cpumask.h>
 
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
@@ -358,6 +359,8 @@ struct kvm_vcpu_arch {
 
 	/* fields used by HYPER-V emulation */
 	u64 hv_vapic;
+
+	cpumask_var_t wbinvd_dirty_mask;
 };
 
 struct kvm_arch {
@@ -514,6 +517,8 @@ struct kvm_x86_ops {
 
 	void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
 
+	bool (*has_wbinvd_exit)(void);
+
 	const struct trace_print_flags *exit_reasons_str;
 };
 
@@ -571,6 +576,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
 int emulate_clts(struct kvm_vcpu *vcpu);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
 
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index abb8cec420a2..e8bdddc4509e 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3138,8 +3138,11 @@ twobyte_insn:
 		emulate_clts(ctxt->vcpu);
 		c->dst.type = OP_NONE;
 		break;
-	case 0x08:		/* invd */
 	case 0x09:		/* wbinvd */
+		kvm_emulate_wbinvd(ctxt->vcpu);
+		c->dst.type = OP_NONE;
+		break;
+	case 0x08:		/* invd */
 	case 0x0d:		/* GrpP (prefetch) */
 	case 0x18:		/* Grp16 (prefetch/nop) */
 		c->dst.type = OP_NONE;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 587b99d37d44..56c9b6bd7655 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3424,6 +3424,11 @@ static bool svm_rdtscp_supported(void)
 	return false;
 }
 
+static bool svm_has_wbinvd_exit(void)
+{
+	return true;
+}
+
 static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -3508,6 +3513,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.rdtscp_supported = svm_rdtscp_supported,
 
 	.set_supported_cpuid = svm_set_supported_cpuid,
+
+	.has_wbinvd_exit = svm_has_wbinvd_exit,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 661c6e199b4a..4dfb1dc09c88 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -412,6 +412,12 @@ static inline bool cpu_has_virtual_nmis(void)
 	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_WBINVD_EXITING;
+}
+
 static inline bool report_flexpriority(void)
 {
 	return flexpriority_enabled;
@@ -3397,7 +3403,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
 	skip_emulated_instruction(vcpu);
-	/* TODO: Add support for VT-d/pass-through device */
+	kvm_emulate_wbinvd(vcpu);
 	return 1;
 }
 
@@ -4347,6 +4353,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.rdtscp_supported = vmx_rdtscp_supported,
 
 	.set_supported_cpuid = vmx_set_supported_cpuid,
+
+	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 };
 
 static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 27322d341232..3d72fc067059 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1783,8 +1783,28 @@ out:
 	return r;
 }
 
+static void wbinvd_ipi(void *garbage)
+{
+	wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+	return vcpu->kvm->arch.iommu_domain &&
+		!(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	/* Address WBINVD may be executed by guest */
+	if (need_emulate_wbinvd(vcpu)) {
+		if (kvm_x86_ops->has_wbinvd_exit())
+			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+			smp_call_function_single(vcpu->cpu,
+					wbinvd_ipi, NULL, 1);
+	}
+
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
 	if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
 		unsigned long khz = cpufreq_quick_get(cpu);
@@ -3660,6 +3680,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 	return X86EMUL_CONTINUE;
 }
 
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+	if (!need_emulate_wbinvd(vcpu))
+		return X86EMUL_CONTINUE;
+
+	if (kvm_x86_ops->has_wbinvd_exit()) {
+		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+				wbinvd_ipi, NULL, 1);
+		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+	}
+	wbinvd();
+	return X86EMUL_CONTINUE;
+}
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
+
 int emulate_clts(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
@@ -5263,6 +5298,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 		vcpu->arch.time_page = NULL;
 	}
 
+	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 	fx_free(vcpu);
 	kvm_x86_ops->vcpu_free(vcpu);
 }
@@ -5392,7 +5428,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
+	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+		goto fail_free_mce_banks;
+
 	return 0;
+fail_free_mce_banks:
+	kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
 	kvm_free_lapic(vcpu);
 fail_mmu_destroy:
-- 
cgit v1.2.3


From 36a2e6774bfb5f32a0f23bb155f1f960321f291b Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 30 Jun 2010 16:02:02 +0800
Subject: KVM: MMU: fix writable sync sp mapping

While we sync many unsync sp at one time(in mmu_sync_children()),
we may mapping the spte writable, it's dangerous, if one unsync
sp's mapping gfn is another unsync page's gfn.

For example:

SP1.pte[0] = P
SP2.gfn's pfn = P
[SP1.pte[0] = SP2.gfn's pfn]

First, we write protected SP1 and SP2, but SP1 and SP2 are still the
unsync sp.

Then, sync SP1 first, it will detect SP1.pte[0].gfn only has one unsync-sp,
that is SP2, so it will mapping it writable, but we plan to sync SP2 soon,
at this point, the SP2->unsync is not reliable since later we sync SP2 but
SP2->gfn is already writable.

So the final result is: SP2 is the sync page but SP2.gfn is writable.

This bug will corrupt guest's page table, fixed by mark read-only mapping
if the mapped gfn has shadow pages.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 690a7fc58c17..ca07ed083b59 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1810,11 +1810,14 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 	bool need_unsync = false;
 
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+		if (!can_unsync)
+			return 1;
+
 		if (s->role.level != PT_PAGE_TABLE_LEVEL)
 			return 1;
 
 		if (!need_unsync && !s->unsync) {
-			if (!can_unsync || !oos_shadow)
+			if (!oos_shadow)
 				return 1;
 			need_unsync = true;
 		}
-- 
cgit v1.2.3


From 5fd5387c89ec99ff6cb82d2477ffeb7211b781c2 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 30 Jun 2010 16:02:45 +0800
Subject: KVM: MMU: fix conflict access permissions in direct sp

In no-direct mapping, we mark sp is 'direct' when we mapping the
guest's larger page, but its access is encoded form upper page-struct
entire not include the last mapping, it will cause access conflict.

For example, have this mapping:
        [W]
      / PDE1 -> |---|
  P[W]          |   | LPA
      \ PDE2 -> |---|
        [R]

P have two children, PDE1 and PDE2, both PDE1 and PDE2 mapping the
same lage page(LPA). The P's access is WR, PDE1's access is WR,
PDE2's access is RO(just consider read-write permissions here)

When guest access PDE1, we will create a direct sp for LPA, the sp's
access is from P, is W, then we will mark the ptes is W in this sp.

Then, guest access PDE2, we will find LPA's shadow page, is the same as
PDE's, and mark the ptes is RO.

So, if guest access PDE1, the incorrect #PF is occured.

Fixed by encode the last mapping access into direct shadow page

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a21a86ef9e20..f4e4aaa65ff3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -339,6 +339,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			direct = 1;
 			if (!is_dirty_gpte(gw->ptes[level - delta]))
 				access &= ~ACC_WRITE_MASK;
+			access &= gw->pte_access;
+
 			/*
 			 * It is a large guest pages backed by small host pages,
 			 * So we set @direct(@sp->role.direct)=1, and set
-- 
cgit v1.2.3


From 9e7b0e7fba45ca3c6357aeb7091ebc281f1de365 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 30 Jun 2010 16:03:28 +0800
Subject: KVM: MMU: fix direct sp's access corrupted

If the mapping is writable but the dirty flag is not set, we will find
the read-only direct sp and setup the mapping, then if the write #PF
occur, we will mark this mapping writable in the read-only direct sp,
now, other real read-only mapping will happily write it without #PF.

It may hurt guest's COW

Fixed by re-install the mapping when write #PF occur.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f4e4aaa65ff3..117d63f6304d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -325,8 +325,32 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			break;
 		}
 
-		if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
-			continue;
+		if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+			struct kvm_mmu_page *child;
+			unsigned direct_access;
+
+			if (level != gw->level)
+				continue;
+
+			/*
+			 * For the direct sp, if the guest pte's dirty bit
+			 * changed form clean to dirty, it will corrupt the
+			 * sp's access: allow writable in the read-only sp,
+			 * so we should update the spte at this point to get
+			 * a new sp with the correct access.
+			 */
+			direct_access = gw->pt_access & gw->pte_access;
+			if (!is_dirty_gpte(gw->ptes[gw->level - 1]))
+				direct_access &= ~ACC_WRITE_MASK;
+
+			child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+			if (child->role.access == direct_access)
+				continue;
+
+			mmu_page_remove_parent_pte(child, sptep);
+			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			kvm_flush_remote_tlbs(vcpu->kvm);
+		}
 
 		if (is_large_pte(*sptep)) {
 			rmap_remove(vcpu->kvm, sptep);
-- 
cgit v1.2.3


From 84754cd8fca66ed476585eabad68cacf42834199 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Wed, 30 Jun 2010 16:05:00 +0800
Subject: KVM: MMU: cleanup FNAME(fetch)() functions

Cleanup this function that we are already get the direct sp's access

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 117d63f6304d..59e750c1a269 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -306,12 +306,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	gfn_t table_gfn;
 	int r;
 	int level;
+	bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
+	unsigned direct_access;
 	pt_element_t curr_pte;
 	struct kvm_shadow_walk_iterator iterator;
 
 	if (!is_present_gpte(gw->ptes[gw->level - 1]))
 		return NULL;
 
+	direct_access = gw->pt_access & gw->pte_access;
+	if (!dirty)
+		direct_access &= ~ACC_WRITE_MASK;
+
 	for_each_shadow_entry(vcpu, addr, iterator) {
 		level = iterator.level;
 		sptep = iterator.sptep;
@@ -319,15 +325,13 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			mmu_set_spte(vcpu, sptep, access,
 				     gw->pte_access & access,
 				     user_fault, write_fault,
-				     is_dirty_gpte(gw->ptes[gw->level-1]),
-				     ptwrite, level,
+				     dirty, ptwrite, level,
 				     gw->gfn, pfn, false, true);
 			break;
 		}
 
 		if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
 			struct kvm_mmu_page *child;
-			unsigned direct_access;
 
 			if (level != gw->level)
 				continue;
@@ -339,10 +343,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 * so we should update the spte at this point to get
 			 * a new sp with the correct access.
 			 */
-			direct_access = gw->pt_access & gw->pte_access;
-			if (!is_dirty_gpte(gw->ptes[gw->level - 1]))
-				direct_access &= ~ACC_WRITE_MASK;
-
 			child = page_header(*sptep & PT64_BASE_ADDR_MASK);
 			if (child->role.access == direct_access)
 				continue;
@@ -359,11 +359,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		}
 
 		if (level <= gw->level) {
-			int delta = level - gw->level + 1;
 			direct = 1;
-			if (!is_dirty_gpte(gw->ptes[level - delta]))
-				access &= ~ACC_WRITE_MASK;
-			access &= gw->pte_access;
+			access = direct_access;
 
 			/*
 			 * It is a large guest pages backed by small host pages,
-- 
cgit v1.2.3


From 7741909bf19d9437a6aa3559c0470a640f637dce Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 30 Jun 2010 15:18:45 +0200
Subject: KVM: PPC: Add generic hpte management functions

Currently the shadow paging code keeps an array of entries it knows about.
Whenever the guest invalidates an entry, we loop through that entry,
trying to invalidate matching parts.

While this is a really simple implementation, it is probably the most
ineffective one possible. So instead, let's keep an array of lists around
that are indexed by a hash. This way each PTE can be added by 4 list_add,
removed by 4 list_del invocations and the search only needs to loop through
entries that share the same hash.

This patch implements said lookup and exports generic functions that both
the 32-bit and 64-bit backend can use.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/powerpc/kvm/book3s_mmu_hpte.c | 277 +++++++++++++++++++++++++++++++++++++
 1 file changed, 277 insertions(+)
 create mode 100644 arch/powerpc/kvm/book3s_mmu_hpte.c

diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
new file mode 100644
index 000000000000..4868d4a7ebc5
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+
+#define PTE_SIZE	12
+
+/* #define DEBUG_MMU */
+
+#ifdef DEBUG_MMU
+#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_mmu(a, ...) do { } while(0)
+#endif
+
+static struct kmem_cache *hpte_cache;
+
+static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
+{
+	return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE);
+}
+
+static inline u64 kvmppc_mmu_hash_vpte(u64 vpage)
+{
+	return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
+}
+
+static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
+{
+	return hash_64((vpage & 0xffffff000ULL) >> 12,
+		       HPTEG_HASH_BITS_VPTE_LONG);
+}
+
+void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	u64 index;
+
+	/* Add to ePTE list */
+	index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
+	hlist_add_head(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
+
+	/* Add to vPTE list */
+	index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
+	hlist_add_head(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
+
+	/* Add to vPTE_long list */
+	index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
+	hlist_add_head(&pte->list_vpte_long,
+		       &vcpu->arch.hpte_hash_vpte_long[index]);
+}
+
+static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
+		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
+
+	/* Different for 32 and 64 bit */
+	kvmppc_mmu_invalidate_pte(vcpu, pte);
+
+	if (pte->pte.may_write)
+		kvm_release_pfn_dirty(pte->pfn);
+	else
+		kvm_release_pfn_clean(pte->pfn);
+
+	hlist_del(&pte->list_pte);
+	hlist_del(&pte->list_vpte);
+	hlist_del(&pte->list_vpte_long);
+
+	vcpu->arch.hpte_cache_count--;
+	kmem_cache_free(hpte_cache, pte);
+}
+
+static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
+{
+	struct hpte_cache *pte;
+	struct hlist_node *node, *tmp;
+	int i;
+
+	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
+		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+
+		hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+			invalidate_pte(vcpu, pte);
+	}
+}
+
+static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
+{
+	struct hlist_head *list;
+	struct hlist_node *node, *tmp;
+	struct hpte_cache *pte;
+
+	/* Find the list of entries in the map */
+	list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_safe(pte, node, tmp, list, list_pte)
+		if ((pte->pte.eaddr & ~0xfffUL) == guest_ea)
+			invalidate_pte(vcpu, pte);
+}
+
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
+{
+	u64 i;
+
+	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
+		    vcpu->arch.hpte_cache_count, guest_ea, ea_mask);
+
+	guest_ea &= ea_mask;
+
+	switch (ea_mask) {
+	case ~0xfffUL:
+		kvmppc_mmu_pte_flush_page(vcpu, guest_ea);
+		break;
+	case 0x0ffff000:
+		/* 32-bit flush w/o segment, go through all possible segments */
+		for (i = 0; i < 0x100000000ULL; i += 0x10000000ULL)
+			kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL);
+		break;
+	case 0:
+		/* Doing a complete flush -> start from scratch */
+		kvmppc_mmu_pte_flush_all(vcpu);
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+}
+
+/* Flush with mask 0xfffffffff */
+static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct hlist_head *list;
+	struct hlist_node *node, *tmp;
+	struct hpte_cache *pte;
+	u64 vp_mask = 0xfffffffffULL;
+
+	list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte)
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+}
+
+/* Flush with mask 0xffffff000 */
+static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct hlist_head *list;
+	struct hlist_node *node, *tmp;
+	struct hpte_cache *pte;
+	u64 vp_mask = 0xffffff000ULL;
+
+	list = &vcpu->arch.hpte_hash_vpte_long[
+		kvmppc_mmu_hash_vpte_long(guest_vp)];
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+}
+
+void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
+{
+	dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
+		    vcpu->arch.hpte_cache_count, guest_vp, vp_mask);
+	guest_vp &= vp_mask;
+
+	switch(vp_mask) {
+	case 0xfffffffffULL:
+		kvmppc_mmu_pte_vflush_short(vcpu, guest_vp);
+		break;
+	case 0xffffff000ULL:
+		kvmppc_mmu_pte_vflush_long(vcpu, guest_vp);
+		break;
+	default:
+		WARN_ON(1);
+		return;
+	}
+}
+
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+	struct hlist_node *node, *tmp;
+	struct hpte_cache *pte;
+	int i;
+
+	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n",
+		    vcpu->arch.hpte_cache_count, pa_start, pa_end);
+
+	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
+		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+
+		hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+			if ((pte->pte.raddr >= pa_start) &&
+			    (pte->pte.raddr < pa_end))
+				invalidate_pte(vcpu, pte);
+	}
+}
+
+struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
+{
+	struct hpte_cache *pte;
+
+	pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
+	vcpu->arch.hpte_cache_count++;
+
+	if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM)
+		kvmppc_mmu_pte_flush_all(vcpu);
+
+	return pte;
+}
+
+void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu)
+{
+	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+}
+
+static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		INIT_HLIST_HEAD(&hash_list[i]);
+}
+
+int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
+{
+	/* init hpte lookup hashes */
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
+
+	return 0;
+}
+
+int kvmppc_mmu_hpte_sysinit(void)
+{
+	/* init hpte slab cache */
+	hpte_cache = kmem_cache_create("kvm-spt", sizeof(struct hpte_cache),
+				       sizeof(struct hpte_cache), 0, NULL);
+
+	return 0;
+}
+
+void kvmppc_mmu_hpte_sysexit(void)
+{
+	kmem_cache_destroy(hpte_cache);
+}
-- 
cgit v1.2.3


From fef093bec0364ff5e6fd488cd81637f6bb3a2d0d Mon Sep 17 00:00:00 2001
From: Alexander Graf <agraf@suse.de>
Date: Wed, 30 Jun 2010 15:18:46 +0200
Subject: KVM: PPC: Make use of hash based Shadow MMU

We just introduced generic functions to handle shadow pages on PPC.
This patch makes the respective backends make use of them, getting
rid of a lot of duplicate code along the way.

Signed-off-by: Alexander Graf <agraf@suse.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/powerpc/include/asm/kvm_book3s.h |   9 +++
 arch/powerpc/include/asm/kvm_host.h   |  17 +++++-
 arch/powerpc/kvm/Makefile             |   2 +
 arch/powerpc/kvm/book3s.c             |  14 ++++-
 arch/powerpc/kvm/book3s_32_mmu_host.c | 104 ++++------------------------------
 arch/powerpc/kvm/book3s_64_mmu_host.c |  98 +++-----------------------------
 6 files changed, 54 insertions(+), 190 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 4e995593e479..8274a2d43925 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -115,6 +115,15 @@ extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
+
+extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
+extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu);
+extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
+extern int kvmppc_mmu_hpte_sysinit(void);
+extern void kvmppc_mmu_hpte_sysexit(void);
+
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 0c9ad869decd..e004eafcd3f0 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -38,7 +38,13 @@
 #define KVM_NR_PAGE_SIZES	1
 #define KVM_PAGES_PER_HPAGE(x)	(1UL<<31)
 
-#define HPTEG_CACHE_NUM 1024
+#define HPTEG_CACHE_NUM			(1 << 15)
+#define HPTEG_HASH_BITS_PTE		13
+#define HPTEG_HASH_BITS_VPTE		13
+#define HPTEG_HASH_BITS_VPTE_LONG	5
+#define HPTEG_HASH_NUM_PTE		(1 << HPTEG_HASH_BITS_PTE)
+#define HPTEG_HASH_NUM_VPTE		(1 << HPTEG_HASH_BITS_VPTE)
+#define HPTEG_HASH_NUM_VPTE_LONG	(1 << HPTEG_HASH_BITS_VPTE_LONG)
 
 struct kvm;
 struct kvm_run;
@@ -151,6 +157,9 @@ struct kvmppc_mmu {
 };
 
 struct hpte_cache {
+	struct hlist_node list_pte;
+	struct hlist_node list_vpte;
+	struct hlist_node list_vpte_long;
 	u64 host_va;
 	u64 pfn;
 	ulong slot;
@@ -282,8 +291,10 @@ struct kvm_vcpu_arch {
 	unsigned long pending_exceptions;
 
 #ifdef CONFIG_PPC_BOOK3S
-	struct hpte_cache hpte_cache[HPTEG_CACHE_NUM];
-	int hpte_cache_offset;
+	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
+	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
+	struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
+	int hpte_cache_count;
 #endif
 };
 
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index ff436066bf77..d45c818a384c 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -45,6 +45,7 @@ kvm-book3s_64-objs := \
 	book3s.o \
 	book3s_emulate.o \
 	book3s_interrupts.o \
+	book3s_mmu_hpte.o \
 	book3s_64_mmu_host.o \
 	book3s_64_mmu.o \
 	book3s_32_mmu.o
@@ -57,6 +58,7 @@ kvm-book3s_32-objs := \
 	book3s.o \
 	book3s_emulate.o \
 	book3s_interrupts.o \
+	book3s_mmu_hpte.o \
 	book3s_32_mmu_host.o \
 	book3s_32_mmu.o
 kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 801d9f3c70ae..a3cef30d1d42 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1384,12 +1384,22 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 static int kvmppc_book3s_init(void)
 {
-	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
-			THIS_MODULE);
+	int r;
+
+	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
+		     THIS_MODULE);
+
+	if (r)
+		return r;
+
+	r = kvmppc_mmu_hpte_sysinit();
+
+	return r;
 }
 
 static void kvmppc_book3s_exit(void)
 {
+	kvmppc_mmu_hpte_sysexit();
 	kvm_exit();
 }
 
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 904f5ac78f54..0b51ef872c1e 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -58,105 +58,19 @@
 static ulong htab;
 static u32 htabmask;
 
-static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
 	volatile u32 *pteg;
 
-	dprintk_mmu("KVM: Flushing SPTE: 0x%llx (0x%llx) -> 0x%llx\n",
-		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
-
+	/* Remove from host HTAB */
 	pteg = (u32*)pte->slot;
-
 	pteg[0] = 0;
+
+	/* And make sure it's gone from the TLB too */
 	asm volatile ("sync");
 	asm volatile ("tlbie %0" : : "r" (pte->pte.eaddr) : "memory");
 	asm volatile ("sync");
 	asm volatile ("tlbsync");
-
-	pte->host_va = 0;
-
-	if (pte->pte.may_write)
-		kvm_release_pfn_dirty(pte->pfn);
-	else
-		kvm_release_pfn_clean(pte->pfn);
-}
-
-void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
-{
-	int i;
-
-	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%x & 0x%x\n",
-		    vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
-	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-	guest_ea &= ea_mask;
-	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if ((pte->pte.eaddr & ea_mask) == guest_ea) {
-			invalidate_pte(vcpu, pte);
-		}
-	}
-
-	/* Doing a complete flush -> start from scratch */
-	if (!ea_mask)
-		vcpu->arch.hpte_cache_offset = 0;
-}
-
-void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
-{
-	int i;
-
-	dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
-		    vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
-	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-	guest_vp &= vp_mask;
-	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if ((pte->pte.vpage & vp_mask) == guest_vp) {
-			invalidate_pte(vcpu, pte);
-		}
-	}
-}
-
-void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
-{
-	int i;
-
-	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%llx & 0x%llx\n",
-		    vcpu->arch.hpte_cache_offset, pa_start, pa_end);
-	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if ((pte->pte.raddr >= pa_start) &&
-		    (pte->pte.raddr < pa_end)) {
-			invalidate_pte(vcpu, pte);
-		}
-	}
-}
-
-static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
-{
-	if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
-		kvmppc_mmu_pte_flush(vcpu, 0, 0);
-
-	return vcpu->arch.hpte_cache_offset++;
 }
 
 /* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
@@ -230,7 +144,6 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 	register int rr = 0;
 	bool primary = false;
 	bool evict = false;
-	int hpte_id;
 	struct hpte_cache *pte;
 
 	/* Get host physical address for gpa */
@@ -315,8 +228,7 @@ next_pteg:
 
 	/* Now tell our Shadow PTE code about the new page */
 
-	hpte_id = kvmppc_mmu_hpte_cache_next(vcpu);
-	pte = &vcpu->arch.hpte_cache[hpte_id];
+	pte = kvmppc_mmu_hpte_cache_next(vcpu);
 
 	dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n",
 		    orig_pte->may_write ? 'w' : '-',
@@ -329,6 +241,8 @@ next_pteg:
 	pte->pte = *orig_pte;
 	pte->pfn = hpaddr >> PAGE_SHIFT;
 
+	kvmppc_mmu_hpte_cache_map(vcpu, pte);
+
 	return 0;
 }
 
@@ -413,7 +327,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+	kvmppc_mmu_hpte_destroy(vcpu);
 	preempt_disable();
 	__destroy_context(to_book3s(vcpu)->context_id);
 	preempt_enable();
@@ -453,5 +367,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 	htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0;
 	htab = (ulong)__va(sdr1 & 0xffff0000);
 
+	kvmppc_mmu_hpte_init(vcpu);
+
 	return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 4ccdde152c37..384179a5002b 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -47,98 +47,11 @@
 #define dprintk_slb(a, ...) do { } while(0)
 #endif
 
-static void invalidate_pte(struct hpte_cache *pte)
+void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
-	dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
-		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
-
 	ppc_md.hpte_invalidate(pte->slot, pte->host_va,
 			       MMU_PAGE_4K, MMU_SEGSIZE_256M,
 			       false);
-	pte->host_va = 0;
-
-	if (pte->pte.may_write)
-		kvm_release_pfn_dirty(pte->pfn);
-	else
-		kvm_release_pfn_clean(pte->pfn);
-}
-
-void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
-{
-	int i;
-
-	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
-		    vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
-	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-	guest_ea &= ea_mask;
-	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if ((pte->pte.eaddr & ea_mask) == guest_ea) {
-			invalidate_pte(pte);
-		}
-	}
-
-	/* Doing a complete flush -> start from scratch */
-	if (!ea_mask)
-		vcpu->arch.hpte_cache_offset = 0;
-}
-
-void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
-{
-	int i;
-
-	dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
-		    vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
-	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-	guest_vp &= vp_mask;
-	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if ((pte->pte.vpage & vp_mask) == guest_vp) {
-			invalidate_pte(pte);
-		}
-	}
-}
-
-void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
-{
-	int i;
-
-	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx & 0x%lx\n",
-		    vcpu->arch.hpte_cache_offset, pa_start, pa_end);
-	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if ((pte->pte.raddr >= pa_start) &&
-		    (pte->pte.raddr < pa_end)) {
-			invalidate_pte(pte);
-		}
-	}
-}
-
-static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
-{
-	if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
-		kvmppc_mmu_pte_flush(vcpu, 0, 0);
-
-	return vcpu->arch.hpte_cache_offset++;
 }
 
 /* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
@@ -246,8 +159,7 @@ map_again:
 		attempt++;
 		goto map_again;
 	} else {
-		int hpte_id = kvmppc_mmu_hpte_cache_next(vcpu);
-		struct hpte_cache *pte = &vcpu->arch.hpte_cache[hpte_id];
+		struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu);
 
 		dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n",
 			    ((rflags & HPTE_R_PP) == 3) ? '-' : 'w',
@@ -265,6 +177,8 @@ map_again:
 		pte->host_va = va;
 		pte->pte = *orig_pte;
 		pte->pfn = hpaddr >> PAGE_SHIFT;
+
+		kvmppc_mmu_hpte_cache_map(vcpu, pte);
 	}
 
 	return 0;
@@ -391,7 +305,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+	kvmppc_mmu_hpte_destroy(vcpu);
 	__destroy_context(to_book3s(vcpu)->context_id);
 }
 
@@ -409,5 +323,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 	vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS;
 	vcpu3s->vsid_next = vcpu3s->vsid_first;
 
+	kvmppc_mmu_hpte_init(vcpu);
+
 	return 0;
 }
-- 
cgit v1.2.3


From 95c87e2b4460a488ec7ce42f273893e410ab385a Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 1 Jul 2010 15:00:50 +0800
Subject: KVM: Fix IOMMU memslot reference warning

This patch fixes the following warning.

===================================================
[ INFO: suspicious rcu_dereference_check() usage. ]
---------------------------------------------------
include/linux/kvm_host.h:259 invoked rcu_dereference_check() without
protection!

other info that might help us debug this:

rcu_scheduler_active = 1, debug_locks = 0
no locks held by qemu-system-x86/29679.

stack backtrace:
Pid: 29679, comm: qemu-system-x86 Not tainted 2.6.35-rc3+ #200
Call Trace:
 [<ffffffff810a224e>] lockdep_rcu_dereference+0xa8/0xb1
 [<ffffffffa018a06f>] kvm_iommu_unmap_memslots+0xc9/0xde [kvm]
 [<ffffffffa018a0c4>] kvm_iommu_unmap_guest+0x40/0x4e [kvm]
 [<ffffffffa018f772>] kvm_arch_destroy_vm+0x1a/0x186 [kvm]
 [<ffffffffa01800d0>] kvm_put_kvm+0x110/0x167 [kvm]
 [<ffffffffa0180ecc>] kvm_vcpu_release+0x18/0x1c [kvm]
 [<ffffffff81156f5d>] fput+0x22a/0x3a0
 [<ffffffff81152288>] filp_close+0xb4/0xcd
 [<ffffffff8106599f>] put_files_struct+0x1b7/0x36b
 [<ffffffff81065830>] ? put_files_struct+0x48/0x36b
 [<ffffffff8131ee59>] ? do_raw_spin_unlock+0x118/0x160
 [<ffffffff81065bc0>] exit_files+0x6d/0x75
 [<ffffffff81068348>] do_exit+0x47d/0xc60
 [<ffffffff8177e7b5>] ? _raw_spin_unlock_irq+0x30/0x36
 [<ffffffff81068bfa>] do_group_exit+0xcf/0x134
 [<ffffffff81080790>] get_signal_to_deliver+0x732/0x81d
 [<ffffffff81095996>] ? cpu_clock+0x4e/0x60
 [<ffffffff81002082>] do_notify_resume+0x117/0xc43
 [<ffffffff810a2fa3>] ? trace_hardirqs_on+0xd/0xf
 [<ffffffff81080d79>] ? sys_rt_sigtimedwait+0x2b5/0x3bf
 [<ffffffff8177d9f2>] ? trace_hardirqs_off_thunk+0x3a/0x3c
 [<ffffffff81003221>] ? sysret_signal+0x5/0x3d
 [<ffffffff8100343b>] int_signal+0x12/0x17

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/iommu.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 673c88a8efe9..779559552ce7 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -126,9 +126,10 @@ unmap_pages:
 
 static int kvm_iommu_map_memslots(struct kvm *kvm)
 {
-	int i, r = 0;
+	int i, idx, r = 0;
 	struct kvm_memslots *slots;
 
+	idx = srcu_read_lock(&kvm->srcu);
 	slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; i++) {
@@ -136,6 +137,7 @@ static int kvm_iommu_map_memslots(struct kvm *kvm)
 		if (r)
 			break;
 	}
+	srcu_read_unlock(&kvm->srcu, idx);
 
 	return r;
 }
@@ -285,15 +287,17 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
 
 static int kvm_iommu_unmap_memslots(struct kvm *kvm)
 {
-	int i;
+	int i, idx;
 	struct kvm_memslots *slots;
 
+	idx = srcu_read_lock(&kvm->srcu);
 	slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; i++) {
 		kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
 				    slots->memslots[i].npages);
 	}
+	srcu_read_unlock(&kvm->srcu, idx);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 828554136bbacae6e39fc31b9cd7e7c660ad7530 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 1 Jul 2010 16:00:11 +0200
Subject: KVM: Remove unnecessary divide operations

This patch converts unnecessary divide and modulo operations
in the KVM large page related code into logical operations.
This allows to convert gfn_t to u64 while not breaking 32
bit builds.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h    |  1 +
 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/s390/include/asm/kvm_host.h    |  3 ++-
 arch/x86/include/asm/kvm_host.h     |  3 ++-
 arch/x86/kvm/mmu.c                  |  8 ++++----
 virt/kvm/kvm_main.c                 | 10 +++++-----
 6 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index a362e67e0ca6..2f229e5de498 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -235,6 +235,7 @@ struct kvm_vm_data {
 #define KVM_REQ_PTC_G		32
 #define KVM_REQ_RESUME		33
 
+#define KVM_HPAGE_GFN_SHIFT(x)	0
 #define KVM_NR_PAGE_SIZES	1
 #define KVM_PAGES_PER_HPAGE(x)	1
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index e004eafcd3f0..b0b23c007d6e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -35,6 +35,7 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 
 /* We don't currently support large pages. */
+#define KVM_HPAGE_GFN_SHIFT(x)	0
 #define KVM_NR_PAGE_SIZES	1
 #define KVM_PAGES_PER_HPAGE(x)	(1UL<<31)
 
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index b95710a1f5dd..cef7dbf69dfc 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -41,7 +41,8 @@ struct sca_block {
 } __attribute__((packed));
 
 #define KVM_NR_PAGE_SIZES 2
-#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8)
+#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 8)
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
 #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
 #define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
 #define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 2bda62485c4c..50c79b9f5c38 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -44,7 +44,8 @@
 
 /* KVM Hugepage definitions for x86 */
 #define KVM_NR_PAGE_SIZES	3
-#define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + (((x) - 1) * 9))
+#define KVM_HPAGE_GFN_SHIFT(x)	(((x) - 1) * 9)
+#define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
 #define KVM_HPAGE_SIZE(x)	(1UL << KVM_HPAGE_SHIFT(x))
 #define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
 #define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ca07ed083b59..a20fd613acfe 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -423,8 +423,8 @@ static int *slot_largepage_idx(gfn_t gfn,
 {
 	unsigned long idx;
 
-	idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
-	      (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 	return &slot->lpage_info[level - 2][idx].write_count;
 }
 
@@ -528,8 +528,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 	if (likely(level == PT_PAGE_TABLE_LEVEL))
 		return &slot->rmap[gfn - slot->base_gfn];
 
-	idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
-		(slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+		(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 
 	return &slot->lpage_info[level - 2][idx].rmap_pde;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ec2e3c6ac7ed..a60b6b053b6f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -626,9 +626,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		if (new.lpage_info[i])
 			continue;
 
-		lpages = 1 + (base_gfn + npages - 1) /
-			     KVM_PAGES_PER_HPAGE(level);
-		lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level);
+		lpages = 1 + ((base_gfn + npages - 1)
+			     >> KVM_HPAGE_GFN_SHIFT(level));
+		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
 
 		new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
 
@@ -638,9 +638,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		memset(new.lpage_info[i], 0,
 		       lpages * sizeof(*new.lpage_info[i]));
 
-		if (base_gfn % KVM_PAGES_PER_HPAGE(level))
+		if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
 			new.lpage_info[i][0].write_count = 1;
-		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level))
+		if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
 			new.lpage_info[i][lpages - 1].write_count = 1;
 		ugfn = new.userspace_addr >> PAGE_SHIFT;
 		/*
-- 
cgit v1.2.3


From 5689cc53fa9d09b5bf41b1b1a7c90bd6c112ab40 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 1 Jul 2010 16:00:12 +0200
Subject: KVM: Use u64 for frame data types

For 32bit machines where the physical address width is
larger than the virtual address width the frame number types
in KVM may overflow. Fix this by changing them to u64.

[sfr: fix build on 32-bit ppc]

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/powerpc/kvm/44x_tlb.c | 3 ++-
 include/linux/kvm_types.h  | 4 ++--
 virt/kvm/iommu.c           | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 812312542e50..9b9b5cdea840 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -316,7 +316,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
 	gfn = gpaddr >> PAGE_SHIFT;
 	new_page = gfn_to_page(vcpu->kvm, gfn);
 	if (is_error_page(new_page)) {
-		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
+		printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n",
+			(unsigned long long)gfn);
 		kvm_release_page_clean(new_page);
 		return;
 	}
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index fb46efbeabec..7ac0d4eee430 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -32,11 +32,11 @@
 
 typedef unsigned long  gva_t;
 typedef u64            gpa_t;
-typedef unsigned long  gfn_t;
+typedef u64            gfn_t;
 
 typedef unsigned long  hva_t;
 typedef u64            hpa_t;
-typedef unsigned long  hfn_t;
+typedef u64            hfn_t;
 
 typedef hfn_t pfn_t;
 
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 779559552ce7..62a9caf0563c 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -108,7 +108,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 			      get_order(page_size), flags);
 		if (r) {
 			printk(KERN_ERR "kvm_iommu_map_address:"
-			       "iommu failed to map pfn=%lx\n", pfn);
+			       "iommu failed to map pfn=%llx\n", pfn);
 			goto unmap_pages;
 		}
 
-- 
cgit v1.2.3


From dd180b3e90253cb4ca95d603a8c17413f8daec69 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 3 Jul 2010 16:02:42 +0800
Subject: KVM: VMX: fix tlb flush with invalid root

Commit 341d9b535b6c simplify reload logic while entry guest mode, it
can avoid unnecessary sync-root if KVM_REQ_MMU_RELOAD and
KVM_REQ_MMU_SYNC both set.

But, it cause a issue that when we handle 'KVM_REQ_TLB_FLUSH', the
root is invalid, it is triggered during my test:

Kernel BUG at ffffffffa00212b8 [verbose debug info unavailable]
......

Fixed by directly return if the root is not ready.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 arch/x86/kvm/mmu.c              | 2 --
 arch/x86/kvm/vmx.c              | 5 ++++-
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 50c79b9f5c38..502e53f999cf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -40,6 +40,8 @@
 				  0xFFFFFF0000000000ULL)
 
 #define INVALID_PAGE (~(hpa_t)0)
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
 #define UNMAPPED_GVA (~(gpa_t)0)
 
 /* KVM Hugepage definitions for x86 */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a20fd613acfe..70cdf6876b5f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -92,8 +92,6 @@ module_param(oos_shadow, bool, 0644);
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
 #define PT64_LEVEL_BITS 9
 
 #define PT64_LEVEL_SHIFT(level) \
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4dfb1dc09c88..2fdcc9819f36 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1828,8 +1828,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	vpid_sync_context(to_vmx(vcpu));
-	if (enable_ept)
+	if (enable_ept) {
+		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+			return;
 		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+	}
 }
 
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From be38d276b0189fa86231fc311428622a1981ad62 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 6 Jun 2010 14:31:27 +0300
Subject: KVM: MMU: Introduce drop_spte()

When we call rmap_remove(), we (almost) always immediately follow it by
an __set_spte() to a nonpresent pte.  Since we need to perform the two
operations atomically, to avoid losing the dirty and accessed bits, introduce
a helper drop_spte() and convert all call sites.

The operation is still nonatomic at this point.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 30 +++++++++++++++++-------------
 arch/x86/kvm/paging_tmpl.h | 13 ++++++-------
 2 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 70cdf6876b5f..1ad39cf70e18 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -658,6 +658,12 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	}
 }
 
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+	rmap_remove(kvm, sptep);
+	__set_spte(sptep, new_spte);
+}
+
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 {
 	struct kvm_rmap_desc *desc;
@@ -722,9 +728,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 			if (is_writable_pte(*spte)) {
-				rmap_remove(kvm, spte);
+				drop_spte(kvm, spte,
+					  shadow_trap_nonpresent_pte);
 				--kvm->stat.lpages;
-				__set_spte(spte, shadow_trap_nonpresent_pte);
 				spte = NULL;
 				write_protected = 1;
 			}
@@ -744,8 +750,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	while ((spte = rmap_next(kvm, rmapp, NULL))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-		rmap_remove(kvm, spte);
-		__set_spte(spte, shadow_trap_nonpresent_pte);
+		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 		need_tlb_flush = 1;
 	}
 	return need_tlb_flush;
@@ -767,8 +772,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		need_flush = 1;
 		if (pte_write(*ptep)) {
-			rmap_remove(kvm, spte);
-			__set_spte(spte, shadow_trap_nonpresent_pte);
+			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 			spte = rmap_next(kvm, rmapp, NULL);
 		} else {
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -1464,7 +1468,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 			} else {
 				if (is_large_pte(ent))
 					--kvm->stat.lpages;
-				rmap_remove(kvm, &pt[i]);
+				drop_spte(kvm, &pt[i],
+					  shadow_trap_nonpresent_pte);
 			}
 		}
 		pt[i] = shadow_trap_nonpresent_pte;
@@ -1868,9 +1873,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
 			ret = 1;
-			rmap_remove(vcpu->kvm, sptep);
-			spte = shadow_trap_nonpresent_pte;
-			goto set_pte;
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+			goto done;
 		}
 
 		spte |= PT_WRITABLE_MASK;
@@ -1902,6 +1906,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 set_pte:
 	__set_spte(sptep, spte);
+done:
 	return ret;
 }
 
@@ -1938,8 +1943,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %lx new %lx\n",
 				 spte_to_pfn(*sptep), pfn);
-			rmap_remove(vcpu->kvm, sptep);
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else
 			was_rmapped = 1;
@@ -2591,7 +2595,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
 		if (is_last_spte(pte, sp->role.level))
-			rmap_remove(vcpu->kvm, spte);
+			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
 		else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			mmu_page_remove_parent_pte(child, spte);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 59e750c1a269..796a325c7e59 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -353,8 +353,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		}
 
 		if (is_large_pte(*sptep)) {
-			rmap_remove(vcpu->kvm, sptep);
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		}
 
@@ -516,12 +515,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
 			if (is_shadow_present_pte(*sptep)) {
-				rmap_remove(vcpu->kvm, sptep);
 				if (is_large_pte(*sptep))
 					--vcpu->kvm->stat.lpages;
+				drop_spte(vcpu->kvm, sptep,
+					  shadow_trap_nonpresent_pte);
 				need_flush = 1;
-			}
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			} else
+				__set_spte(sptep, shadow_trap_nonpresent_pte);
 			break;
 		}
 
@@ -637,12 +637,11 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		      !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
 			u64 nonpresent;
 
-			rmap_remove(vcpu->kvm, &sp->spt[i]);
 			if (is_present_gpte(gpte) || !clear_unsync)
 				nonpresent = shadow_trap_nonpresent_pte;
 			else
 				nonpresent = shadow_notrap_nonpresent_pte;
-			__set_spte(&sp->spt[i], nonpresent);
+			drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
 			continue;
 		}
 
-- 
cgit v1.2.3


From ce061867aa2877605cda96fa8ec7dff15f70a983 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 6 Jun 2010 14:38:12 +0300
Subject: KVM: MMU: Move accessed/dirty bit checks from rmap_remove() to
 drop_spte()

Since we need to make the check atomic, move it to the place that will
set the new spte.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1ad39cf70e18..fbdca08b8d8c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -612,19 +612,11 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	struct kvm_rmap_desc *desc;
 	struct kvm_rmap_desc *prev_desc;
 	struct kvm_mmu_page *sp;
-	pfn_t pfn;
 	gfn_t gfn;
 	unsigned long *rmapp;
 	int i;
 
-	if (!is_rmap_spte(*spte))
-		return;
 	sp = page_header(__pa(spte));
-	pfn = spte_to_pfn(*spte);
-	if (*spte & shadow_accessed_mask)
-		kvm_set_pfn_accessed(pfn);
-	if (is_writable_pte(*spte))
-		kvm_set_pfn_dirty(pfn);
 	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
 	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
 	if (!*rmapp) {
@@ -660,6 +652,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 
 static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
 {
+	pfn_t pfn;
+
+	if (!is_rmap_spte(*sptep)) {
+		__set_spte(sptep, new_spte);
+		return;
+	}
+	pfn = spte_to_pfn(*sptep);
+	if (*sptep & shadow_accessed_mask)
+		kvm_set_pfn_accessed(pfn);
+	if (is_writable_pte(*sptep))
+		kvm_set_pfn_dirty(pfn);
 	rmap_remove(kvm, sptep);
 	__set_spte(sptep, new_spte);
 }
-- 
cgit v1.2.3


From a9221dd5ec125fbec1702fae016c6d2ea1a9a3da Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 6 Jun 2010 14:48:06 +0300
Subject: KVM: MMU: Atomically check for accessed bit when dropping an spte

Currently, in the window between the check for the accessed bit, and actually
dropping the spte, a vcpu can access the page through the spte and set the bit,
which will be ignored by the mmu.

Fix by using an exchange operation to atmoically fetch the spte and drop it.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fbdca08b8d8c..ba2efcf2b86e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -288,6 +288,21 @@ static void __set_spte(u64 *sptep, u64 spte)
 #endif
 }
 
+static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+{
+#ifdef CONFIG_X86_64
+	return xchg(sptep, new_spte);
+#else
+	u64 old_spte;
+
+	do {
+		old_spte = *sptep;
+	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+
+	return old_spte;
+#endif
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  struct kmem_cache *base_cache, int min)
 {
@@ -653,18 +668,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
 {
 	pfn_t pfn;
+	u64 old_spte;
 
-	if (!is_rmap_spte(*sptep)) {
-		__set_spte(sptep, new_spte);
+	old_spte = __xchg_spte(sptep, new_spte);
+	if (!is_rmap_spte(old_spte))
 		return;
-	}
-	pfn = spte_to_pfn(*sptep);
-	if (*sptep & shadow_accessed_mask)
+	pfn = spte_to_pfn(old_spte);
+	if (old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
-	if (is_writable_pte(*sptep))
+	if (is_writable_pte(old_spte))
 		kvm_set_pfn_dirty(pfn);
 	rmap_remove(kvm, sptep);
-	__set_spte(sptep, new_spte);
 }
 
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
-- 
cgit v1.2.3


From b79b93f92cb3b66b89d75525fdfd2454b1e1f446 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 6 Jun 2010 15:46:44 +0300
Subject: KVM: MMU: Don't drop accessed bit while updating an spte

__set_spte() will happily replace an spte with the accessed bit set with
one that has the accessed bit clear.  Add a helper update_spte() which checks
for this condition and updates the page flag if needed.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ba2efcf2b86e..d8d48329cb82 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -303,6 +303,19 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
 #endif
 }
 
+static void update_spte(u64 *sptep, u64 new_spte)
+{
+	u64 old_spte;
+
+	if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) {
+		__set_spte(sptep, new_spte);
+	} else {
+		old_spte = __xchg_spte(sptep, new_spte);
+		if (old_spte & shadow_accessed_mask)
+			mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+	}
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  struct kmem_cache *base_cache, int min)
 {
@@ -721,7 +734,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		if (is_writable_pte(*spte)) {
-			__set_spte(spte, *spte & ~PT_WRITABLE_MASK);
+			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
 			write_protected = 1;
 		}
 		spte = rmap_next(kvm, rmapp, spte);
@@ -777,7 +790,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			     unsigned long data)
 {
 	int need_flush = 0;
-	u64 *spte, new_spte;
+	u64 *spte, new_spte, old_spte;
 	pte_t *ptep = (pte_t *)data;
 	pfn_t new_pfn;
 
@@ -797,9 +810,13 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
+			new_spte &= ~shadow_accessed_mask;
 			if (is_writable_pte(*spte))
 				kvm_set_pfn_dirty(spte_to_pfn(*spte));
-			__set_spte(spte, new_spte);
+			old_spte = __xchg_spte(spte, new_spte);
+			if (is_shadow_present_pte(old_spte)
+			    && (old_spte & shadow_accessed_mask))
+				mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
 			spte = rmap_next(kvm, rmapp, spte);
 		}
 	}
@@ -1922,7 +1939,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
-	__set_spte(sptep, spte);
+	update_spte(sptep, spte);
 done:
 	return ret;
 }
-- 
cgit v1.2.3


From a5046e6c7d97d6574ffe6367311ea0b0de56aa58 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 6 Jul 2010 16:49:05 +0800
Subject: KVM: x86 emulator: fix 'mov sreg,rm16' instruction decoding

Memory reads for 'mov sreg,rm16' should be 16 bits only.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e8bdddc4509e..d842a7d2bc64 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -170,7 +170,7 @@ static u32 opcode_table[256] = {
 	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
 	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
 	DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
-	ImplicitOps | SrcMem | ModRM, Group | Group1A,
+	ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
 	/* 0x90 - 0x97 */
 	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
 	/* 0x98 - 0x9F */
-- 
cgit v1.2.3


From ce7a0ad3bdcd86e6cf907eb5992fecb1503daa26 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 6 Jul 2010 16:50:21 +0800
Subject: KVM: x86 emulator: fix the comment of out instruction

Fix the comment of out instruction, using the same style as the
other instructions.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d842a7d2bc64..ad8d7cdd1eb9 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2949,8 +2949,8 @@ special_insn:
 				     &c->dst.val))
 			goto done; /* IO is needed */
 		break;
-	case 0xee: /* out al,dx */
-	case 0xef: /* out (e/r)ax,dx */
+	case 0xee: /* out dx,al */
+	case 0xef: /* out dx,(e/r)ax */
 		c->src.val = c->regs[VCPU_REGS_RDX];
 	do_io_out:
 		c->dst.bytes = min(c->dst.bytes, 4u);
-- 
cgit v1.2.3


From e97e883f8bfbe02cfc2bfff45e68921dfe590c7e Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 6 Jul 2010 16:51:09 +0800
Subject: KVM: x86 emulator: fix 'and AL,imm8' instruction decoding

'and AL,imm8' should be mask as ByteOp, otherwise the dest operand
length will no correct and we may fill the full EAX when writeback.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index ad8d7cdd1eb9..59568ad21ab3 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -123,7 +123,7 @@ static u32 opcode_table[256] = {
 	/* 0x20 - 0x27 */
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
+	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
 	/* 0x28 - 0x2F */
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-- 
cgit v1.2.3


From b16b2b7bb5a78afceb7fe22f2a04476cd70182b7 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 6 Jul 2010 16:52:53 +0800
Subject: KVM: x86 emulator: fix 'mov rm,sreg' instruction decoding

The source operand of 'mov rm,sreg' is segment register, not
general-purpose register, so remove SrcReg from decoding.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 59568ad21ab3..8337567a0f44 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -169,7 +169,7 @@ static u32 opcode_table[256] = {
 	/* 0x88 - 0x8F */
 	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
 	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
+	DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
 	ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
 	/* 0x90 - 0x97 */
 	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
-- 
cgit v1.2.3


From 07cbc6c185aee2c0479776845988242a040c7c93 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 6 Jul 2010 16:54:19 +0800
Subject: KVM: x86 emulator: fix cli/sti instruction emulation

If IOPL check fail, the cli/sti emulate GP and then we should
skip writeback since the default write OP is OP_REG.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 8337567a0f44..286572a5675b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2979,17 +2979,19 @@ special_insn:
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xfa: /* cli */
-		if (emulator_bad_iopl(ctxt, ops))
+		if (emulator_bad_iopl(ctxt, ops)) {
 			emulate_gp(ctxt, 0);
-		else {
+			goto done;
+		} else {
 			ctxt->eflags &= ~X86_EFLAGS_IF;
 			c->dst.type = OP_NONE;	/* Disable writeback. */
 		}
 		break;
 	case 0xfb: /* sti */
-		if (emulator_bad_iopl(ctxt, ops))
+		if (emulator_bad_iopl(ctxt, ops)) {
 			emulate_gp(ctxt, 0);
-		else {
+			goto done;
+		} else {
 			ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
 			ctxt->eflags |= X86_EFLAGS_IF;
 			c->dst.type = OP_NONE;	/* Disable writeback. */
-- 
cgit v1.2.3


From 5d55f299f97769130c6cc67896414c988db309ab Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Wed, 7 Jul 2010 17:43:35 +0800
Subject: KVM: x86 emulator: re-implementing 'mov AL,moffs' instruction
 decoding

This patch change to use DstAcc for decoding 'mov AL, moffs'
and introduced SrcAcc for decoding 'mov moffs, AL'.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/emulate.c | 32 +++++++++++++++++++++++---------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 286572a5675b..255473f974ab 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -70,6 +70,7 @@
 #define SrcSI       (0xa<<4)	/* Source is in the DS:RSI */
 #define SrcImmFAddr (0xb<<4)	/* Source is immediate far address */
 #define SrcMemFAddr (0xc<<4)	/* Source is far address in memory */
+#define SrcAcc      (0xd<<4)	/* Source Accumulator */
 #define SrcMask     (0xf<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<8)
@@ -177,8 +178,8 @@ static u32 opcode_table[256] = {
 	0, 0, SrcImmFAddr | No64, 0,
 	ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
 	/* 0xA0 - 0xA7 */
-	ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
-	ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
+	ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
+	ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
 	ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
 	ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
 	/* 0xA8 - 0xAF */
@@ -1186,6 +1187,25 @@ done_prefixes:
 		else
 			c->src.val = insn_fetch(u8, 1, c->eip);
 		break;
+	case SrcAcc:
+		c->src.type = OP_REG;
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->src.ptr = &c->regs[VCPU_REGS_RAX];
+		switch (c->src.bytes) {
+			case 1:
+				c->src.val = *(u8 *)c->src.ptr;
+				break;
+			case 2:
+				c->src.val = *(u16 *)c->src.ptr;
+				break;
+			case 4:
+				c->src.val = *(u32 *)c->src.ptr;
+				break;
+			case 8:
+				c->src.val = *(u64 *)c->src.ptr;
+				break;
+		}
+		break;
 	case SrcOne:
 		c->src.bytes = 1;
 		c->src.val = 1;
@@ -2854,13 +2874,7 @@ special_insn:
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 		break;
-	case 0xa0 ... 0xa1:	/* mov */
-		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
-		c->dst.val = c->src.val;
-		break;
-	case 0xa2 ... 0xa3:	/* mov */
-		c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
-		break;
+	case 0xa0 ... 0xa3:	/* mov */
 	case 0xa4 ... 0xa5:	/* movs */
 		goto mov;
 	case 0xa6 ... 0xa7:	/* cmps */
-- 
cgit v1.2.3


From b0eeec29fe7a5b114000f769bd68ffa02652bfb7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 6 Jul 2010 15:40:18 +0300
Subject: KVM: MMU: Only indicate a fetch fault in page fault error code if nx
 is enabled

Bit 4 of the page fault error code is set only if EFER.NX is set.

Reviewed-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 796a325c7e59..3a3f6d784d79 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -245,7 +245,7 @@ err:
 		walker->error_code |= PFERR_WRITE_MASK;
 	if (user_fault)
 		walker->error_code |= PFERR_USER_MASK;
-	if (fetch_fault)
+	if (fetch_fault && is_nx(vcpu))
 		walker->error_code |= PFERR_FETCH_MASK;
 	if (rsvd_fault)
 		walker->error_code |= PFERR_RSVD_MASK;
-- 
cgit v1.2.3


From f59c1d2ded54e4bd7a9126f4a32c9eca8b336457 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 6 Jul 2010 16:20:43 +0300
Subject: KVM: MMU: Keep going on permission error

Real hardware disregards permission errors when computing page fault error
code bit 0 (page present).  Do the same.

Reviewed-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 52 ++++++++++++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 3a3f6d784d79..1cea41cad069 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -119,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 {
 	pt_element_t pte;
 	gfn_t table_gfn;
-	unsigned index, pt_access, pte_access;
+	unsigned index, pt_access, uninitialized_var(pte_access);
 	gpa_t pte_gpa;
-	int rsvd_fault = 0;
+	bool eperm, present, rsvd_fault;
 
 	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
 				     fetch_fault);
 walk:
+	present = true;
+	eperm = rsvd_fault = false;
 	walker->level = vcpu->arch.mmu.root_level;
 	pte = vcpu->arch.cr3;
 #if PTTYPE == 64
 	if (!is_long_mode(vcpu)) {
 		pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
-		if (!is_present_gpte(pte))
-			goto not_present;
+		if (!is_present_gpte(pte)) {
+			present = false;
+			goto error;
+		}
 		--walker->level;
 	}
 #endif
@@ -151,31 +155,36 @@ walk:
 		walker->table_gfn[walker->level - 1] = table_gfn;
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 
-		if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
-			goto not_present;
+		if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
+			present = false;
+			break;
+		}
 
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
-		if (!is_present_gpte(pte))
-			goto not_present;
+		if (!is_present_gpte(pte)) {
+			present = false;
+			break;
+		}
 
-		rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
-		if (rsvd_fault)
-			goto access_error;
+		if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
+			rsvd_fault = true;
+			break;
+		}
 
 		if (write_fault && !is_writable_pte(pte))
 			if (user_fault || is_write_protection(vcpu))
-				goto access_error;
+				eperm = true;
 
 		if (user_fault && !(pte & PT_USER_MASK))
-			goto access_error;
+			eperm = true;
 
 #if PTTYPE == 64
 		if (fetch_fault && (pte & PT64_NX_MASK))
-			goto access_error;
+			eperm = true;
 #endif
 
-		if (!(pte & PT_ACCESSED_MASK)) {
+		if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
 			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
 						       sizeof(pte));
 			if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
@@ -214,6 +223,9 @@ walk:
 		--walker->level;
 	}
 
+	if (!present || eperm || rsvd_fault)
+		goto error;
+
 	if (write_fault && !is_dirty_gpte(pte)) {
 		bool ret;
 
@@ -233,14 +245,10 @@ walk:
 		 __func__, (u64)pte, pte_access, pt_access);
 	return 1;
 
-not_present:
+error:
 	walker->error_code = 0;
-	goto err;
-
-access_error:
-	walker->error_code = PFERR_PRESENT_MASK;
-
-err:
+	if (present)
+		walker->error_code |= PFERR_PRESENT_MASK;
 	if (write_fault)
 		walker->error_code |= PFERR_WRITE_MASK;
 	if (user_fault)
-- 
cgit v1.2.3


From 673813e81d8468e80b6dd0fa839923eb9748dc49 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 7 Jul 2010 15:02:25 +0200
Subject: KVM: fix lock imbalance in kvm_create_pit()

Stanse found that there is an omitted unlock in kvm_create_pit in one fail
path. Add proper unlock there.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org
Cc: Gleb Natapov <gleb@redhat.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Gregory Haskins <ghaskins@novell.com>
Cc: kvm@vger.kernel.org
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8254.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 467cc47fb733..70db4d435393 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -696,6 +696,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 
 	pit->wq = create_singlethread_workqueue("kvm-pit-wq");
 	if (!pit->wq) {
+		mutex_unlock(&pit->pit_state.lock);
 		kfree(pit);
 		return NULL;
 	}
-- 
cgit v1.2.3


From fa7bff8f8a7d3de61c0473d0b6dc5a0f4fdc6ac9 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 7 Jul 2010 20:16:44 +0300
Subject: KVM: define hwpoison variables static

They are not used outside of the file.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a60b6b053b6f..630d1224f187 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -93,8 +93,8 @@ static bool kvm_rebooting;
 
 static bool largepages_enabled = true;
 
-struct page *hwpoison_page;
-pfn_t hwpoison_pfn;
+static struct page *hwpoison_page;
+static pfn_t hwpoison_pfn;
 
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
-- 
cgit v1.2.3


From edba23e51578f7cb6781461568489fc1825db4ac Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 7 Jul 2010 20:16:45 +0300
Subject: KVM: Return EFAULT from kvm ioctl when guest accesses bad area

Currently if guest access address that belongs to memory slot but is not
backed up by page or page is read only KVM treats it like MMIO access.
Remove that capability. It was never part of the interface and should
not be relied upon.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c       |  4 +++-
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 28 ++++++++++++++++++++++++----
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d8d48329cb82..89d7a2cae53b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2078,7 +2078,9 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
 	if (is_hwpoison_pfn(pfn)) {
 		kvm_send_hwpoison_signal(kvm, gfn);
 		return 0;
-	}
+	} else if (is_fault_pfn(pfn))
+		return -EFAULT;
+
 	return 1;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e796326f3646..8055067b6bec 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -269,6 +269,7 @@ extern pfn_t bad_pfn;
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
 int is_hwpoison_pfn(pfn_t pfn);
+int is_fault_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
 			  struct kvm_userspace_memory_region *mem,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 630d1224f187..b78b794c1039 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -96,6 +96,9 @@ static bool largepages_enabled = true;
 static struct page *hwpoison_page;
 static pfn_t hwpoison_pfn;
 
+static struct page *fault_page;
+static pfn_t fault_pfn;
+
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
@@ -815,13 +818,13 @@ EXPORT_SYMBOL_GPL(kvm_disable_largepages);
 
 int is_error_page(struct page *page)
 {
-	return page == bad_page || page == hwpoison_page;
+	return page == bad_page || page == hwpoison_page || page == fault_page;
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 
 int is_error_pfn(pfn_t pfn)
 {
-	return pfn == bad_pfn || pfn == hwpoison_pfn;
+	return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
 }
 EXPORT_SYMBOL_GPL(is_error_pfn);
 
@@ -831,6 +834,12 @@ int is_hwpoison_pfn(pfn_t pfn)
 }
 EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
 
+int is_fault_pfn(pfn_t pfn)
+{
+	return pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_fault_pfn);
+
 static inline unsigned long bad_hva(void)
 {
 	return PAGE_OFFSET;
@@ -959,8 +968,8 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
 		if (vma == NULL || addr < vma->vm_start ||
 		    !(vma->vm_flags & VM_PFNMAP)) {
 			up_read(&current->mm->mmap_sem);
-			get_page(bad_page);
-			return page_to_pfn(bad_page);
+			get_page(fault_page);
+			return page_to_pfn(fault_page);
 		}
 
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -2226,6 +2235,15 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
 	hwpoison_pfn = page_to_pfn(hwpoison_page);
 
+	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (fault_page == NULL) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
+	fault_pfn = page_to_pfn(fault_page);
+
 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
 		r = -ENOMEM;
 		goto out_free_0;
@@ -2298,6 +2316,8 @@ out_free_1:
 out_free_0a:
 	free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
+	if (fault_page)
+		__free_page(fault_page);
 	if (hwpoison_page)
 		__free_page(hwpoison_page);
 	__free_page(bad_page);
-- 
cgit v1.2.3


From a6f177efaa5856e22ed0d3c1e81e65b41654d083 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 8 Jul 2010 12:41:12 +0300
Subject: KVM: Reenter guest after emulation failure if due to access to
 non-mmio address

When shadow pages are in use sometimes KVM try to emulate an instruction
when it accesses a shadowed page. If emulation fails KVM un-shadows the
page and reenter guest to allow vcpu to execute the instruction. If page
is not in shadow page hash KVM assumes that this was attempt to do MMIO
and reports emulation failure to userspace since there is no way to fix
the situation. This logic has a race though. If two vcpus tries to write
to the same shadowed page simultaneously both will enter emulator, but
only one of them will find the page in shadow page hash since the one who
founds it also removes it from there, so another cpu will report failure
to userspace and will abort the guest.

Fix this by checking (in addition to checking shadowed page hash) that
page that caused the emulation belongs to valid memory slot. If it is
then reenter the guest to allow vcpu to reexecute the instruction.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3d72fc067059..d51eed239b47 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3930,6 +3930,29 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
 	return EMULATE_FAIL;
 }
 
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	gpa_t gpa;
+
+	/*
+	 * if emulation was due to access to shadowed page table
+	 * and it failed try to unshadow page and re-entetr the
+	 * guest to let CPU execute the instruction.
+	 */
+	if (kvm_mmu_unprotect_page_virt(vcpu, gva))
+		return true;
+
+	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+
+	if (gpa == UNMAPPED_GVA)
+		return true; /* let cpu generate fault */
+
+	if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+		return true;
+
+	return false;
+}
+
 int emulate_instruction(struct kvm_vcpu *vcpu,
 			unsigned long cr2,
 			u16 error_code,
@@ -3998,7 +4021,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
 		++vcpu->stat.insn_emulation;
 		if (r)  {
-			if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+			if (reexecute_instruction(vcpu, cr2))
 				return EMULATE_DONE;
 			if (emulation_type & EMULTYPE_SKIP)
 				return EMULATE_FAIL;
@@ -4019,12 +4042,7 @@ restart:
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 
 	if (r) { /* emulation failed */
-		/*
-		 * if emulation was due to access to shadowed page table
-		 * and it failed try to unshadow page and re-entetr the
-		 * guest to let CPU execute the instruction.
-		 */
-		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+		if (reexecute_instruction(vcpu, cr2))
 			return EMULATE_DONE;
 
 		return handle_emulation_failure(vcpu);
-- 
cgit v1.2.3


From aea924f606c309feead37ab5c43f410a08ff3826 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Sat, 10 Jul 2010 17:37:56 +0800
Subject: KVM: PIT: stop vpit before freeing irq_routing

Fix:
general protection fault: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
......
Call Trace:
 [<ffffffffa0159bd1>] ? kvm_set_irq+0xdd/0x24b [kvm]
 [<ffffffff8106ea8b>] ? trace_hardirqs_off_caller+0x1f/0x10e
 [<ffffffff813ad17f>] ? sub_preempt_count+0xe/0xb6
 [<ffffffff8106d273>] ? put_lock_stats+0xe/0x27
...
RIP  [<ffffffffa0159c72>] kvm_set_irq+0x17e/0x24b [kvm]

This bug is triggered when guest is shutdown, is because we freed
irq_routing before pit thread stopped

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/i8254.c | 3 +++
 arch/x86/kvm/x86.c   | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 70db4d435393..0fd6378981f4 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -752,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm)
 	struct hrtimer *timer;
 
 	if (kvm->arch.vpit) {
+		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
+		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+					      &kvm->arch.vpit->speaker_dev);
 		kvm_unregister_irq_mask_notifier(kvm, 0,
 					       &kvm->arch.vpit->mask_notifier);
 		kvm_unregister_irq_ack_notifier(kvm,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d51eed239b47..d721e2d81a53 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5523,12 +5523,12 @@ static void kvm_free_vcpus(struct kvm *kvm)
 void kvm_arch_sync_events(struct kvm *kvm)
 {
 	kvm_free_all_assigned_devices(kvm);
+	kvm_free_pit(kvm);
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	kvm_iommu_unmap_guest(kvm);
-	kvm_free_pit(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
 	kvm_free_vcpus(kvm);
-- 
cgit v1.2.3


From 908e75f3e70ca580cc20442cf6780dcc2d0557b7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 7 Jul 2010 14:09:38 +0300
Subject: KVM: Expose MCE control MSRs to userspace

Userspace needs to reset and save/restore these MSRs.

The MCE banks are not exposed since their number varies from vcpu to vcpu.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d721e2d81a53..eb55ec55125c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -744,6 +744,8 @@ static unsigned num_msrs_to_save;
 
 static u32 emulated_msrs[] = {
 	MSR_IA32_MISC_ENABLE,
+	MSR_IA32_MCG_STATUS,
+	MSR_IA32_MCG_CTL,
 };
 
 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
-- 
cgit v1.2.3


From 2e2602ca5df09e4c4e2b593bc30abead9a8617d4 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 7 Jul 2010 14:09:39 +0300
Subject: KVM: Document MCE banks non-exposure via KVM_GET_MSR_INDEX_LIST

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/kvm/api.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index d9b00f15fbe6..179cc51c15e6 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -126,6 +126,10 @@ user fills in the size of the indices array in nmsrs, and in return
 kvm adjusts nmsrs to reflect the actual number of msrs and fills in
 the indices array with their numbers.
 
+Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are
+not returned in the MSR list, as different vcpus can have a different number
+of banks, as set via the KVM_X86_SETUP_MCE ioctl.
+
 4.4 KVM_CHECK_EXTENSION
 
 Capability: basic
-- 
cgit v1.2.3


From 4a994358b919c3b14de61be5e30d9edc9089ba3f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 11 Jul 2010 15:32:23 +0300
Subject: KVM: Convert mask notifiers to use irqchip/pin instead of gsi

Devices register mask notifier using gsi, but irqchip knows about
irqchip/pin, so conversion from irqchip/pin to gsi should be done before
looking for mask notifier to call.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 include/linux/kvm_host.h |  3 ++-
 virt/kvm/ioapic.c        |  2 +-
 virt/kvm/irq_comm.c      | 12 ++++++++----
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8055067b6bec..c13cc48697aa 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -447,7 +447,8 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
 				    struct kvm_irq_mask_notifier *kimn);
 void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 				      struct kvm_irq_mask_notifier *kimn);
-void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+			     bool mask);
 
 #ifdef __KVM_HAVE_IOAPIC
 void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 1149c60b198f..0b9df8303dcf 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -152,7 +152,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 		update_handled_vectors(ioapic);
 		mask_after = e->fields.mask;
 		if (mask_before != mask_after)
-			kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after);
+			kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
 		if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
 		    && ioapic->irr & (1 << index))
 			ioapic_service(ioapic, index);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 06cf61e729d2..369e38010ad5 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -279,15 +279,19 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 	synchronize_rcu();
 }
 
-void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+			     bool mask)
 {
 	struct kvm_irq_mask_notifier *kimn;
 	struct hlist_node *n;
+	int gsi;
 
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
-		if (kimn->irq == irq)
-			kimn->func(kimn, mask);
+	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+	if (gsi != -1)
+		hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
+			if (kimn->irq == gsi)
+				kimn->func(kimn, mask);
 	rcu_read_unlock();
 }
 
-- 
cgit v1.2.3


From 32ef26a3598636be520abed90ed0c2f439d36bbe Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 13 Jul 2010 14:27:04 +0300
Subject: KVM: MMU: Add link_shadow_page() helper

To simplify the process of fetching an spte, add a helper that links
a shadow page to an spte.

Reviewed-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c         | 10 ++++++++++
 arch/x86/kvm/paging_tmpl.h |  7 ++-----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 89d7a2cae53b..df3a7a79cce3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1482,6 +1482,16 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
 	--iterator->level;
 }
 
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+{
+	u64 spte;
+
+	spte = __pa(sp->spt)
+		| PT_PRESENT_MASK | PT_ACCESSED_MASK
+		| PT_WRITABLE_MASK | PT_USER_MASK;
+	*sptep = spte;
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 					 struct kvm_mmu_page *sp)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 1cea41cad069..36dc0749c878 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -309,7 +309,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 {
 	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *sp;
-	u64 spte, *sptep = NULL;
+	u64 *sptep = NULL;
 	int direct;
 	gfn_t table_gfn;
 	int r;
@@ -395,10 +395,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			}
 		}
 
-		spte = __pa(sp->spt)
-			| PT_PRESENT_MASK | PT_ACCESSED_MASK
-			| PT_WRITABLE_MASK | PT_USER_MASK;
-		*sptep = spte;
+		link_shadow_page(sptep, sp);
 	}
 
 	return sptep;
-- 
cgit v1.2.3


From 121eee97a7802acda8b78436cc53196e9885549f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 13 Jul 2010 14:27:05 +0300
Subject: KVM: MMU: Use __set_spte to link shadow pages

To avoid split accesses to 64 bit sptes on i386, use __set_spte() to link
shadow pages together.

(not technically required since shadow pages are __GFP_KERNEL, so upper 32
bits are always clear)

Reviewed-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index df3a7a79cce3..5a6019a534a3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1489,7 +1489,7 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 	spte = __pa(sp->spt)
 		| PT_PRESENT_MASK | PT_ACCESSED_MASK
 		| PT_WRITABLE_MASK | PT_USER_MASK;
-	*sptep = spte;
+	__set_spte(sptep, spte);
 }
 
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
-- 
cgit v1.2.3


From a3aa51cfaafe9179add88db20506ccb07e030b47 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 13 Jul 2010 14:27:06 +0300
Subject: KVM: MMU: Add drop_large_spte() helper

To clarify spte fetching code, move large spte handling into a helper.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c         | 8 ++++++++
 arch/x86/kvm/paging_tmpl.h | 5 +----
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5a6019a534a3..b75d6cb44ab6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1492,6 +1492,14 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 	__set_spte(sptep, spte);
 }
 
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+	if (is_large_pte(*sptep)) {
+		drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+		kvm_flush_remote_tlbs(vcpu->kvm);
+	}
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 					 struct kvm_mmu_page *sp)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 36dc0749c878..0fb7068d64c7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -360,10 +360,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		}
 
-		if (is_large_pte(*sptep)) {
-			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
-			kvm_flush_remote_tlbs(vcpu->kvm);
-		}
+		drop_large_spte(vcpu, sptep);
 
 		if (level <= gw->level) {
 			direct = 1;
-- 
cgit v1.2.3


From a357bd229cdaf37a41798d238ab50b34c71dd0d6 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 13 Jul 2010 14:27:07 +0300
Subject: KVM: MMU: Add validate_direct_spte() helper

Add a helper to verify that a direct shadow page is valid wrt the required
access permissions; drop the page if it is not valid.

Reviewed-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/mmu.c         | 23 +++++++++++++++++++++++
 arch/x86/kvm/paging_tmpl.h | 27 ++++++---------------------
 2 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b75d6cb44ab6..36c62f33513f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1500,6 +1500,29 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
 	}
 }
 
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+				   unsigned direct_access)
+{
+	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+		struct kvm_mmu_page *child;
+
+		/*
+		 * For the direct sp, if the guest pte's dirty bit
+		 * changed form clean to dirty, it will corrupt the
+		 * sp's access: allow writable in the read-only sp,
+		 * so we should update the spte at this point to get
+		 * a new sp with the correct access.
+		 */
+		child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+		if (child->role.access == direct_access)
+			return;
+
+		mmu_page_remove_parent_pte(child, sptep);
+		__set_spte(sptep, shadow_trap_nonpresent_pte);
+		kvm_flush_remote_tlbs(vcpu->kvm);
+	}
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 					 struct kvm_mmu_page *sp)
 {
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 0fb7068d64c7..0c7461d3a5be 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -338,30 +338,15 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			break;
 		}
 
-		if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
-			struct kvm_mmu_page *child;
-
-			if (level != gw->level)
-				continue;
-
-			/*
-			 * For the direct sp, if the guest pte's dirty bit
-			 * changed form clean to dirty, it will corrupt the
-			 * sp's access: allow writable in the read-only sp,
-			 * so we should update the spte at this point to get
-			 * a new sp with the correct access.
-			 */
-			child = page_header(*sptep & PT64_BASE_ADDR_MASK);
-			if (child->role.access == direct_access)
-				continue;
-
-			mmu_page_remove_parent_pte(child, sptep);
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
-			kvm_flush_remote_tlbs(vcpu->kvm);
-		}
+		if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)
+		    && level == gw->level)
+			validate_direct_spte(vcpu, sptep, direct_access);
 
 		drop_large_spte(vcpu, sptep);
 
+		if (is_shadow_present_pte(*sptep))
+			continue;
+
 		if (level <= gw->level) {
 			direct = 1;
 			access = direct_access;
-- 
cgit v1.2.3


From 39c8c672a18c52048343d7531dfb2dcf3431ee74 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 13 Jul 2010 14:27:08 +0300
Subject: KVM: MMU: Add gpte_valid() helper

Move the code to check whether a gpte has changed since we fetched it into
a helper.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 0c7461d3a5be..e1c1f9eb1cc1 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -299,6 +299,17 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 		     gpte_to_gfn(gpte), pfn, true, true);
 }
 
+static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
+				struct guest_walker *gw, int level)
+{
+	int r;
+	pt_element_t curr_pte;
+
+	r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
+				  &curr_pte, sizeof(curr_pte));
+	return r || curr_pte != gw->ptes[level - 1];
+}
+
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
@@ -312,11 +323,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	u64 *sptep = NULL;
 	int direct;
 	gfn_t table_gfn;
-	int r;
 	int level;
 	bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
 	unsigned direct_access;
-	pt_element_t curr_pte;
 	struct kvm_shadow_walk_iterator iterator;
 
 	if (!is_present_gpte(gw->ptes[gw->level - 1]))
@@ -365,17 +374,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		}
 		sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
 					       direct, access, sptep);
-		if (!direct) {
-			r = kvm_read_guest_atomic(vcpu->kvm,
-						  gw->pte_gpa[level - 2],
-						  &curr_pte, sizeof(curr_pte));
-			if (r || curr_pte != gw->ptes[level - 2]) {
+		if (!direct)
+			/*
+			 * Verify that the gpte in the page we've just write
+			 * protected is still there.
+			 */
+			if (FNAME(gpte_changed)(vcpu, gw, level - 1)) {
 				kvm_mmu_put_page(sp, sptep);
 				kvm_release_pfn_clean(pfn);
 				sptep = NULL;
 				break;
 			}
-		}
 
 		link_shadow_page(sptep, sp);
 	}
-- 
cgit v1.2.3


From 0b3c933302262d83018dd5f69656bca9f28a0cd3 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 13 Jul 2010 14:27:09 +0300
Subject: KVM: MMU: Simplify spte fetch() function

Partition the function into three sections:

- fetching indirect shadow pages (host_level > guest_level)
- fetching direct shadow pages (page_level < host_level <= guest_level)
- the final spte (page_level == host_level)

Instead of the current spaghetti.

A slight change from the original code is that we call validate_direct_spte()
more often: previously we called it only for gw->level, now we also call it for
lower levels.  The change should have no effect.

[xiao: fix regression caused by validate_direct_spte() called too late]

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 93 ++++++++++++++++++++++++----------------------
 1 file changed, 49 insertions(+), 44 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e1c1f9eb1cc1..368e4cb6233b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -321,9 +321,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *sp;
 	u64 *sptep = NULL;
-	int direct;
-	gfn_t table_gfn;
-	int level;
+	int uninitialized_var(level);
 	bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
 	unsigned direct_access;
 	struct kvm_shadow_walk_iterator iterator;
@@ -335,61 +333,68 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (!dirty)
 		direct_access &= ~ACC_WRITE_MASK;
 
-	for_each_shadow_entry(vcpu, addr, iterator) {
+	for (shadow_walk_init(&iterator, vcpu, addr);
+	     shadow_walk_okay(&iterator) && iterator.level > gw->level;
+	     shadow_walk_next(&iterator)) {
+		gfn_t table_gfn;
+
 		level = iterator.level;
 		sptep = iterator.sptep;
-		if (iterator.level == hlevel) {
-			mmu_set_spte(vcpu, sptep, access,
-				     gw->pte_access & access,
-				     user_fault, write_fault,
-				     dirty, ptwrite, level,
-				     gw->gfn, pfn, false, true);
-			break;
-		}
-
-		if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)
-		    && level == gw->level)
-			validate_direct_spte(vcpu, sptep, direct_access);
 
 		drop_large_spte(vcpu, sptep);
 
 		if (is_shadow_present_pte(*sptep))
 			continue;
 
-		if (level <= gw->level) {
-			direct = 1;
-			access = direct_access;
-
-			/*
-			 * It is a large guest pages backed by small host pages,
-			 * So we set @direct(@sp->role.direct)=1, and set
-			 * @table_gfn(@sp->gfn)=the base page frame for linear
-			 * translations.
-			 */
-			table_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
-			access &= gw->pte_access;
-		} else {
-			direct = 0;
-			table_gfn = gw->table_gfn[level - 2];
-		}
+		table_gfn = gw->table_gfn[level - 2];
 		sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
-					       direct, access, sptep);
-		if (!direct)
-			/*
-			 * Verify that the gpte in the page we've just write
-			 * protected is still there.
-			 */
-			if (FNAME(gpte_changed)(vcpu, gw, level - 1)) {
-				kvm_mmu_put_page(sp, sptep);
-				kvm_release_pfn_clean(pfn);
-				sptep = NULL;
-				break;
-			}
+				      false, access, sptep);
+
+		/*
+		 * Verify that the gpte in the page we've just write
+		 * protected is still there.
+		 */
+		if (FNAME(gpte_changed)(vcpu, gw, level - 1))
+			goto out_gpte_changed;
+
+		link_shadow_page(sptep, sp);
+	}
+
+	for (;
+	     shadow_walk_okay(&iterator) && iterator.level > hlevel;
+	     shadow_walk_next(&iterator)) {
+		gfn_t direct_gfn;
 
+		level = iterator.level;
+		sptep = iterator.sptep;
+
+		validate_direct_spte(vcpu, sptep, direct_access);
+
+		drop_large_spte(vcpu, sptep);
+
+		if (is_shadow_present_pte(*sptep))
+			continue;
+
+		direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
+
+		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, level-1,
+				      true, direct_access, sptep);
 		link_shadow_page(sptep, sp);
 	}
 
+	sptep = iterator.sptep;
+	level = iterator.level;
+
+	mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
+		     user_fault, write_fault, dirty, ptwrite, level,
+		     gw->gfn, pfn, false, true);
+
 	return sptep;
+
+out_gpte_changed:
+	kvm_mmu_put_page(sp, sptep);
+	kvm_release_pfn_clean(pfn);
+	return NULL;
 }
 
 /*
-- 
cgit v1.2.3


From 5991b33237b7fc7dd9f62ae04998c42217d444a7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 13 Jul 2010 14:27:10 +0300
Subject: KVM: MMU: Validate all gptes during fetch, not just those used for
 new pages

Currently, when we fetch an spte, we only verify that gptes match those that
the walker saw if we build new shadow pages for them.

However, this misses the following race:

  vcpu1            vcpu2

  walk
                  change gpte
                  walk
                  instantiate sp

  fetch existing sp

Fix by validating every gpte, regardless of whether it is used for building
a new sp or not.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 368e4cb6233b..8cb85f9c8adb 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -319,10 +319,11 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 int *ptwrite, pfn_t pfn)
 {
 	unsigned access = gw->pt_access;
-	struct kvm_mmu_page *sp;
+	struct kvm_mmu_page *sp = NULL;
 	u64 *sptep = NULL;
 	int uninitialized_var(level);
 	bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
+	int top_level;
 	unsigned direct_access;
 	struct kvm_shadow_walk_iterator iterator;
 
@@ -333,6 +334,18 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (!dirty)
 		direct_access &= ~ACC_WRITE_MASK;
 
+	top_level = vcpu->arch.mmu.root_level;
+	if (top_level == PT32E_ROOT_LEVEL)
+		top_level = PT32_ROOT_LEVEL;
+	/*
+	 * Verify that the top-level gpte is still there.  Since the page
+	 * is a root page, it is either write protected (and cannot be
+	 * changed from now on) or it is invalid (in which case, we don't
+	 * really care if it changes underneath us after this point).
+	 */
+	if (FNAME(gpte_changed)(vcpu, gw, top_level))
+		goto out_gpte_changed;
+
 	for (shadow_walk_init(&iterator, vcpu, addr);
 	     shadow_walk_okay(&iterator) && iterator.level > gw->level;
 	     shadow_walk_next(&iterator)) {
@@ -343,12 +356,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 
 		drop_large_spte(vcpu, sptep);
 
-		if (is_shadow_present_pte(*sptep))
-			continue;
-
-		table_gfn = gw->table_gfn[level - 2];
-		sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
-				      false, access, sptep);
+		sp = NULL;
+		if (!is_shadow_present_pte(*sptep)) {
+			table_gfn = gw->table_gfn[level - 2];
+			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+					      false, access, sptep);
+		}
 
 		/*
 		 * Verify that the gpte in the page we've just write
@@ -357,7 +370,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 		if (FNAME(gpte_changed)(vcpu, gw, level - 1))
 			goto out_gpte_changed;
 
-		link_shadow_page(sptep, sp);
+		if (sp)
+			link_shadow_page(sptep, sp);
 	}
 
 	for (;
@@ -392,7 +406,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	return sptep;
 
 out_gpte_changed:
-	kvm_mmu_put_page(sp, sptep);
+	if (sp)
+		kvm_mmu_put_page(sp, sptep);
 	kvm_release_pfn_clean(pfn);
 	return NULL;
 }
-- 
cgit v1.2.3


From 24157aaf833261e68e5a398fa54bd15e4fa1d0b7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 13 Jul 2010 14:27:11 +0300
Subject: KVM: MMU: Eliminate redundant temporaries in FNAME(fetch)

'level' and 'sptep' are aliases for 'interator.level' and 'iterator.sptep', no
need for them.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 59 +++++++++++++++++++---------------------------
 1 file changed, 24 insertions(+), 35 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 8cb85f9c8adb..d9a2742014e3 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -320,12 +320,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 {
 	unsigned access = gw->pt_access;
 	struct kvm_mmu_page *sp = NULL;
-	u64 *sptep = NULL;
-	int uninitialized_var(level);
 	bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
 	int top_level;
 	unsigned direct_access;
-	struct kvm_shadow_walk_iterator iterator;
+	struct kvm_shadow_walk_iterator it;
 
 	if (!is_present_gpte(gw->ptes[gw->level - 1]))
 		return NULL;
@@ -346,68 +344,59 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 	if (FNAME(gpte_changed)(vcpu, gw, top_level))
 		goto out_gpte_changed;
 
-	for (shadow_walk_init(&iterator, vcpu, addr);
-	     shadow_walk_okay(&iterator) && iterator.level > gw->level;
-	     shadow_walk_next(&iterator)) {
+	for (shadow_walk_init(&it, vcpu, addr);
+	     shadow_walk_okay(&it) && it.level > gw->level;
+	     shadow_walk_next(&it)) {
 		gfn_t table_gfn;
 
-		level = iterator.level;
-		sptep = iterator.sptep;
-
-		drop_large_spte(vcpu, sptep);
+		drop_large_spte(vcpu, it.sptep);
 
 		sp = NULL;
-		if (!is_shadow_present_pte(*sptep)) {
-			table_gfn = gw->table_gfn[level - 2];
-			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
-					      false, access, sptep);
+		if (!is_shadow_present_pte(*it.sptep)) {
+			table_gfn = gw->table_gfn[it.level - 2];
+			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
+					      false, access, it.sptep);
 		}
 
 		/*
 		 * Verify that the gpte in the page we've just write
 		 * protected is still there.
 		 */
-		if (FNAME(gpte_changed)(vcpu, gw, level - 1))
+		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
 			goto out_gpte_changed;
 
 		if (sp)
-			link_shadow_page(sptep, sp);
+			link_shadow_page(it.sptep, sp);
 	}
 
 	for (;
-	     shadow_walk_okay(&iterator) && iterator.level > hlevel;
-	     shadow_walk_next(&iterator)) {
+	     shadow_walk_okay(&it) && it.level > hlevel;
+	     shadow_walk_next(&it)) {
 		gfn_t direct_gfn;
 
-		level = iterator.level;
-		sptep = iterator.sptep;
+		validate_direct_spte(vcpu, it.sptep, direct_access);
 
-		validate_direct_spte(vcpu, sptep, direct_access);
+		drop_large_spte(vcpu, it.sptep);
 
-		drop_large_spte(vcpu, sptep);
-
-		if (is_shadow_present_pte(*sptep))
+		if (is_shadow_present_pte(*it.sptep))
 			continue;
 
-		direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
+		direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
 
-		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, level-1,
-				      true, direct_access, sptep);
-		link_shadow_page(sptep, sp);
+		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
+				      true, direct_access, it.sptep);
+		link_shadow_page(it.sptep, sp);
 	}
 
-	sptep = iterator.sptep;
-	level = iterator.level;
-
-	mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
-		     user_fault, write_fault, dirty, ptwrite, level,
+	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
+		     user_fault, write_fault, dirty, ptwrite, it.level,
 		     gw->gfn, pfn, false, true);
 
-	return sptep;
+	return it.sptep;
 
 out_gpte_changed:
 	if (sp)
-		kvm_mmu_put_page(sp, sptep);
+		kvm_mmu_put_page(sp, it.sptep);
 	kvm_release_pfn_clean(pfn);
 	return NULL;
 }
-- 
cgit v1.2.3


From c0e0608cb902af1a1fd8d413ec0a07ee1e62c652 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 13 Jul 2010 16:40:23 +0300
Subject: KVM: x86: emulator: inc/dec can have lock prefix

Mark inc (0xfe/0 0xff/0) and dec (0xfe/1 0xff/1) as lock prefix capable.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 255473f974ab..b38bd8b92aa6 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -345,10 +345,10 @@ static u32 group_table[] = {
 	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 	0, 0, 0, 0,
 	[Group4*8] =
-	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+	ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
 	0, 0, 0, 0, 0, 0,
 	[Group5*8] =
-	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+	DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
 	SrcMem | ModRM | Stack, 0,
 	SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
 	SrcMem | ModRM | Stack, 0,
-- 
cgit v1.2.3


From d153513ddef5698a292b53790008b2e7660cd08d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 14 Jul 2010 09:45:21 +0300
Subject: KVM: Document KVM_GET_SUPPORTED_CPUID2 ioctl

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 Documentation/kvm/api.txt | 64 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index 179cc51c15e6..5f5b64982b1a 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -968,6 +968,70 @@ struct kvm_xcrs {
 
 This ioctl would set vcpu's xcr to the value userspace specified.
 
+4.45 KVM_GET_SUPPORTED_CPUID
+
+Capability: KVM_CAP_EXT_CPUID
+Architectures: x86
+Type: system ioctl
+Parameters: struct kvm_cpuid2 (in/out)
+Returns: 0 on success, -1 on error
+
+struct kvm_cpuid2 {
+	__u32 nent;
+	__u32 padding;
+	struct kvm_cpuid_entry2 entries[0];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
+#define KVM_CPUID_FLAG_STATEFUL_FUNC    2
+#define KVM_CPUID_FLAG_STATE_READ_NEXT  4
+
+struct kvm_cpuid_entry2 {
+	__u32 function;
+	__u32 index;
+	__u32 flags;
+	__u32 eax;
+	__u32 ebx;
+	__u32 ecx;
+	__u32 edx;
+	__u32 padding[3];
+};
+
+This ioctl returns x86 cpuid features which are supported by both the hardware
+and kvm.  Userspace can use the information returned by this ioctl to
+construct cpuid information (for KVM_SET_CPUID2) that is consistent with
+hardware, kernel, and userspace capabilities, and with user requirements (for
+example, the user may wish to constrain cpuid to emulate older hardware,
+or for feature consistency across a cluster).
+
+Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure
+with the 'nent' field indicating the number of entries in the variable-size
+array 'entries'.  If the number of entries is too low to describe the cpu
+capabilities, an error (E2BIG) is returned.  If the number is too high,
+the 'nent' field is adjusted and an error (ENOMEM) is returned.  If the
+number is just right, the 'nent' field is adjusted to the number of valid
+entries in the 'entries' array, which is then filled.
+
+The entries returned are the host cpuid as returned by the cpuid instruction,
+with unknown or unsupported features masked out.  The fields in each entry
+are defined as follows:
+
+  function: the eax value used to obtain the entry
+  index: the ecx value used to obtain the entry (for entries that are
+         affected by ecx)
+  flags: an OR of zero or more of the following:
+        KVM_CPUID_FLAG_SIGNIFCANT_INDEX:
+           if the index field is valid
+        KVM_CPUID_FLAG_STATEFUL_FUNC:
+           if cpuid for this function returns different values for successive
+           invocations; there will be several entries with the same function,
+           all with this flag set
+        KVM_CPUID_FLAG_STATE_READ_NEXT:
+           for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is
+           the first entry to be read by a cpu
+   eax, ebx, ecx, edx: the values returned by the cpuid instruction for
+         this function/index combination
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
-- 
cgit v1.2.3


From 68be0803456b3eed33038be5566710ad7648c854 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Wed, 14 Jul 2010 19:05:45 +0300
Subject: KVM: x86: never re-execute instruction with enabled tdp

With tdp enabled we should get into emulator only when emulating io, so
reexecution will always bring us back into emulator.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index eb55ec55125c..689c2c3182ab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3936,6 +3936,9 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
 {
 	gpa_t gpa;
 
+	if (tdp_enabled)
+		return false;
+
 	/*
 	 * if emulation was due to access to shadowed page table
 	 * and it failed try to unshadow page and re-entetr the
-- 
cgit v1.2.3


From 9195c4da26bbf8860e2e7b648dbf4ab465c7933a Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 15 Jul 2010 12:24:37 +0300
Subject: KVM: x86: Call mask notifiers from pic

If pit delivers interrupt while pic is masking it OS will never do EOI
and ack notifier will not be called so when pit will be unmasked no pit
interrupts will be delivered any more. Calling mask notifiers solves this
issue.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/i8259.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 819b748a33f9..8d10c063d7f2 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -363,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 		}
 	} else
 		switch (s->init_state) {
-		case 0:		/* normal mode */
+		case 0: { /* normal mode */
+			u8 imr_diff = s->imr ^ val,
+				off = (s == &s->pics_state->pics[0]) ? 0 : 8;
 			s->imr = val;
+			for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+				if (imr_diff & (1 << irq))
+					kvm_fire_mask_notifiers(
+						s->pics_state->kvm,
+						SELECT_PIC(irq + off),
+						irq + off,
+						!!(s->imr & (1 << irq)));
 			pic_update_irq(s->pics_state);
 			break;
+		}
 		case 1:
 			s->irq_base = val & 0xf8;
 			s->init_state = 2;
-- 
cgit v1.2.3


From c19b8bd60e19308d5583ef200ddcc782d85d9543 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Thu, 15 Jul 2010 08:51:58 +0800
Subject: KVM: x86 emulator: fix xchg instruction emulation

If the destination is a memory operand and the memory cannot
map to a valid page, the xchg instruction emulation and locked
instruction will not work on io regions and stuck in endless
loop. We should emulate exchange as write to fix it.

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Acked-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 689c2c3182ab..97aab036dabf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3562,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 		goto emul_write;
 
 	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (is_error_page(page)) {
+		kvm_release_page_clean(page);
+		goto emul_write;
+	}
 
 	kaddr = kmap_atomic(page, KM_USER0);
 	kaddr += offset_in_page(gpa);
-- 
cgit v1.2.3


From 6e3e243c3b6e0bbd18c6ce0fbc12bc3fe2d77b34 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 16 Jul 2010 11:52:55 +0200
Subject: KVM: MMU: fix mmu notifier invalidate handler for huge spte

The index wasn't calculated correctly (off by one) for huge spte so KVM guest
was unstable with transparent hugepages.

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 36c62f33513f..812770cddc8d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -850,8 +850,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 			ret = handler(kvm, &memslot->rmap[gfn_offset], data);
 
 			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
-				int idx = gfn_offset;
-				idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+				unsigned long idx;
+				int sh;
+
+				sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
+				idx = ((memslot->base_gfn+gfn_offset) >> sh) -
+					(memslot->base_gfn >> sh);
 				ret |= handler(kvm,
 					&memslot->lpage_info[j][idx].rmap_pde,
 					data);
-- 
cgit v1.2.3


From fa1de2bfc0feb7245328ad25fb3e6d5cd2c903b4 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 16 Jul 2010 11:19:51 +0800
Subject: KVM: MMU: add missing reserved bits check in speculative path

In the speculative path, we should check guest pte's reserved bits just as
the real processor does

Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 9 ++++++++-
 arch/x86/kvm/paging_tmpl.h | 5 +++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 812770cddc8d..d2ea9cabc066 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2697,6 +2697,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 		return;
         }
 
+	if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
+		return;
+
 	++vcpu->kvm->stat.mmu_pte_updated;
 	if (!sp->role.cr4_pae)
 		paging32_update_pte(vcpu, sp, spte, new);
@@ -2775,6 +2778,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       bool guest_initiated)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
+	union kvm_mmu_page_role mask = { .word = 0 };
 	struct kvm_mmu_page *sp;
 	struct hlist_node *node;
 	LIST_HEAD(invalid_list);
@@ -2849,6 +2853,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		}
 	}
 
+	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
 	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
 		pte_size = sp->role.cr4_pae ? 8 : 4;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -2896,7 +2901,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		while (npte--) {
 			entry = *spte;
 			mmu_pte_write_zap_pte(vcpu, sp, spte);
-			if (gentry)
+			if (gentry &&
+			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+			      & mask.word))
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 			if (!remote_flush && need_remote_flush(entry, *spte))
 				remote_flush = true;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d9a2742014e3..51ef9097960d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -638,8 +638,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			return -EINVAL;
 
 		gfn = gpte_to_gfn(gpte);
-		if (gfn != sp->gfns[i] ||
-		      !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
+		if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
+		      || gfn != sp->gfns[i] || !is_present_gpte(gpte)
+		      || !(gpte & PT_ACCESSED_MASK)) {
 			u64 nonpresent;
 
 			if (is_present_gpte(gpte) || !clear_unsync)
-- 
cgit v1.2.3


From daa3db693ce925a14b7e17ab6f306dc0e6a5342c Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 16 Jul 2010 11:23:04 +0800
Subject: KVM: MMU: fix broken page accessed tracking with ept enabled

In current code, if ept is enabled(shadow_accessed_mask = 0), the page
accessed tracking is lost.

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d2ea9cabc066..9b3b916ebeae 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -687,7 +687,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
 	if (!is_rmap_spte(old_spte))
 		return;
 	pfn = spte_to_pfn(old_spte);
-	if (old_spte & shadow_accessed_mask)
+	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
 	if (is_writable_pte(old_spte))
 		kvm_set_pfn_dirty(pfn);
@@ -815,7 +815,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 				kvm_set_pfn_dirty(spte_to_pfn(*spte));
 			old_spte = __xchg_spte(spte, new_spte);
 			if (is_shadow_present_pte(old_spte)
-			    && (old_spte & shadow_accessed_mask))
+			    && (!shadow_accessed_mask ||
+			    old_spte & shadow_accessed_mask))
 				mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
 			spte = rmap_next(kvm, rmapp, spte);
 		}
-- 
cgit v1.2.3


From 9ed5520dd3c9cb79c25f95fce9c57b87637d0fb7 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 16 Jul 2010 11:25:17 +0800
Subject: KVM: MMU: fix page dirty tracking lost while sync page

In sync-page path, if spte.writable is changed, it will lose page dirty
tracking, for example:

assume spte.writable = 0 in a unsync-page, when it's synced, it map spte
to writable(that is spte.writable = 1), later guest write spte.gfn, it means
spte.gfn is dirty, then guest changed this mapping to read-only, after it's
synced,  spte.writable = 0

So, when host release the spte, it detect spte.writable = 0 and not mark page
dirty

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9b3b916ebeae..a04756a26fe2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1985,6 +1985,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 
 set_pte:
+	if (is_writable_pte(*sptep) && !is_writable_pte(spte))
+		kvm_set_pfn_dirty(pfn);
 	update_spte(sptep, spte);
 done:
 	return ret;
@@ -1998,7 +2000,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			 bool reset_host_protection)
 {
 	int was_rmapped = 0;
-	int was_writable = is_writable_pte(*sptep);
 	int rmap_count;
 
 	pgprintk("%s: spte %llx access %x write_fault %d"
@@ -2048,15 +2049,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	page_header_update_slot(vcpu->kvm, sptep, gfn);
 	if (!was_rmapped) {
 		rmap_count = rmap_add(vcpu, sptep, gfn);
-		kvm_release_pfn_clean(pfn);
 		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
 			rmap_recycle(vcpu, sptep, gfn);
-	} else {
-		if (was_writable)
-			kvm_release_pfn_dirty(pfn);
-		else
-			kvm_release_pfn_clean(pfn);
 	}
+	kvm_release_pfn_clean(pfn);
 	if (speculative) {
 		vcpu->arch.last_pte_updated = sptep;
 		vcpu->arch.last_pte_gfn = gfn;
-- 
cgit v1.2.3


From be233d49ea8c1fde9f4afec378dc2c2f16ab0263 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 16 Jul 2010 11:27:10 +0800
Subject: KVM: MMU: don't atomicly set spte if it's not present

If the old mapping is not present, the spte.a is not lost, so no need
atomic operation to set it

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a04756a26fe2..9c7fae08291d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -307,9 +307,10 @@ static void update_spte(u64 *sptep, u64 new_spte)
 {
 	u64 old_spte;
 
-	if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask)) {
+	if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
+	      !is_rmap_spte(*sptep))
 		__set_spte(sptep, new_spte);
-	} else {
+	else {
 		old_spte = __xchg_spte(sptep, new_spte);
 		if (old_spte & shadow_accessed_mask)
 			mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
-- 
cgit v1.2.3


From e4b502ead259fcf70839414abb7c8cdc3b523f01 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 16 Jul 2010 11:28:09 +0800
Subject: KVM: MMU: cleanup spte set and accssed/dirty tracking

Introduce set_spte_track_bits() to cleanup current code

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9c7fae08291d..e4b862eb8885 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -679,7 +679,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	}
 }
 
-static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+static void set_spte_track_bits(u64 *sptep, u64 new_spte)
 {
 	pfn_t pfn;
 	u64 old_spte;
@@ -692,6 +692,11 @@ static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
 		kvm_set_pfn_accessed(pfn);
 	if (is_writable_pte(old_spte))
 		kvm_set_pfn_dirty(pfn);
+}
+
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+	set_spte_track_bits(sptep, new_spte);
 	rmap_remove(kvm, sptep);
 }
 
@@ -791,7 +796,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			     unsigned long data)
 {
 	int need_flush = 0;
-	u64 *spte, new_spte, old_spte;
+	u64 *spte, new_spte;
 	pte_t *ptep = (pte_t *)data;
 	pfn_t new_pfn;
 
@@ -812,13 +817,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
 			new_spte &= ~shadow_accessed_mask;
-			if (is_writable_pte(*spte))
-				kvm_set_pfn_dirty(spte_to_pfn(*spte));
-			old_spte = __xchg_spte(spte, new_spte);
-			if (is_shadow_present_pte(old_spte)
-			    && (!shadow_accessed_mask ||
-			    old_spte & shadow_accessed_mask))
-				mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+			set_spte_track_bits(spte, new_spte);
 			spte = rmap_next(kvm, rmapp, spte);
 		}
 	}
-- 
cgit v1.2.3


From 9a3aad70572c3f4d55e7f09ac4eb313d41d0a484 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 16 Jul 2010 11:30:18 +0800
Subject: KVM: MMU: using __xchg_spte more smarter

Sometimes, atomically set spte is not needed, this patch call __xchg_spte()
more smartly

Note: if the old mapping's access bit is already set, we no need atomic operation
since the access bit is not lost

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e4b862eb8885..0dcc95e09876 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -682,9 +682,14 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 static void set_spte_track_bits(u64 *sptep, u64 new_spte)
 {
 	pfn_t pfn;
-	u64 old_spte;
+	u64 old_spte = *sptep;
+
+	if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
+	      old_spte & shadow_accessed_mask) {
+		__set_spte(sptep, new_spte);
+	} else
+		old_spte = __xchg_spte(sptep, new_spte);
 
-	old_spte = __xchg_spte(sptep, new_spte);
 	if (!is_rmap_spte(old_spte))
 		return;
 	pfn = spte_to_pfn(old_spte);
-- 
cgit v1.2.3


From 3444d7da1839b851eefedd372978d8a982316c36 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 26 Jul 2010 18:32:38 +0300
Subject: KVM: VMX: Fix host GDT.LIMIT corruption

vmx does not restore GDT.LIMIT to the host value, instead it sets it to 64KB.
This means host userspace can learn a few bits of host memory.

Fix by reloading GDTR when we load other host state.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2fdcc9819f36..27a0222c2946 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -185,6 +185,7 @@ static void kvm_cpu_vmxoff(void);
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
@@ -871,6 +872,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 #endif
 	if (current_thread_info()->status & TS_USEDFPU)
 		clts();
+	load_gdt(&__get_cpu_var(host_gdt));
 }
 
 static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -1379,6 +1381,8 @@ static int hardware_enable(void *garbage)
 		ept_sync_global();
 	}
 
+	store_gdt(&__get_cpu_var(host_gdt));
+
 	return 0;
 }
 
-- 
cgit v1.2.3