diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-01-16 16:15:14 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-01-16 16:15:14 +0200 |
commit | 79e06c4c4950be2abd8ca5d2428a8c915aa62c24 (patch) | |
tree | 0507ef82aa3c7766b7b19163a0351882b7d7c5b5 /virt/kvm | |
parent | cb3f09f9afe5286c0aed7a1c5cc71495de166efb (diff) | |
parent | c862dcd199759d4a45e65dab47b03e3e8a144e3a (diff) | |
download | lwn-79e06c4c4950be2abd8ca5d2428a8c915aa62c24.tar.gz lwn-79e06c4c4950be2abd8ca5d2428a8c915aa62c24.zip |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
"RISCV:
- Use common KVM implementation of MMU memory caches
- SBI v0.2 support for Guest
- Initial KVM selftests support
- Fix to avoid spurious virtual interrupts after clearing hideleg CSR
- Update email address for Anup and Atish
ARM:
- Simplification of the 'vcpu first run' by integrating it into KVM's
'pid change' flow
- Refactoring of the FP and SVE state tracking, also leading to a
simpler state and less shared data between EL1 and EL2 in the nVHE
case
- Tidy up the header file usage for the nvhe hyp object
- New HYP unsharing mechanism, finally allowing pages to be unmapped
from the Stage-1 EL2 page-tables
- Various pKVM cleanups around refcounting and sharing
- A couple of vgic fixes for bugs that would trigger once the vcpu
xarray rework is merged, but not sooner
- Add minimal support for ARMv8.7's PMU extension
- Rework kvm_pgtable initialisation ahead of the NV work
- New selftest for IRQ injection
- Teach selftests about the lack of default IPA space and page sizes
- Expand sysreg selftest to deal with Pointer Authentication
- The usual bunch of cleanups and doc update
s390:
- fix sigp sense/start/stop/inconsistency
- cleanups
x86:
- Clean up some function prototypes more
- improved gfn_to_pfn_cache with proper invalidation, used by Xen
emulation
- add KVM_IRQ_ROUTING_XEN_EVTCHN and event channel delivery
- completely remove potential TOC/TOU races in nested SVM consistency
checks
- update some PMCs on emulated instructions
- Intel AMX support (joint work between Thomas and Intel)
- large MMU cleanups
- module parameter to disable PMU virtualization
- cleanup register cache
- first part of halt handling cleanups
- Hyper-V enlightened MSR bitmap support for nested hypervisors
Generic:
- clean up Makefiles
- introduce CONFIG_HAVE_KVM_DIRTY_RING
- optimize memslot lookup using a tree
- optimize vCPU array usage by converting to xarray"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (268 commits)
x86/fpu: Fix inline prefix warnings
selftest: kvm: Add amx selftest
selftest: kvm: Move struct kvm_x86_state to header
selftest: kvm: Reorder vcpu_load_state steps for AMX
kvm: x86: Disable interception for IA32_XFD on demand
x86/fpu: Provide fpu_sync_guest_vmexit_xfd_state()
kvm: selftests: Add support for KVM_CAP_XSAVE2
kvm: x86: Add support for getting/setting expanded xstate buffer
x86/fpu: Add uabi_size to guest_fpu
kvm: x86: Add CPUID support for Intel AMX
kvm: x86: Add XCR0 support for Intel AMX
kvm: x86: Disable RDMSR interception of IA32_XFD_ERR
kvm: x86: Emulate IA32_XFD_ERR for guest
kvm: x86: Intercept #NM for saving IA32_XFD_ERR
x86/fpu: Prepare xfd_err in struct fpu_guest
kvm: x86: Add emulation for IA32_XFD
x86/fpu: Provide fpu_update_guest_xfd() for IA32_XFD emulation
kvm: x86: Enable dynamic xfeatures at KVM_SET_CPUID2
x86/fpu: Provide fpu_enable_guest_xfd_features() for KVM
x86/fpu: Add guest support to xfd_enable_feature()
...
Diffstat (limited to 'virt/kvm')
-rw-r--r-- | virt/kvm/Kconfig | 6 | ||||
-rw-r--r-- | virt/kvm/Makefile.kvm | 14 | ||||
-rw-r--r-- | virt/kvm/async_pf.c | 2 | ||||
-rw-r--r-- | virt/kvm/dirty_ring.c | 11 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 1066 | ||||
-rw-r--r-- | virt/kvm/kvm_mm.h | 44 | ||||
-rw-r--r-- | virt/kvm/mmu_lock.h | 23 | ||||
-rw-r--r-- | virt/kvm/pfncache.c | 337 |
8 files changed, 1028 insertions, 475 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 62b39149b8c8..f4834c20e4a6 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -4,6 +4,9 @@ config HAVE_KVM bool +config HAVE_KVM_PFNCACHE + bool + config HAVE_KVM_IRQCHIP bool @@ -13,6 +16,9 @@ config HAVE_KVM_IRQFD config HAVE_KVM_IRQ_ROUTING bool +config HAVE_KVM_DIRTY_RING + bool + config HAVE_KVM_EVENTFD bool select EVENTFD diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm new file mode 100644 index 000000000000..2c27d5d0c367 --- /dev/null +++ b/virt/kvm/Makefile.kvm @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for Kernel-based Virtual Machine module +# + +KVM ?= ../../../virt/kvm + +kvm-y := $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/binary_stats.o +kvm-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o +kvm-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o +kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o +kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o +kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o +kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index dd777688d14a..9bfe1d6f6529 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -85,7 +85,7 @@ static void async_pf_execute(struct work_struct *work) trace_kvm_async_pf_completed(addr, cr2_or_gpa); - rcuwait_wake_up(&vcpu->wait); + __kvm_vcpu_wake_up(vcpu); mmput(mm); kvm_put_kvm(vcpu->kvm); diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 88f4683198ea..222ecc81d7df 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -9,7 +9,7 @@ #include <linux/vmalloc.h> #include <linux/kvm_dirty_ring.h> #include <trace/events/kvm.h> -#include "mmu_lock.h" +#include "kvm_mm.h" int __weak kvm_cpu_dirty_log_size(void) { @@ -36,15 +36,6 @@ static bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring) return kvm_dirty_ring_used(ring) >= ring->size; } -struct kvm_dirty_ring *kvm_dirty_ring_get(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); - - WARN_ON_ONCE(vcpu->kvm != kvm); - - return &vcpu->dirty_ring; -} - static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) { struct kvm_memory_slot *memslot; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 938975ff75a0..504158f0e131 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -59,7 +59,7 @@ #include "coalesced_mmio.h" #include "async_pf.h" -#include "mmu_lock.h" +#include "kvm_mm.h" #include "vfio.h" #define CREATE_TRACE_POINTS @@ -305,8 +305,9 @@ bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, { struct kvm_vcpu *vcpu; struct cpumask *cpus; + unsigned long i; bool called; - int i, me; + int me; me = get_cpu(); @@ -421,7 +422,9 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; +#ifndef __KVM_HAVE_ARCH_WQP rcuwait_init(&vcpu->wait); +#endif kvm_async_pf_vcpu_init(vcpu); vcpu->pre_pcpu = -1; @@ -432,10 +435,10 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->preempted = false; vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); - vcpu->last_used_slot = 0; + vcpu->last_used_slot = NULL; } -void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) +static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) { kvm_dirty_ring_free(&vcpu->dirty_ring); kvm_arch_vcpu_destroy(vcpu); @@ -450,7 +453,20 @@ void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->run); kmem_cache_free(kvm_vcpu_cache, vcpu); } -EXPORT_SYMBOL_GPL(kvm_vcpu_destroy); + +void kvm_destroy_vcpus(struct kvm *kvm) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + kvm_vcpu_destroy(vcpu); + xa_erase(&kvm->vcpu_array, i); + } + + atomic_set(&kvm->online_vcpus, 0); +} +EXPORT_SYMBOL_GPL(kvm_destroy_vcpus); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) @@ -498,6 +514,12 @@ static void kvm_null_fn(void) } #define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn) +/* Iterate over each memslot intersecting [start, last] (inclusive) range */ +#define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \ + for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \ + node; \ + node = interval_tree_iter_next(node, start, last)) \ + static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, const struct kvm_hva_range *range) { @@ -507,6 +529,9 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, struct kvm_memslots *slots; int i, idx; + if (WARN_ON_ONCE(range->end <= range->start)) + return 0; + /* A null handler is allowed if and only if on_lock() is provided. */ if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) && IS_KVM_NULL_FN(range->handler))) @@ -515,15 +540,17 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, idx = srcu_read_lock(&kvm->srcu); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + struct interval_tree_node *node; + slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(slot, slots) { + kvm_for_each_memslot_in_hva_range(node, slots, + range->start, range->end - 1) { unsigned long hva_start, hva_end; + slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]); hva_start = max(range->start, slot->userspace_addr); hva_end = min(range->end, slot->userspace_addr + (slot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; /* * To optimize for the likely case where the address @@ -684,6 +711,9 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm->mn_active_invalidate_count++; spin_unlock(&kvm->mn_invalidate_lock); + gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end, + hva_range.may_block); + __kvm_handle_hva_range(kvm, &hva_range); return 0; @@ -851,21 +881,6 @@ static void kvm_destroy_pm_notifier(struct kvm *kvm) } #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ -static struct kvm_memslots *kvm_alloc_memslots(void) -{ - int i; - struct kvm_memslots *slots; - - slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); - if (!slots) - return NULL; - - for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) - slots->id_to_index[i] = -1; - - return slots; -} - static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) { if (!memslot->dirty_bitmap) @@ -875,27 +890,33 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) memslot->dirty_bitmap = NULL; } +/* This does not remove the slot from struct kvm_memslots data structures */ static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { kvm_destroy_dirty_bitmap(slot); kvm_arch_free_memslot(kvm, slot); - slot->flags = 0; - slot->npages = 0; + kfree(slot); } static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) { + struct hlist_node *idnode; struct kvm_memory_slot *memslot; + int bkt; - if (!slots) + /* + * The same memslot objects live in both active and inactive sets, + * arbitrarily free using index '1' so the second invocation of this + * function isn't operating over a structure with dangling pointers + * (even though this function isn't actually touching them). + */ + if (!slots->node_idx) return; - kvm_for_each_memslot(memslot, slots) + hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1]) kvm_free_memslot(kvm, memslot); - - kvfree(slots); } static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc) @@ -1034,8 +1055,9 @@ int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) static struct kvm *kvm_create_vm(unsigned long type) { struct kvm *kvm = kvm_arch_alloc_vm(); + struct kvm_memslots *slots; int r = -ENOMEM; - int i; + int i, j; if (!kvm) return ERR_PTR(-ENOMEM); @@ -1050,6 +1072,10 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_init(&kvm->slots_arch_lock); spin_lock_init(&kvm->mn_invalidate_lock); rcuwait_init(&kvm->mn_memslots_update_rcuwait); + xa_init(&kvm->vcpu_array); + + INIT_LIST_HEAD(&kvm->gpc_list); + spin_lock_init(&kvm->gpc_lock); INIT_LIST_HEAD(&kvm->devices); @@ -1062,13 +1088,20 @@ static struct kvm *kvm_create_vm(unsigned long type) refcount_set(&kvm->users_count, 1); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { - struct kvm_memslots *slots = kvm_alloc_memslots(); + for (j = 0; j < 2; j++) { + slots = &kvm->__memslots[i][j]; - if (!slots) - goto out_err_no_arch_destroy_vm; - /* Generations must be different for each address space. */ - slots->generation = i; - rcu_assign_pointer(kvm->memslots[i], slots); + atomic_long_set(&slots->last_used_slot, (unsigned long)NULL); + slots->hva_tree = RB_ROOT_CACHED; + slots->gfn_tree = RB_ROOT; + hash_init(slots->id_hash); + slots->node_idx = j; + + /* Generations must be different for each address space. */ + slots->generation = i; + } + + rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]); } for (i = 0; i < KVM_NR_BUSES; i++) { @@ -1122,8 +1155,6 @@ out_err_no_arch_destroy_vm: WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); for (i = 0; i < KVM_NR_BUSES; i++) kfree(kvm_get_bus(kvm, i)); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); cleanup_srcu_struct(&kvm->irq_srcu); out_err_no_irq_srcu: cleanup_srcu_struct(&kvm->srcu); @@ -1188,8 +1219,10 @@ static void kvm_destroy_vm(struct kvm *kvm) #endif kvm_arch_destroy_vm(kvm); kvm_destroy_devices(kvm); - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + kvm_free_memslots(kvm, &kvm->__memslots[i][0]); + kvm_free_memslots(kvm, &kvm->__memslots[i][1]); + } cleanup_srcu_struct(&kvm->irq_srcu); cleanup_srcu_struct(&kvm->srcu); kvm_arch_free_vm(kvm); @@ -1259,165 +1292,136 @@ static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot) return 0; } -/* - * Delete a memslot by decrementing the number of used slots and shifting all - * other entries in the array forward one spot. - */ -static inline void kvm_memslot_delete(struct kvm_memslots *slots, - struct kvm_memory_slot *memslot) +static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id) { - struct kvm_memory_slot *mslots = slots->memslots; - int i; - - if (WARN_ON(slots->id_to_index[memslot->id] == -1)) - return; + struct kvm_memslots *active = __kvm_memslots(kvm, as_id); + int node_idx_inactive = active->node_idx ^ 1; - slots->used_slots--; - - if (atomic_read(&slots->last_used_slot) >= slots->used_slots) - atomic_set(&slots->last_used_slot, 0); - - for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) { - mslots[i] = mslots[i + 1]; - slots->id_to_index[mslots[i].id] = i; - } - mslots[i] = *memslot; - slots->id_to_index[memslot->id] = -1; + return &kvm->__memslots[as_id][node_idx_inactive]; } /* - * "Insert" a new memslot by incrementing the number of used slots. Returns - * the new slot's initial index into the memslots array. + * Helper to get the address space ID when one of memslot pointers may be NULL. + * This also serves as a sanity that at least one of the pointers is non-NULL, + * and that their address space IDs don't diverge. */ -static inline int kvm_memslot_insert_back(struct kvm_memslots *slots) +static int kvm_memslots_get_as_id(struct kvm_memory_slot *a, + struct kvm_memory_slot *b) { - return slots->used_slots++; -} + if (WARN_ON_ONCE(!a && !b)) + return 0; -/* - * Move a changed memslot backwards in the array by shifting existing slots - * with a higher GFN toward the front of the array. Note, the changed memslot - * itself is not preserved in the array, i.e. not swapped at this time, only - * its new index into the array is tracked. Returns the changed memslot's - * current index into the memslots array. - */ -static inline int kvm_memslot_move_backward(struct kvm_memslots *slots, - struct kvm_memory_slot *memslot) -{ - struct kvm_memory_slot *mslots = slots->memslots; - int i; + if (!a) + return b->as_id; + if (!b) + return a->as_id; - if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) || - WARN_ON_ONCE(!slots->used_slots)) - return -1; + WARN_ON_ONCE(a->as_id != b->as_id); + return a->as_id; +} - /* - * Move the target memslot backward in the array by shifting existing - * memslots with a higher GFN (than the target memslot) towards the - * front of the array. - */ - for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) { - if (memslot->base_gfn > mslots[i + 1].base_gfn) - break; +static void kvm_insert_gfn_node(struct kvm_memslots *slots, + struct kvm_memory_slot *slot) +{ + struct rb_root *gfn_tree = &slots->gfn_tree; + struct rb_node **node, *parent; + int idx = slots->node_idx; - WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn); + parent = NULL; + for (node = &gfn_tree->rb_node; *node; ) { + struct kvm_memory_slot *tmp; - /* Shift the next memslot forward one and update its index. */ - mslots[i] = mslots[i + 1]; - slots->id_to_index[mslots[i].id] = i; + tmp = container_of(*node, struct kvm_memory_slot, gfn_node[idx]); + parent = *node; + if (slot->base_gfn < tmp->base_gfn) + node = &(*node)->rb_left; + else if (slot->base_gfn > tmp->base_gfn) + node = &(*node)->rb_right; + else + BUG(); } - return i; + + rb_link_node(&slot->gfn_node[idx], parent, node); + rb_insert_color(&slot->gfn_node[idx], gfn_tree); } -/* - * Move a changed memslot forwards in the array by shifting existing slots with - * a lower GFN toward the back of the array. Note, the changed memslot itself - * is not preserved in the array, i.e. not swapped at this time, only its new - * index into the array is tracked. Returns the changed memslot's final index - * into the memslots array. - */ -static inline int kvm_memslot_move_forward(struct kvm_memslots *slots, - struct kvm_memory_slot *memslot, - int start) +static void kvm_erase_gfn_node(struct kvm_memslots *slots, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *mslots = slots->memslots; - int i; + rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree); +} - for (i = start; i > 0; i--) { - if (memslot->base_gfn < mslots[i - 1].base_gfn) - break; +static void kvm_replace_gfn_node(struct kvm_memslots *slots, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new) +{ + int idx = slots->node_idx; - WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn); + WARN_ON_ONCE(old->base_gfn != new->base_gfn); - /* Shift the next memslot back one and update its index. */ - mslots[i] = mslots[i - 1]; - slots->id_to_index[mslots[i].id] = i; - } - return i; + rb_replace_node(&old->gfn_node[idx], &new->gfn_node[idx], + &slots->gfn_tree); } /* - * Re-sort memslots based on their GFN to account for an added, deleted, or - * moved memslot. Sorting memslots by GFN allows using a binary search during - * memslot lookup. - * - * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! I.e. the entry - * at memslots[0] has the highest GFN. - * - * The sorting algorithm takes advantage of having initially sorted memslots - * and knowing the position of the changed memslot. Sorting is also optimized - * by not swapping the updated memslot and instead only shifting other memslots - * and tracking the new index for the update memslot. Only once its final - * index is known is the updated memslot copied into its position in the array. - * - * - When deleting a memslot, the deleted memslot simply needs to be moved to - * the end of the array. - * - * - When creating a memslot, the algorithm "inserts" the new memslot at the - * end of the array and then it forward to its correct location. + * Replace @old with @new in the inactive memslots. * - * - When moving a memslot, the algorithm first moves the updated memslot - * backward to handle the scenario where the memslot's GFN was changed to a - * lower value. update_memslots() then falls through and runs the same flow - * as creating a memslot to move the memslot forward to handle the scenario - * where its GFN was changed to a higher value. + * With NULL @old this simply adds @new. + * With NULL @new this simply removes @old. * - * Note, slots are sorted from highest->lowest instead of lowest->highest for - * historical reasons. Originally, invalid memslots where denoted by having - * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots - * to the end of the array. The current algorithm uses dedicated logic to - * delete a memslot and thus does not rely on invalid memslots having GFN=0. - * - * The other historical motiviation for highest->lowest was to improve the - * performance of memslot lookup. KVM originally used a linear search starting - * at memslots[0]. On x86, the largest memslot usually has one of the highest, - * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a - * single memslot above the 4gb boundary. As the largest memslot is also the - * most likely to be referenced, sorting it to the front of the array was - * advantageous. The current binary search starts from the middle of the array - * and uses an LRU pointer to improve performance for all memslots and GFNs. + * If @new is non-NULL its hva_node[slots_idx] range has to be set + * appropriately. */ -static void update_memslots(struct kvm_memslots *slots, - struct kvm_memory_slot *memslot, - enum kvm_mr_change change) +static void kvm_replace_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new) { - int i; + int as_id = kvm_memslots_get_as_id(old, new); + struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); + int idx = slots->node_idx; - if (change == KVM_MR_DELETE) { - kvm_memslot_delete(slots, memslot); - } else { - if (change == KVM_MR_CREATE) - i = kvm_memslot_insert_back(slots); - else - i = kvm_memslot_move_backward(slots, memslot); - i = kvm_memslot_move_forward(slots, memslot, i); + if (old) { + hash_del(&old->id_node[idx]); + interval_tree_remove(&old->hva_node[idx], &slots->hva_tree); - /* - * Copy the memslot to its new position in memslots and update - * its index accordingly. - */ - slots->memslots[i] = *memslot; - slots->id_to_index[memslot->id] = i; + if ((long)old == atomic_long_read(&slots->last_used_slot)) + atomic_long_set(&slots->last_used_slot, (long)new); + + if (!new) { + kvm_erase_gfn_node(slots, old); + return; + } + } + + /* + * Initialize @new's hva range. Do this even when replacing an @old + * slot, kvm_copy_memslot() deliberately does not touch node data. + */ + new->hva_node[idx].start = new->userspace_addr; + new->hva_node[idx].last = new->userspace_addr + + (new->npages << PAGE_SHIFT) - 1; + + /* + * (Re)Add the new memslot. There is no O(1) interval_tree_replace(), + * hva_node needs to be swapped with remove+insert even though hva can't + * change when replacing an existing slot. + */ + hash_add(slots->id_hash, &new->id_node[idx], new->id); + interval_tree_insert(&new->hva_node[idx], &slots->hva_tree); + + /* + * If the memslot gfn is unchanged, rb_replace_node() can be used to + * switch the node in the gfn tree instead of removing the old and + * inserting the new as two separate operations. Replacement is a + * single O(1) operation versus two O(log(n)) operations for + * remove+insert. + */ + if (old && old->base_gfn == new->base_gfn) { + kvm_replace_gfn_node(slots, old, new); + } else { + if (old) + kvm_erase_gfn_node(slots, old); + kvm_insert_gfn_node(slots, new); } } @@ -1435,11 +1439,12 @@ static int check_memory_region_flags(const struct kvm_userspace_memory_region *m return 0; } -static struct kvm_memslots *install_new_memslots(struct kvm *kvm, - int as_id, struct kvm_memslots *slots) +static void kvm_swap_active_memslots(struct kvm *kvm, int as_id) { - struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); - u64 gen = old_memslots->generation; + struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id); + + /* Grab the generation from the activate memslots. */ + u64 gen = __kvm_memslots(kvm, as_id)->generation; WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; @@ -1490,60 +1495,226 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, kvm_arch_memslots_updated(kvm, gen); slots->generation = gen; - - return old_memslots; } -static size_t kvm_memslots_size(int slots) +static int kvm_prepare_memory_region(struct kvm *kvm, + const struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + enum kvm_mr_change change) { - return sizeof(struct kvm_memslots) + - (sizeof(struct kvm_memory_slot) * slots); + int r; + + /* + * If dirty logging is disabled, nullify the bitmap; the old bitmap + * will be freed on "commit". If logging is enabled in both old and + * new, reuse the existing bitmap. If logging is enabled only in the + * new and KVM isn't using a ring buffer, allocate and initialize a + * new bitmap. + */ + if (change != KVM_MR_DELETE) { + if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES)) + new->dirty_bitmap = NULL; + else if (old && old->dirty_bitmap) + new->dirty_bitmap = old->dirty_bitmap; + else if (!kvm->dirty_ring_size) { + r = kvm_alloc_dirty_bitmap(new); + if (r) + return r; + + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + bitmap_set(new->dirty_bitmap, 0, new->npages); + } + } + + r = kvm_arch_prepare_memory_region(kvm, old, new, change); + + /* Free the bitmap on failure if it was allocated above. */ + if (r && new && new->dirty_bitmap && old && !old->dirty_bitmap) + kvm_destroy_dirty_bitmap(new); + + return r; } -static void kvm_copy_memslots(struct kvm_memslots *to, - struct kvm_memslots *from) +static void kvm_commit_memory_region(struct kvm *kvm, + struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, + enum kvm_mr_change change) { - memcpy(to, from, kvm_memslots_size(from->used_slots)); + /* + * Update the total number of memslot pages before calling the arch + * hook so that architectures can consume the result directly. + */ + if (change == KVM_MR_DELETE) + kvm->nr_memslot_pages -= old->npages; + else if (change == KVM_MR_CREATE) + kvm->nr_memslot_pages += new->npages; + + kvm_arch_commit_memory_region(kvm, old, new, change); + + switch (change) { + case KVM_MR_CREATE: + /* Nothing more to do. */ + break; + case KVM_MR_DELETE: + /* Free the old memslot and all its metadata. */ + kvm_free_memslot(kvm, old); + break; + case KVM_MR_MOVE: + case KVM_MR_FLAGS_ONLY: + /* + * Free the dirty bitmap as needed; the below check encompasses + * both the flags and whether a ring buffer is being used) + */ + if (old->dirty_bitmap && !new->dirty_bitmap) + kvm_destroy_dirty_bitmap(old); + + /* + * The final quirk. Free the detached, old slot, but only its + * memory, not any metadata. Metadata, including arch specific + * data, may be reused by @new. + */ + kfree(old); + break; + default: + BUG(); + } } /* - * Note, at a minimum, the current number of used slots must be allocated, even - * when deleting a memslot, as we need a complete duplicate of the memslots for - * use when invalidating a memslot prior to deleting/moving the memslot. + * Activate @new, which must be installed in the inactive slots by the caller, + * by swapping the active slots and then propagating @new to @old once @old is + * unreachable and can be safely modified. + * + * With NULL @old this simply adds @new to @active (while swapping the sets). + * With NULL @new this simply removes @old from @active and frees it + * (while also swapping the sets). */ -static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old, - enum kvm_mr_change change) +static void kvm_activate_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new) { - struct kvm_memslots *slots; - size_t new_size; + int as_id = kvm_memslots_get_as_id(old, new); - if (change == KVM_MR_CREATE) - new_size = kvm_memslots_size(old->used_slots + 1); - else - new_size = kvm_memslots_size(old->used_slots); + kvm_swap_active_memslots(kvm, as_id); - slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT); - if (likely(slots)) - kvm_copy_memslots(slots, old); + /* Propagate the new memslot to the now inactive memslots. */ + kvm_replace_memslot(kvm, old, new); +} - return slots; +static void kvm_copy_memslot(struct kvm_memory_slot *dest, + const struct kvm_memory_slot *src) +{ + dest->base_gfn = src->base_gfn; + dest->npages = src->npages; + dest->dirty_bitmap = src->dirty_bitmap; + dest->arch = src->arch; + dest->userspace_addr = src->userspace_addr; + dest->flags = src->flags; + dest->id = src->id; + dest->as_id = src->as_id; +} + +static void kvm_invalidate_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *invalid_slot) +{ + /* + * Mark the current slot INVALID. As with all memslot modifications, + * this must be done on an unreachable slot to avoid modifying the + * current slot in the active tree. + */ + kvm_copy_memslot(invalid_slot, old); + invalid_slot->flags |= KVM_MEMSLOT_INVALID; + kvm_replace_memslot(kvm, old, invalid_slot); + + /* + * Activate the slot that is now marked INVALID, but don't propagate + * the slot to the now inactive slots. The slot is either going to be + * deleted or recreated as a new slot. + */ + kvm_swap_active_memslots(kvm, old->as_id); + + /* + * From this point no new shadow pages pointing to a deleted, or moved, + * memslot will be created. Validation of sp->gfn happens in: + * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) + * - kvm_is_visible_gfn (mmu_check_root) + */ + kvm_arch_flush_shadow_memslot(kvm, old); + + /* Was released by kvm_swap_active_memslots, reacquire. */ + mutex_lock(&kvm->slots_arch_lock); + + /* + * Copy the arch-specific field of the newly-installed slot back to the + * old slot as the arch data could have changed between releasing + * slots_arch_lock in install_new_memslots() and re-acquiring the lock + * above. Writers are required to retrieve memslots *after* acquiring + * slots_arch_lock, thus the active slot's data is guaranteed to be fresh. + */ + old->arch = invalid_slot->arch; +} + +static void kvm_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *new) +{ + /* Add the new memslot to the inactive set and activate. */ + kvm_replace_memslot(kvm, NULL, new); + kvm_activate_memslot(kvm, NULL, new); +} + +static void kvm_delete_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *invalid_slot) +{ + /* + * Remove the old memslot (in the inactive memslots) by passing NULL as + * the "new" slot, and for the invalid version in the active slots. + */ + kvm_replace_memslot(kvm, old, NULL); + kvm_activate_memslot(kvm, invalid_slot, NULL); +} + +static void kvm_move_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new, + struct kvm_memory_slot *invalid_slot) +{ + /* + * Replace the old memslot in the inactive slots, and then swap slots + * and replace the current INVALID with the new as well. + */ + kvm_replace_memslot(kvm, old, new); + kvm_activate_memslot(kvm, invalid_slot, new); +} + +static void kvm_update_flags_memslot(struct kvm *kvm, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new) +{ + /* + * Similar to the MOVE case, but the slot doesn't need to be zapped as + * an intermediate step. Instead, the old memslot is simply replaced + * with a new, updated copy in both memslot sets. + */ + kvm_replace_memslot(kvm, old, new); + kvm_activate_memslot(kvm, old, new); } static int kvm_set_memslot(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot *new, int as_id, + struct kvm_memory_slot *old, + struct kvm_memory_slot *new, enum kvm_mr_change change) { - struct kvm_memory_slot *slot, old; - struct kvm_memslots *slots; + struct kvm_memory_slot *invalid_slot; int r; /* - * Released in install_new_memslots. + * Released in kvm_swap_active_memslots. * * Must be held from before the current memslots are copied until * after the new memslots are installed with rcu_assign_pointer, - * then released before the synchronize srcu in install_new_memslots. + * then released before the synchronize srcu in kvm_swap_active_memslots. * * When modifying memslots outside of the slots_lock, must be held * before reading the pointer to the current memslots until after all @@ -1554,114 +1725,88 @@ static int kvm_set_memslot(struct kvm *kvm, */ mutex_lock(&kvm->slots_arch_lock); - slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change); - if (!slots) { - mutex_unlock(&kvm->slots_arch_lock); - return -ENOMEM; - } - + /* + * Invalidate the old slot if it's being deleted or moved. This is + * done prior to actually deleting/moving the memslot to allow vCPUs to + * continue running by ensuring there are no mappings or shadow pages + * for the memslot when it is deleted/moved. Without pre-invalidation + * (and without a lock), a window would exist between effecting the + * delete/move and committing the changes in arch code where KVM or a + * guest could access a non-existent memslot. + * + * Modifications are done on a temporary, unreachable slot. The old + * slot needs to be preserved in case a later step fails and the + * invalidation needs to be reverted. + */ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { - /* - * Note, the INVALID flag needs to be in the appropriate entry - * in the freshly allocated memslots, not in @old or @new. - */ - slot = id_to_memslot(slots, new->id); - slot->flags |= KVM_MEMSLOT_INVALID; - - /* - * We can re-use the memory from the old memslots. - * It will be overwritten with a copy of the new memslots - * after reacquiring the slots_arch_lock below. - */ - slots = install_new_memslots(kvm, as_id, slots); - - /* From this point no new shadow pages pointing to a deleted, - * or moved, memslot will be created. - * - * validation of sp->gfn happens in: - * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) - * - kvm_is_visible_gfn (mmu_check_root) - */ - kvm_arch_flush_shadow_memslot(kvm, slot); - - /* Released in install_new_memslots. */ - mutex_lock(&kvm->slots_arch_lock); + invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT); + if (!invalid_slot) { + mutex_unlock(&kvm->slots_arch_lock); + return -ENOMEM; + } + kvm_invalidate_memslot(kvm, old, invalid_slot); + } + r = kvm_prepare_memory_region(kvm, old, new, change); + if (r) { /* - * The arch-specific fields of the memslots could have changed - * between releasing the slots_arch_lock in - * install_new_memslots and here, so get a fresh copy of the - * slots. + * For DELETE/MOVE, revert the above INVALID change. No + * modifications required since the original slot was preserved + * in the inactive slots. Changing the active memslots also + * release slots_arch_lock. */ - kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id)); + if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { + kvm_activate_memslot(kvm, invalid_slot, old); + kfree(invalid_slot); + } else { + mutex_unlock(&kvm->slots_arch_lock); + } + return r; } /* - * Make a full copy of the old memslot, the pointer will become stale - * when the memslots are re-sorted by update_memslots(), and the old - * memslot needs to be referenced after calling update_memslots(), e.g. - * to free its resources and for arch specific behavior. This needs to - * happen *after* (re)acquiring slots_arch_lock. + * For DELETE and MOVE, the working slot is now active as the INVALID + * version of the old slot. MOVE is particularly special as it reuses + * the old slot and returns a copy of the old slot (in working_slot). + * For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the + * old slot is detached but otherwise preserved. */ - slot = id_to_memslot(slots, new->id); - if (slot) { - old = *slot; - } else { - WARN_ON_ONCE(change != KVM_MR_CREATE); - memset(&old, 0, sizeof(old)); - old.id = new->id; - old.as_id = as_id; - } - - /* Copy the arch-specific data, again after (re)acquiring slots_arch_lock. */ - memcpy(&new->arch, &old.arch, sizeof(old.arch)); - - r = kvm_arch_prepare_memory_region(kvm, new, mem, change); - if (r) - goto out_slots; - - update_memslots(slots, new, change); - slots = install_new_memslots(kvm, as_id, slots); + if (change == KVM_MR_CREATE) + kvm_create_memslot(kvm, new); + else if (change == KVM_MR_DELETE) + kvm_delete_memslot(kvm, old, invalid_slot); + else if (change == KVM_MR_MOVE) + kvm_move_memslot(kvm, old, new, invalid_slot); + else if (change == KVM_MR_FLAGS_ONLY) + kvm_update_flags_memslot(kvm, old, new); + else + BUG(); - kvm_arch_commit_memory_region(kvm, mem, &old, new, change); + /* Free the temporary INVALID slot used for DELETE and MOVE. */ + if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) + kfree(invalid_slot); - /* Free the old memslot's metadata. Note, this is the full copy!!! */ - if (change == KVM_MR_DELETE) - kvm_free_memslot(kvm, &old); + /* + * No need to refresh new->arch, changes after dropping slots_arch_lock + * will directly hit the final, active memsot. Architectures are + * responsible for knowing that new->arch may be stale. + */ + kvm_commit_memory_region(kvm, old, new, change); - kvfree(slots); return 0; - -out_slots: - if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { - slot = id_to_memslot(slots, new->id); - slot->flags &= ~KVM_MEMSLOT_INVALID; - slots = install_new_memslots(kvm, as_id, slots); - } else { - mutex_unlock(&kvm->slots_arch_lock); - } - kvfree(slots); - return r; } -static int kvm_delete_memslot(struct kvm *kvm, - const struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot *old, int as_id) +static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id, + gfn_t start, gfn_t end) { - struct kvm_memory_slot new; + struct kvm_memslot_iter iter; - if (!old->npages) - return -EINVAL; - - memset(&new, 0, sizeof(new)); - new.id = old->id; - /* - * This is only for debugging purpose; it should never be referenced - * for a removed memslot. - */ - new.as_id = as_id; + kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) { + if (iter.slot->id != id) + return true; + } - return kvm_set_memslot(kvm, mem, &new, as_id, KVM_MR_DELETE); + return false; } /* @@ -1675,9 +1820,11 @@ static int kvm_delete_memslot(struct kvm *kvm, int __kvm_set_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem) { - struct kvm_memory_slot old, new; - struct kvm_memory_slot *tmp; + struct kvm_memory_slot *old, *new; + struct kvm_memslots *slots; enum kvm_mr_change change; + unsigned long npages; + gfn_t base_gfn; int as_id, id; int r; @@ -1704,89 +1851,72 @@ int __kvm_set_memory_region(struct kvm *kvm, return -EINVAL; if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) return -EINVAL; + if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES) + return -EINVAL; + + slots = __kvm_memslots(kvm, as_id); /* - * Make a full copy of the old memslot, the pointer will become stale - * when the memslots are re-sorted by update_memslots(), and the old - * memslot needs to be referenced after calling update_memslots(), e.g. - * to free its resources and for arch specific behavior. + * Note, the old memslot (and the pointer itself!) may be invalidated + * and/or destroyed by kvm_set_memslot(). */ - tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id); - if (tmp) { - old = *tmp; - tmp = NULL; - } else { - memset(&old, 0, sizeof(old)); - old.id = id; - } + old = id_to_memslot(slots, id); - if (!mem->memory_size) - return kvm_delete_memslot(kvm, mem, &old, as_id); + if (!mem->memory_size) { + if (!old || !old->npages) + return -EINVAL; - new.as_id = as_id; - new.id = id; - new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; - new.npages = mem->memory_size >> PAGE_SHIFT; - new.flags = mem->flags; - new.userspace_addr = mem->userspace_addr; + if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages)) + return -EIO; - if (new.npages > KVM_MEM_MAX_NR_PAGES) - return -EINVAL; + return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE); + } - if (!old.npages) { + base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT); + npages = (mem->memory_size >> PAGE_SHIFT); + + if (!old || !old->npages) { change = KVM_MR_CREATE; - new.dirty_bitmap = NULL; + + /* + * To simplify KVM internals, the total number of pages across + * all memslots must fit in an unsigned long. + */ + if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages) + return -EINVAL; } else { /* Modify an existing slot. */ - if ((new.userspace_addr != old.userspace_addr) || - (new.npages != old.npages) || - ((new.flags ^ old.flags) & KVM_MEM_READONLY)) + if ((mem->userspace_addr != old->userspace_addr) || + (npages != old->npages) || + ((mem->flags ^ old->flags) & KVM_MEM_READONLY)) return -EINVAL; - if (new.base_gfn != old.base_gfn) + if (base_gfn != old->base_gfn) change = KVM_MR_MOVE; - else if (new.flags != old.flags) + else if (mem->flags != old->flags) change = KVM_MR_FLAGS_ONLY; else /* Nothing to change. */ return 0; - - /* Copy dirty_bitmap from the current memslot. */ - new.dirty_bitmap = old.dirty_bitmap; } - if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { - /* Check for overlaps */ - kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) { - if (tmp->id == id) - continue; - if (!((new.base_gfn + new.npages <= tmp->base_gfn) || - (new.base_gfn >= tmp->base_gfn + tmp->npages))) - return -EEXIST; - } - } + if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) && + kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages)) + return -EEXIST; - /* Allocate/free page dirty bitmap as needed */ - if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) - new.dirty_bitmap = NULL; - else if (!new.dirty_bitmap && !kvm->dirty_ring_size) { - r = kvm_alloc_dirty_bitmap(&new); - if (r) - return r; + /* Allocate a slot that will persist in the memslot. */ + new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT); + if (!new) + return -ENOMEM; - if (kvm_dirty_log_manual_protect_and_init_set(kvm)) - bitmap_set(new.dirty_bitmap, 0, new.npages); - } + new->as_id = as_id; + new->id = id; + new->base_gfn = base_gfn; + new->npages = npages; + new->flags = mem->flags; + new->userspace_addr = mem->userspace_addr; - r = kvm_set_memslot(kvm, mem, &new, as_id, change); + r = kvm_set_memslot(kvm, old, new, change); if (r) - goto out_bitmap; - - if (old.dirty_bitmap && !new.dirty_bitmap) - kvm_destroy_dirty_bitmap(&old); - return 0; - -out_bitmap: - if (new.dirty_bitmap && !old.dirty_bitmap) - kvm_destroy_dirty_bitmap(&new); + kfree(new); return r; } EXPORT_SYMBOL_GPL(__kvm_set_memory_region); @@ -2092,21 +2222,30 @@ EXPORT_SYMBOL_GPL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + u64 gen = slots->generation; struct kvm_memory_slot *slot; - int slot_index; - slot = try_get_memslot(slots, vcpu->last_used_slot, gfn); + /* + * This also protects against using a memslot from a different address space, + * since different address spaces have different generation numbers. + */ + if (unlikely(gen != vcpu->last_used_slot_gen)) { + vcpu->last_used_slot = NULL; + vcpu->last_used_slot_gen = gen; + } + + slot = try_get_memslot(vcpu->last_used_slot, gfn); if (slot) return slot; /* * Fall back to searching all memslots. We purposely use * search_memslots() instead of __gfn_to_memslot() to avoid - * thrashing the VM-wide last_used_index in kvm_memslots. + * thrashing the VM-wide last_used_slot in kvm_memslots. */ - slot = search_memslots(slots, gfn, &slot_index); + slot = search_memslots(slots, gfn, false); if (slot) { - vcpu->last_used_slot = slot_index; + vcpu->last_used_slot = slot; return slot; } @@ -2154,12 +2293,12 @@ out: return size; } -static bool memslot_is_readonly(struct kvm_memory_slot *slot) +static bool memslot_is_readonly(const struct kvm_memory_slot *slot) { return slot->flags & KVM_MEM_READONLY; } -static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, +static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn, gfn_t *nr_pages, bool write) { if (!slot || slot->flags & KVM_MEMSLOT_INVALID) @@ -2406,8 +2545,8 @@ out: * 2): @write_fault = false && @writable, @writable will tell the caller * whether the mapping is writable. */ -static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, - bool write_fault, bool *writable) +kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, + bool write_fault, bool *writable) { struct vm_area_struct *vma; kvm_pfn_t pfn = 0; @@ -2454,7 +2593,7 @@ exit: return pfn; } -kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, +kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, bool *async, bool write_fault, bool *writable, hva_t *hva) { @@ -2494,13 +2633,13 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); -kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) +kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); -kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) +kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn) { return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL, NULL); } @@ -3019,15 +3158,20 @@ int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) EXPORT_SYMBOL_GPL(kvm_clear_guest); void mark_page_dirty_in_slot(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, gfn_t gfn) { + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); + + if (WARN_ON_ONCE(!vcpu) || WARN_ON_ONCE(vcpu->kvm != kvm)) + return; + if (memslot && kvm_slot_dirty_track_enabled(memslot)) { unsigned long rel_gfn = gfn - memslot->base_gfn; u32 slot = (memslot->as_id << 16) | memslot->id; if (kvm->dirty_ring_size) - kvm_dirty_ring_push(kvm_dirty_ring_get(kvm), + kvm_dirty_ring_push(&vcpu->dirty_ring, slot, rel_gfn); else set_bit_le(rel_gfn, memslot->dirty_bitmap); @@ -3139,68 +3283,93 @@ out: return ret; } -static inline void -update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited) +/* + * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is + * pending. This is mostly used when halting a vCPU, but may also be used + * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI. + */ +bool kvm_vcpu_block(struct kvm_vcpu *vcpu) { - if (waited) - vcpu->stat.generic.halt_poll_fail_ns += poll_ns; - else - vcpu->stat.generic.halt_poll_success_ns += poll_ns; + struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu); + bool waited = false; + + vcpu->stat.generic.blocking = 1; + + kvm_arch_vcpu_blocking(vcpu); + + prepare_to_rcuwait(wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if (kvm_vcpu_check_block(vcpu) < 0) + break; + + waited = true; + schedule(); + } + finish_rcuwait(wait); + + kvm_arch_vcpu_unblocking(vcpu); + + vcpu->stat.generic.blocking = 0; + + return waited; +} + +static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start, + ktime_t end, bool success) +{ + struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic; + u64 poll_ns = ktime_to_ns(ktime_sub(end, start)); + + ++vcpu->stat.generic.halt_attempted_poll; + + if (success) { + ++vcpu->stat.generic.halt_successful_poll; + + if (!vcpu_valid_wakeup(vcpu)) + ++vcpu->stat.generic.halt_poll_invalid; + + stats->halt_poll_success_ns += poll_ns; + KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns); + } else { + stats->halt_poll_fail_ns += poll_ns; + KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns); + } } /* - * The vCPU has executed a HLT instruction with in-kernel mode enabled. + * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt + * polling is enabled, busy wait for a short time before blocking to avoid the + * expensive block+unblock sequence if a wake event arrives soon after the vCPU + * is halted. */ -void kvm_vcpu_block(struct kvm_vcpu *vcpu) +void kvm_vcpu_halt(struct kvm_vcpu *vcpu) { + bool halt_poll_allowed = !kvm_arch_no_poll(vcpu); + bool do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns; ktime_t start, cur, poll_end; bool waited = false; - u64 block_ns; - - kvm_arch_vcpu_blocking(vcpu); + u64 halt_ns; start = cur = poll_end = ktime_get(); - if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) { - ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); + if (do_halt_poll) { + ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns); - ++vcpu->stat.generic.halt_attempted_poll; do { /* * This sets KVM_REQ_UNHALT if an interrupt * arrives. */ - if (kvm_vcpu_check_block(vcpu) < 0) { - ++vcpu->stat.generic.halt_successful_poll; - if (!vcpu_valid_wakeup(vcpu)) - ++vcpu->stat.generic.halt_poll_invalid; - - KVM_STATS_LOG_HIST_UPDATE( - vcpu->stat.generic.halt_poll_success_hist, - ktime_to_ns(ktime_get()) - - ktime_to_ns(start)); + if (kvm_vcpu_check_block(vcpu) < 0) goto out; - } cpu_relax(); poll_end = cur = ktime_get(); } while (kvm_vcpu_can_poll(cur, stop)); - - KVM_STATS_LOG_HIST_UPDATE( - vcpu->stat.generic.halt_poll_fail_hist, - ktime_to_ns(ktime_get()) - ktime_to_ns(start)); } + waited = kvm_vcpu_block(vcpu); - prepare_to_rcuwait(&vcpu->wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - if (kvm_vcpu_check_block(vcpu) < 0) - break; - - waited = true; - schedule(); - } - finish_rcuwait(&vcpu->wait); cur = ktime_get(); if (waited) { vcpu->stat.generic.halt_wait_ns += @@ -3209,42 +3378,43 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) ktime_to_ns(cur) - ktime_to_ns(poll_end)); } out: - kvm_arch_vcpu_unblocking(vcpu); - block_ns = ktime_to_ns(cur) - ktime_to_ns(start); + /* The total time the vCPU was "halted", including polling time. */ + halt_ns = ktime_to_ns(cur) - ktime_to_ns(start); - update_halt_poll_stats( - vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited); + /* + * Note, halt-polling is considered successful so long as the vCPU was + * never actually scheduled out, i.e. even if the wake event arrived + * after of the halt-polling loop itself, but before the full wait. + */ + if (do_halt_poll) + update_halt_poll_stats(vcpu, start, poll_end, !waited); - if (!kvm_arch_no_poll(vcpu)) { + if (halt_poll_allowed) { if (!vcpu_valid_wakeup(vcpu)) { shrink_halt_poll_ns(vcpu); } else if (vcpu->kvm->max_halt_poll_ns) { - if (block_ns <= vcpu->halt_poll_ns) + if (halt_ns <= vcpu->halt_poll_ns) ; /* we had a long block, shrink polling */ else if (vcpu->halt_poll_ns && - block_ns > vcpu->kvm->max_halt_poll_ns) + halt_ns > vcpu->kvm->max_halt_poll_ns) shrink_halt_poll_ns(vcpu); /* we had a short halt and our poll time is too small */ else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns && - block_ns < vcpu->kvm->max_halt_poll_ns) + halt_ns < vcpu->kvm->max_halt_poll_ns) grow_halt_poll_ns(vcpu); } else { vcpu->halt_poll_ns = 0; } } - trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); - kvm_arch_vcpu_block_finish(vcpu); + trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu)); } -EXPORT_SYMBOL_GPL(kvm_vcpu_block); +EXPORT_SYMBOL_GPL(kvm_vcpu_halt); bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { - struct rcuwait *waitp; - - waitp = kvm_arch_vcpu_get_wait(vcpu); - if (rcuwait_wake_up(waitp)) { + if (__kvm_vcpu_wake_up(vcpu)) { WRITE_ONCE(vcpu->ready, true); ++vcpu->stat.generic.halt_wakeup; return true; @@ -3265,6 +3435,19 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) if (kvm_vcpu_wake_up(vcpu)) return; + me = get_cpu(); + /* + * The only state change done outside the vcpu mutex is IN_GUEST_MODE + * to EXITING_GUEST_MODE. Therefore the moderately expensive "should + * kick" check does not need atomic operations if kvm_vcpu_kick is used + * within the vCPU thread itself. + */ + if (vcpu == __this_cpu_read(kvm_running_vcpu)) { + if (vcpu->mode == IN_GUEST_MODE) + WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE); + goto out; + } + /* * Note, the vCPU could get migrated to a different pCPU at any point * after kvm_arch_vcpu_should_kick(), which could result in sending an @@ -3272,12 +3455,12 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the * vCPU also requires it to leave IN_GUEST_MODE. */ - me = get_cpu(); if (kvm_arch_vcpu_should_kick(vcpu)) { cpu = READ_ONCE(vcpu->cpu); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) smp_send_reschedule(cpu); } +out: put_cpu(); } EXPORT_SYMBOL_GPL(kvm_vcpu_kick); @@ -3375,10 +3558,10 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) struct kvm *kvm = me->kvm; struct kvm_vcpu *vcpu; int last_boosted_vcpu = me->kvm->last_boosted_vcpu; + unsigned long i; int yielded = 0; int try = 3; int pass; - int i; kvm_vcpu_set_in_spin_loop(me, true); /* @@ -3399,8 +3582,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; if (vcpu == me) continue; - if (rcuwait_active(&vcpu->wait) && - !vcpu_dy_runnable(vcpu)) + if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu)) continue; if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && !kvm_arch_dy_has_pending_interrupt(vcpu) && @@ -3429,7 +3611,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff) { -#if KVM_DIRTY_LOG_PAGE_OFFSET > 0 +#ifdef CONFIG_HAVE_KVM_DIRTY_RING return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) && (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET + kvm->dirty_ring_size / PAGE_SIZE); @@ -3585,7 +3767,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) } vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); - BUG_ON(kvm->vcpus[vcpu->vcpu_idx]); + r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT); + BUG_ON(r == -EBUSY); + if (r) + goto unlock_vcpu_destroy; /* Fill the stats id string for the vcpu */ snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d", @@ -3595,15 +3780,14 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) kvm_get_kvm(kvm); r = create_vcpu_fd(vcpu); if (r < 0) { + xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx); kvm_put_kvm_no_destroy(kvm); goto unlock_vcpu_destroy; } - kvm->vcpus[vcpu->vcpu_idx] = vcpu; - /* - * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus - * before kvm->online_vcpu's incremented value. + * Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu + * pointer before kvm->online_vcpu's incremented value. */ smp_wmb(); atomic_inc(&kvm->online_vcpus); @@ -4132,7 +4316,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_NR_MEMSLOTS: return KVM_USER_MEM_SLOTS; case KVM_CAP_DIRTY_LOG_RING: -#if KVM_DIRTY_LOG_PAGE_OFFSET > 0 +#ifdef CONFIG_HAVE_KVM_DIRTY_RING return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); #else return 0; @@ -4185,7 +4369,7 @@ static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; int cleared = 0; @@ -5104,7 +5288,7 @@ static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; *val = 0; @@ -5117,7 +5301,7 @@ static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset) { - int i; + unsigned long i; struct kvm_vcpu *vcpu; kvm_for_each_vcpu(i, vcpu, kvm) diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h new file mode 100644 index 000000000000..34ca40823260 --- /dev/null +++ b/virt/kvm/kvm_mm.h @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#ifndef __KVM_MM_H__ +#define __KVM_MM_H__ 1 + +/* + * Architectures can choose whether to use an rwlock or spinlock + * for the mmu_lock. These macros, for use in common code + * only, avoids using #ifdefs in places that must deal with + * multiple architectures. + */ + +#ifdef KVM_HAVE_MMU_RWLOCK +#define KVM_MMU_LOCK_INIT(kvm) rwlock_init(&(kvm)->mmu_lock) +#define KVM_MMU_LOCK(kvm) write_lock(&(kvm)->mmu_lock) +#define KVM_MMU_UNLOCK(kvm) write_unlock(&(kvm)->mmu_lock) +#define KVM_MMU_READ_LOCK(kvm) read_lock(&(kvm)->mmu_lock) +#define KVM_MMU_READ_UNLOCK(kvm) read_unlock(&(kvm)->mmu_lock) +#else +#define KVM_MMU_LOCK_INIT(kvm) spin_lock_init(&(kvm)->mmu_lock) +#define KVM_MMU_LOCK(kvm) spin_lock(&(kvm)->mmu_lock) +#define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) +#define KVM_MMU_READ_LOCK(kvm) spin_lock(&(kvm)->mmu_lock) +#define KVM_MMU_READ_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) +#endif /* KVM_HAVE_MMU_RWLOCK */ + +kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, + bool write_fault, bool *writable); + +#ifdef CONFIG_HAVE_KVM_PFNCACHE +void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, + unsigned long start, + unsigned long end, + bool may_block); +#else +static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, + unsigned long start, + unsigned long end, + bool may_block) +{ +} +#endif /* HAVE_KVM_PFNCACHE */ + +#endif /* __KVM_MM_H__ */ diff --git a/virt/kvm/mmu_lock.h b/virt/kvm/mmu_lock.h deleted file mode 100644 index 9e1308f9734c..000000000000 --- a/virt/kvm/mmu_lock.h +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#ifndef KVM_MMU_LOCK_H -#define KVM_MMU_LOCK_H 1 - -/* - * Architectures can choose whether to use an rwlock or spinlock - * for the mmu_lock. These macros, for use in common code - * only, avoids using #ifdefs in places that must deal with - * multiple architectures. - */ - -#ifdef KVM_HAVE_MMU_RWLOCK -#define KVM_MMU_LOCK_INIT(kvm) rwlock_init(&(kvm)->mmu_lock) -#define KVM_MMU_LOCK(kvm) write_lock(&(kvm)->mmu_lock) -#define KVM_MMU_UNLOCK(kvm) write_unlock(&(kvm)->mmu_lock) -#else -#define KVM_MMU_LOCK_INIT(kvm) spin_lock_init(&(kvm)->mmu_lock) -#define KVM_MMU_LOCK(kvm) spin_lock(&(kvm)->mmu_lock) -#define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) -#endif /* KVM_HAVE_MMU_RWLOCK */ - -#endif diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c new file mode 100644 index 000000000000..ce878f4be4da --- /dev/null +++ b/virt/kvm/pfncache.c @@ -0,0 +1,337 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Kernel-based Virtual Machine driver for Linux + * + * This module enables kernel and guest-mode vCPU access to guest physical + * memory with suitable invalidation mechanisms. + * + * Copyright © 2021 Amazon.com, Inc. or its affiliates. + * + * Authors: + * David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/errno.h> + +#include "kvm_mm.h" + +/* + * MMU notifier 'invalidate_range_start' hook. + */ +void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start, + unsigned long end, bool may_block) +{ + DECLARE_BITMAP(vcpu_bitmap, KVM_MAX_VCPUS); + struct gfn_to_pfn_cache *gpc; + bool wake_vcpus = false; + + spin_lock(&kvm->gpc_lock); + list_for_each_entry(gpc, &kvm->gpc_list, list) { + write_lock_irq(&gpc->lock); + + /* Only a single page so no need to care about length */ + if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) && + gpc->uhva >= start && gpc->uhva < end) { + gpc->valid = false; + + /* + * If a guest vCPU could be using the physical address, + * it needs to be woken. + */ + if (gpc->guest_uses_pa) { + if (!wake_vcpus) { + wake_vcpus = true; + bitmap_zero(vcpu_bitmap, KVM_MAX_VCPUS); + } + __set_bit(gpc->vcpu->vcpu_idx, vcpu_bitmap); + } + + /* + * We cannot call mark_page_dirty() from here because + * this physical CPU might not have an active vCPU + * with which to do the KVM dirty tracking. + * + * Neither is there any point in telling the kernel MM + * that the underlying page is dirty. A vCPU in guest + * mode might still be writing to it up to the point + * where we wake them a few lines further down anyway. + * + * So all the dirty marking happens on the unmap. + */ + } + write_unlock_irq(&gpc->lock); + } + spin_unlock(&kvm->gpc_lock); + + if (wake_vcpus) { + unsigned int req = KVM_REQ_GPC_INVALIDATE; + bool called; + + /* + * If the OOM reaper is active, then all vCPUs should have + * been stopped already, so perform the request without + * KVM_REQUEST_WAIT and be sad if any needed to be woken. + */ + if (!may_block) + req &= ~KVM_REQUEST_WAIT; + + called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap); + + WARN_ON_ONCE(called && !may_block); + } +} + +bool kvm_gfn_to_pfn_cache_check(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, + gpa_t gpa, unsigned long len) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + + if ((gpa & ~PAGE_MASK) + len > PAGE_SIZE) + return false; + + if (gpc->gpa != gpa || gpc->generation != slots->generation || + kvm_is_error_hva(gpc->uhva)) + return false; + + if (!gpc->valid) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_check); + +static void __release_gpc(struct kvm *kvm, kvm_pfn_t pfn, void *khva, + gpa_t gpa, bool dirty) +{ + /* Unmap the old page if it was mapped before, and release it */ + if (!is_error_noslot_pfn(pfn)) { + if (khva) { + if (pfn_valid(pfn)) + kunmap(pfn_to_page(pfn)); +#ifdef CONFIG_HAS_IOMEM + else + memunmap(khva); +#endif + } + + kvm_release_pfn(pfn, dirty); + if (dirty) + mark_page_dirty(kvm, gpa); + } +} + +static kvm_pfn_t hva_to_pfn_retry(struct kvm *kvm, unsigned long uhva) +{ + unsigned long mmu_seq; + kvm_pfn_t new_pfn; + int retry; + + do { + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* We always request a writeable mapping */ + new_pfn = hva_to_pfn(uhva, false, NULL, true, NULL); + if (is_error_noslot_pfn(new_pfn)) + break; + + KVM_MMU_READ_LOCK(kvm); + retry = mmu_notifier_retry_hva(kvm, mmu_seq, uhva); + KVM_MMU_READ_UNLOCK(kvm); + if (!retry) + break; + + cond_resched(); + } while (1); + + return new_pfn; +} + +int kvm_gfn_to_pfn_cache_refresh(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, + gpa_t gpa, unsigned long len, bool dirty) +{ + struct kvm_memslots *slots = kvm_memslots(kvm); + unsigned long page_offset = gpa & ~PAGE_MASK; + kvm_pfn_t old_pfn, new_pfn; + unsigned long old_uhva; + gpa_t old_gpa; + void *old_khva; + bool old_valid, old_dirty; + int ret = 0; + + /* + * If must fit within a single page. The 'len' argument is + * only to enforce that. + */ + if (page_offset + len > PAGE_SIZE) + return -EINVAL; + + write_lock_irq(&gpc->lock); + + old_gpa = gpc->gpa; + old_pfn = gpc->pfn; + old_khva = gpc->khva - offset_in_page(gpc->khva); + old_uhva = gpc->uhva; + old_valid = gpc->valid; + old_dirty = gpc->dirty; + + /* If the userspace HVA is invalid, refresh that first */ + if (gpc->gpa != gpa || gpc->generation != slots->generation || + kvm_is_error_hva(gpc->uhva)) { + gfn_t gfn = gpa_to_gfn(gpa); + + gpc->dirty = false; + gpc->gpa = gpa; + gpc->generation = slots->generation; + gpc->memslot = __gfn_to_memslot(slots, gfn); + gpc->uhva = gfn_to_hva_memslot(gpc->memslot, gfn); + + if (kvm_is_error_hva(gpc->uhva)) { + ret = -EFAULT; + goto out; + } + + gpc->uhva += page_offset; + } + + /* + * If the userspace HVA changed or the PFN was already invalid, + * drop the lock and do the HVA to PFN lookup again. + */ + if (!old_valid || old_uhva != gpc->uhva) { + unsigned long uhva = gpc->uhva; + void *new_khva = NULL; + + /* Placeholders for "hva is valid but not yet mapped" */ + gpc->pfn = KVM_PFN_ERR_FAULT; + gpc->khva = NULL; + gpc->valid = true; + + write_unlock_irq(&gpc->lock); + + new_pfn = hva_to_pfn_retry(kvm, uhva); + if (is_error_noslot_pfn(new_pfn)) { + ret = -EFAULT; + goto map_done; + } + + if (gpc->kernel_map) { + if (new_pfn == old_pfn) { + new_khva = old_khva; + old_pfn = KVM_PFN_ERR_FAULT; + old_khva = NULL; + } else if (pfn_valid(new_pfn)) { + new_khva = kmap(pfn_to_page(new_pfn)); +#ifdef CONFIG_HAS_IOMEM + } else { + new_khva = memremap(pfn_to_hpa(new_pfn), PAGE_SIZE, MEMREMAP_WB); +#endif + } + if (new_khva) + new_khva += page_offset; + else + ret = -EFAULT; + } + + map_done: + write_lock_irq(&gpc->lock); + if (ret) { + gpc->valid = false; + gpc->pfn = KVM_PFN_ERR_FAULT; + gpc->khva = NULL; + } else { + /* At this point, gpc->valid may already have been cleared */ + gpc->pfn = new_pfn; + gpc->khva = new_khva; + } + } else { + /* If the HVA→PFN mapping was already valid, don't unmap it. */ + old_pfn = KVM_PFN_ERR_FAULT; + old_khva = NULL; + } + + out: + if (ret) + gpc->dirty = false; + else + gpc->dirty = dirty; + + write_unlock_irq(&gpc->lock); + + __release_gpc(kvm, old_pfn, old_khva, old_gpa, old_dirty); + + return ret; +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_refresh); + +void kvm_gfn_to_pfn_cache_unmap(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) +{ + void *old_khva; + kvm_pfn_t old_pfn; + bool old_dirty; + gpa_t old_gpa; + + write_lock_irq(&gpc->lock); + + gpc->valid = false; + + old_khva = gpc->khva - offset_in_page(gpc->khva); + old_dirty = gpc->dirty; + old_gpa = gpc->gpa; + old_pfn = gpc->pfn; + + /* + * We can leave the GPA → uHVA map cache intact but the PFN + * lookup will need to be redone even for the same page. + */ + gpc->khva = NULL; + gpc->pfn = KVM_PFN_ERR_FAULT; + + write_unlock_irq(&gpc->lock); + + __release_gpc(kvm, old_pfn, old_khva, old_gpa, old_dirty); +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_unmap); + + +int kvm_gfn_to_pfn_cache_init(struct kvm *kvm, struct gfn_to_pfn_cache *gpc, + struct kvm_vcpu *vcpu, bool guest_uses_pa, + bool kernel_map, gpa_t gpa, unsigned long len, + bool dirty) +{ + if (!gpc->active) { + rwlock_init(&gpc->lock); + + gpc->khva = NULL; + gpc->pfn = KVM_PFN_ERR_FAULT; + gpc->uhva = KVM_HVA_ERR_BAD; + gpc->vcpu = vcpu; + gpc->kernel_map = kernel_map; + gpc->guest_uses_pa = guest_uses_pa; + gpc->valid = false; + gpc->active = true; + + spin_lock(&kvm->gpc_lock); + list_add(&gpc->list, &kvm->gpc_list); + spin_unlock(&kvm->gpc_lock); + } + return kvm_gfn_to_pfn_cache_refresh(kvm, gpc, gpa, len, dirty); +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_init); + +void kvm_gfn_to_pfn_cache_destroy(struct kvm *kvm, struct gfn_to_pfn_cache *gpc) +{ + if (gpc->active) { + spin_lock(&kvm->gpc_lock); + list_del(&gpc->list); + spin_unlock(&kvm->gpc_lock); + + kvm_gfn_to_pfn_cache_unmap(kvm, gpc); + gpc->active = false; + } +} +EXPORT_SYMBOL_GPL(kvm_gfn_to_pfn_cache_destroy); |