From 037bc38b298c9a8de64f84b253c0b472228bbb10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:03 -0700 Subject: KVM: Drop KVM_ERR_PTR_BAD_PAGE and instead return NULL to indicate an error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove KVM_ERR_PTR_BAD_PAGE and instead return NULL, as "bad page" is just a leftover bit of weirdness from days of old when KVM stuffed a "bad" page into the guest instead of actually handling missing pages. See commit cea7bb21280e ("KVM: MMU: Make gfn_to_page() always safe"). Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-2-seanjc@google.com> --- include/linux/kvm_host.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 45be36e5285f..0b280e5bad0a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -153,13 +153,6 @@ static inline bool kvm_is_error_gpa(gpa_t gpa) return gpa == INVALID_GPA; } -#define KVM_ERR_PTR_BAD_PAGE (ERR_PTR(-ENOENT)) - -static inline bool is_error_page(struct page *page) -{ - return IS_ERR(page); -} - #define KVM_REQUEST_MASK GENMASK(7,0) #define KVM_REQUEST_NO_WAKEUP BIT(8) #define KVM_REQUEST_WAIT BIT(9) -- cgit v1.2.3 From 3af91068b7e10dba438f70eae94d877f20842fa1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:05 -0700 Subject: KVM: Add kvm_release_page_unused() API to put pages that KVM never consumes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an API to release an unused page, i.e. to put a page without marking it accessed or dirty. The API will be used when KVM faults-in a page but bails before installing the guest mapping (and other similar flows). Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-4-seanjc@google.com> --- include/linux/kvm_host.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0b280e5bad0a..8de9acb0b35e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1216,6 +1216,15 @@ unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn, bool *writable); + +static inline void kvm_release_page_unused(struct page *page) +{ + if (!page) + return; + + put_page(page); +} + void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); -- cgit v1.2.3 From 6419bc52072b928acb764766968c672d5fede802 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:13 -0700 Subject: KVM: Rename gfn_to_page_many_atomic() to kvm_prefetch_pages() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename gfn_to_page_many_atomic() to kvm_prefetch_pages() to try and communicate its true purpose, as the "atomic" aspect is essentially a side effect of the fact that x86 uses the API while holding mmu_lock. E.g. even if mmu_lock weren't held, KVM wouldn't want to fault-in pages, as the goal is to opportunistically grab surrounding pages that have already been accessed and/or dirtied by the host, and to do so quickly. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-12-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 993eeba32487..37c2f8d11e05 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2965,7 +2965,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, if (!slot) return -1; - ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); + ret = kvm_prefetch_pages(slot, gfn, pages, end - start); if (ret <= 0) return -1; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 36b2607280f0..143b7e9f26dc 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -549,7 +549,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, if (!slot) return false; - if (gfn_to_page_many_atomic(slot, gfn, &page, 1) != 1) + if (kvm_prefetch_pages(slot, gfn, &page, 1) != 1) return false; mmu_set_spte(vcpu, slot, spte, pte_access, gfn, page_to_pfn(page), NULL); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8de9acb0b35e..6a3976c1a218 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1207,8 +1207,8 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm); void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot); -int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, - struct page **pages, int nr_pages); +int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, + struct page **pages, int nr_pages); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e7561ca96a09..aa7ae0f0f90e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3041,8 +3041,8 @@ kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_pfn); -int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, - struct page **pages, int nr_pages) +int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, + struct page **pages, int nr_pages) { unsigned long addr; gfn_t entry = 0; @@ -3056,7 +3056,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } -EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); +EXPORT_SYMBOL_GPL(kvm_prefetch_pages); /* * Do not use this helper unless you are absolutely certain the gfn _must_ be -- cgit v1.2.3 From e2d2ca71ac03c748dbc44e0dd7dc1557befb1ab6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:14 -0700 Subject: KVM: Drop @atomic param from gfn=>pfn and hva=>pfn APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop @atomic from the myriad "to_pfn" APIs now that all callers pass "false", and remove a comment blurb about KVM running only the "GUP fast" part in atomic context. No functional change intended. Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-13-seanjc@google.com> --- Documentation/virt/kvm/locking.rst | 4 ++-- arch/arm64/kvm/mmu.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/x86/kvm/mmu/mmu.c | 12 ++++++------ include/linux/kvm_host.h | 3 +-- virt/kvm/kvm_main.c | 33 ++++++++------------------------- virt/kvm/kvm_mm.h | 4 ++-- virt/kvm/pfncache.c | 2 +- 9 files changed, 23 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 693090bfc66d..f463ac42ac7a 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -135,8 +135,8 @@ We dirty-log for gfn1, that means gfn2 is lost in dirty-bitmap. For direct sp, we can easily avoid it since the spte of direct sp is fixed to gfn. For indirect sp, we disabled fast page fault for simplicity. -A solution for indirect sp could be to pin the gfn, for example via -gfn_to_pfn_memslot_atomic, before the cmpxchg. After the pinning: +A solution for indirect sp could be to pin the gfn before the cmpxchg. After +the pinning: - We have held the refcount of pfn; that means the pfn can not be freed and be reused for another gfn. diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0f7658aefa1a..9fbc79fad292 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1570,7 +1570,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmu_seq = vcpu->kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, write_fault, &writable, NULL); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 1b51b1c4713b..8cd02ca4b1b8 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -613,7 +613,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, write_ok = true; } else { /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, writing, &write_ok, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 408d98f8a514..26a969e935e3 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -852,7 +852,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, unsigned long pfn; /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL, + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, writing, upgrade_p, NULL); if (is_error_noslot_pfn(pfn)) return -EFAULT; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 37c2f8d11e05..e5e0bf7593e7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4387,9 +4387,9 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return kvm_faultin_pfn_private(vcpu, fault); async = false; - fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, false, - &async, fault->write, - &fault->map_writable, &fault->hva); + fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, &async, + fault->write, &fault->map_writable, + &fault->hva); if (!async) return RET_PF_CONTINUE; /* *pfn has correct page already */ @@ -4409,9 +4409,9 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ - fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, - NULL, fault->write, - &fault->map_writable, &fault->hva); + fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, true, NULL, + fault->write, &fault->map_writable, + &fault->hva); return RET_PF_CONTINUE; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6a3976c1a218..32e23e05a8c3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1232,9 +1232,8 @@ kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); -kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool atomic, bool interruptible, bool *async, + bool interruptible, bool *async, bool write_fault, bool *writable, hva_t *hva); void kvm_release_pfn_clean(kvm_pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aa7ae0f0f90e..c7506eb23086 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2756,8 +2756,7 @@ static inline int check_user_page_hwpoison(unsigned long addr) /* * The fast path to get the writable pfn which will be stored in @pfn, - * true indicates success, otherwise false is returned. It's also the - * only part that runs if we can in atomic context. + * true indicates success, otherwise false is returned. */ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, bool *writable, kvm_pfn_t *pfn) @@ -2922,7 +2921,6 @@ out: /* * Pin guest page in memory and return its pfn. * @addr: host virtual address which maps memory to the guest - * @atomic: whether this function is forbidden from sleeping * @interruptible: whether the process can be interrupted by non-fatal signals * @async: whether this function need to wait IO complete if the * host page is not in the memory @@ -2934,22 +2932,16 @@ out: * 2): @write_fault = false && @writable, @writable will tell the caller * whether the mapping is writable. */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, - bool *async, bool write_fault, bool *writable) +kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, + bool write_fault, bool *writable) { struct vm_area_struct *vma; kvm_pfn_t pfn; int npages, r; - /* we can do it either atomically or asynchronously, not both */ - BUG_ON(atomic && async); - if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) return pfn; - if (atomic) - return KVM_PFN_ERR_FAULT; - npages = hva_to_pfn_slow(addr, async, write_fault, interruptible, writable, &pfn); if (npages == 1) @@ -2986,7 +2978,7 @@ exit: } kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool atomic, bool interruptible, bool *async, + bool interruptible, bool *async, bool write_fault, bool *writable, hva_t *hva) { unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); @@ -3008,33 +3000,24 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, writable = NULL; } - return hva_to_pfn(addr, atomic, interruptible, async, write_fault, - writable); + return hva_to_pfn(addr, interruptible, async, write_fault, writable); } EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) { - return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false, - NULL, write_fault, writable, NULL); + return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, + write_fault, writable, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true, - NULL, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); -kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn) -{ - return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true, - NULL, NULL); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); - kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) { return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 715f19669d01..a3fa86f60d6c 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -20,8 +20,8 @@ #define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) #endif /* KVM_HAVE_MMU_RWLOCK */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, - bool *async, bool write_fault, bool *writable); +kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, + bool write_fault, bool *writable); #ifdef CONFIG_HAVE_KVM_PFNCACHE void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index f0039efb9e1e..58c706a610e5 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -198,7 +198,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) } /* We always request a writeable mapping */ - new_pfn = hva_to_pfn(gpc->uhva, false, false, NULL, true, NULL); + new_pfn = hva_to_pfn(gpc->uhva, false, NULL, true, NULL); if (is_error_noslot_pfn(new_pfn)) goto out_error; -- cgit v1.2.3 From 6769d1bcd3509ad2d2ee04da122c465a11a165b4 Mon Sep 17 00:00:00 2001 From: David Stevens Date: Thu, 10 Oct 2024 11:23:18 -0700 Subject: KVM: Replace "async" pointer in gfn=>pfn with "no_wait" and error code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a pfn error code to communicate that hva_to_pfn() failed because I/O was needed and disallowed, and convert @async to a constant @no_wait boolean. This will allow eliminating the @no_wait param by having callers pass in FOLL_NOWAIT along with other FOLL_* flags. Tested-by: Alex Bennée Signed-off-by: David Stevens Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-17-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 18 +++++++++++------- include/linux/kvm_host.h | 3 ++- virt/kvm/kvm_main.c | 27 ++++++++++++++------------- virt/kvm/kvm_mm.h | 2 +- virt/kvm/pfncache.c | 4 ++-- 5 files changed, 30 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e5e0bf7593e7..cae81209f9ed 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4381,17 +4381,21 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { - bool async; - if (fault->is_private) return kvm_faultin_pfn_private(vcpu, fault); - async = false; - fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, &async, + fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, fault->write, &fault->map_writable, &fault->hva); - if (!async) - return RET_PF_CONTINUE; /* *pfn has correct page already */ + + /* + * If resolving the page failed because I/O is needed to fault-in the + * page, then either set up an asynchronous #PF to do the I/O, or if + * doing an async #PF isn't possible, retry with I/O allowed. All + * other failures are terminal, i.e. retrying won't help. + */ + if (fault->pfn != KVM_PFN_ERR_NEEDS_IO) + return RET_PF_CONTINUE; if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) { trace_kvm_try_async_get_page(fault->addr, fault->gfn); @@ -4409,7 +4413,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * to wait for IO. Note, gup always bails if it is unable to quickly * get a page and a fatal signal, i.e. SIGKILL, is pending. */ - fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, true, NULL, + fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, true, true, fault->write, &fault->map_writable, &fault->hva); return RET_PF_CONTINUE; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 32e23e05a8c3..dc15a9a64408 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -97,6 +97,7 @@ #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) #define KVM_PFN_ERR_SIGPENDING (KVM_PFN_ERR_MASK + 3) +#define KVM_PFN_ERR_NEEDS_IO (KVM_PFN_ERR_MASK + 4) /* * error pfns indicate that the gfn is in slot but faild to @@ -1233,7 +1234,7 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool interruptible, bool *async, + bool interruptible, bool no_wait, bool write_fault, bool *writable, hva_t *hva); void kvm_release_pfn_clean(kvm_pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f5b7fd653341..d7a72278c033 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2778,7 +2778,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, * The slow path to get the pfn of the specified host virtual address, * 1 indicates success, -errno is returned if error is detected. */ -static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, +static int hva_to_pfn_slow(unsigned long addr, bool no_wait, bool write_fault, bool interruptible, bool *writable, kvm_pfn_t *pfn) { /* @@ -2801,7 +2801,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, if (write_fault) flags |= FOLL_WRITE; - if (async) + if (no_wait) flags |= FOLL_NOWAIT; if (interruptible) flags |= FOLL_INTERRUPTIBLE; @@ -2912,8 +2912,8 @@ out: * Pin guest page in memory and return its pfn. * @addr: host virtual address which maps memory to the guest * @interruptible: whether the process can be interrupted by non-fatal signals - * @async: whether this function need to wait IO complete if the - * host page is not in the memory + * @no_wait: whether or not this function need to wait IO complete if the + * host page is not in the memory * @write_fault: whether we should get a writable host page * @writable: whether it allows to map a writable host page for !@write_fault * @@ -2922,7 +2922,7 @@ out: * 2): @write_fault = false && @writable, @writable will tell the caller * whether the mapping is writable. */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, +kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool no_wait, bool write_fault, bool *writable) { struct vm_area_struct *vma; @@ -2934,7 +2934,7 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) return pfn; - npages = hva_to_pfn_slow(addr, async, write_fault, interruptible, + npages = hva_to_pfn_slow(addr, no_wait, write_fault, interruptible, writable, &pfn); if (npages == 1) return pfn; @@ -2956,16 +2956,17 @@ retry: if (r < 0) pfn = KVM_PFN_ERR_FAULT; } else { - if (async && vma_is_valid(vma, write_fault)) - *async = true; - pfn = KVM_PFN_ERR_FAULT; + if (no_wait && vma_is_valid(vma, write_fault)) + pfn = KVM_PFN_ERR_NEEDS_IO; + else + pfn = KVM_PFN_ERR_FAULT; } mmap_read_unlock(current->mm); return pfn; } kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool interruptible, bool *async, + bool interruptible, bool no_wait, bool write_fault, bool *writable, hva_t *hva) { unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); @@ -2987,21 +2988,21 @@ kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, writable = NULL; } - return hva_to_pfn(addr, interruptible, async, write_fault, writable); + return hva_to_pfn(addr, interruptible, no_wait, write_fault, writable); } EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) { - return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, + return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false, write_fault, writable, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, false, true, NULL, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index a3fa86f60d6c..51f3fee4ca3f 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -20,7 +20,7 @@ #define KVM_MMU_UNLOCK(kvm) spin_unlock(&(kvm)->mmu_lock) #endif /* KVM_HAVE_MMU_RWLOCK */ -kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool *async, +kvm_pfn_t hva_to_pfn(unsigned long addr, bool interruptible, bool no_wait, bool write_fault, bool *writable); #ifdef CONFIG_HAVE_KVM_PFNCACHE diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 58c706a610e5..32dc61f48c81 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -197,8 +197,8 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) cond_resched(); } - /* We always request a writeable mapping */ - new_pfn = hva_to_pfn(gpc->uhva, false, NULL, true, NULL); + /* We always request a writable mapping */ + new_pfn = hva_to_pfn(gpc->uhva, false, false, true, NULL); if (is_error_noslot_pfn(new_pfn)) goto out_error; -- cgit v1.2.3 From cccefb0a0d3b4f7b41b1921538087dd7031876ac Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:20 -0700 Subject: KVM: Drop unused "hva" pointer from __gfn_to_pfn_memslot() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop @hva from __gfn_to_pfn_memslot() now that all callers pass NULL. No functional change intended. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-19-seanjc@google.com> --- arch/arm64/kvm/mmu.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 2 +- arch/x86/kvm/mmu/mmu.c | 6 ++---- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 9 +++------ 6 files changed, 9 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 9fbc79fad292..246c820379ec 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1571,7 +1571,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmap_read_unlock(current->mm); pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - write_fault, &writable, NULL); + write_fault, &writable); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8cd02ca4b1b8..2f1d58984b41 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -614,7 +614,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, } else { /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, &write_ok, NULL); + writing, &write_ok); if (is_error_noslot_pfn(pfn)) return -EFAULT; page = NULL; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 26a969e935e3..8304b6f8fe45 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -853,7 +853,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, upgrade_p, NULL); + writing, upgrade_p); if (is_error_noslot_pfn(pfn)) return -EFAULT; page = NULL; diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d4e21071b78e..688202ac50c2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4384,8 +4384,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return kvm_faultin_pfn_private(vcpu, fault); fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, - fault->write, &fault->map_writable, - NULL); + fault->write, &fault->map_writable); /* * If resolving the page failed because I/O is needed to fault-in the @@ -4413,8 +4412,7 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault * get a page and a fatal signal, i.e. SIGKILL, is pending. */ fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, true, true, - fault->write, &fault->map_writable, - NULL); + fault->write, &fault->map_writable); return RET_PF_CONTINUE; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index dc15a9a64408..2c9eb472f059 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1235,7 +1235,7 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, bool interruptible, bool no_wait, - bool write_fault, bool *writable, hva_t *hva); + bool write_fault, bool *writable); void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d7a72278c033..9de915a56bd5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2967,13 +2967,10 @@ retry: kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, bool interruptible, bool no_wait, - bool write_fault, bool *writable, hva_t *hva) + bool write_fault, bool *writable) { unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); - if (hva) - *hva = addr; - if (kvm_is_error_hva(addr)) { if (writable) *writable = false; @@ -2996,13 +2993,13 @@ kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable) { return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false, - write_fault, writable, NULL); + write_fault, writable); } EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) { - return __gfn_to_pfn_memslot(slot, gfn, false, false, true, NULL, NULL); + return __gfn_to_pfn_memslot(slot, gfn, false, false, true, NULL); } EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); -- cgit v1.2.3 From ef7db98e477f4b379fac3131bf94c33774c4a211 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:24 -0700 Subject: KVM: Use NULL for struct page pointer to indicate mremapped memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop yet another unnecessary magic page value from KVM, as there's zero reason to use a poisoned pointer to indicate "no page". If KVM uses a NULL page pointer, the kernel will explode just as quickly as if KVM uses a poisoned pointer. Never mind the fact that such usage would be a blatant and egregious KVM bug. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-23-seanjc@google.com> --- include/linux/kvm_host.h | 4 ---- virt/kvm/kvm_main.c | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 2c9eb472f059..cd6f5cc1930f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -273,16 +273,12 @@ enum { READING_SHADOW_PAGE_TABLES, }; -#define KVM_UNMAPPED_PAGE ((void *) 0x500 + POISON_POINTER_DELTA) - struct kvm_host_map { /* * Only valid if the 'pfn' is managed by the host kernel (i.e. There is * a 'struct page' for it. When using mem= kernel parameter some memory * can be used as guest memory but they are not managed by host * kernel). - * If 'pfn' is not managed by the host kernel, this field is - * initialized to KVM_UNMAPPED_PAGE. */ struct page *page; void *hva; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4074f49eb3f1..c20386a8aa3e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3061,7 +3061,7 @@ void kvm_release_pfn(kvm_pfn_t pfn, bool dirty) int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) { - map->page = KVM_UNMAPPED_PAGE; + map->page = NULL; map->hva = NULL; map->gfn = gfn; @@ -3087,7 +3087,7 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) if (!map->hva) return; - if (map->page != KVM_UNMAPPED_PAGE) + if (map->page) kunmap(map->page); #ifdef CONFIG_HAS_IOMEM else -- cgit v1.2.3 From 2ff072ba7ad2deb1c3b2d231faa0ac3a25e2451b Mon Sep 17 00:00:00 2001 From: David Stevens Date: Thu, 10 Oct 2024 11:23:32 -0700 Subject: KVM: Migrate kvm_vcpu_map() to kvm_follow_pfn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrate kvm_vcpu_map() to kvm_follow_pfn(), and have it track whether or not the map holds a refcounted struct page. Precisely tracking struct page references will eventually allow removing kvm_pfn_to_refcounted_page() and its various wrappers. Signed-off-by: David Stevens [sean: use a pointer instead of a boolean] Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-31-seanjc@google.com> --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 26 ++++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cd6f5cc1930f..35e1beb017dd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -280,6 +280,7 @@ struct kvm_host_map { * can be used as guest memory but they are not managed by host * kernel). */ + struct page *refcounted_page; struct page *page; void *hva; kvm_pfn_t pfn; @@ -1238,7 +1239,6 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 64888257e301..842a5d5f3120 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3087,21 +3087,21 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_page); -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty) -{ - if (dirty) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); -} - int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) { + struct kvm_follow_pfn kfp = { + .slot = gfn_to_memslot(vcpu->kvm, gfn), + .gfn = gfn, + .flags = FOLL_WRITE, + .refcounted_page = &map->refcounted_page, + }; + + map->refcounted_page = NULL; map->page = NULL; map->hva = NULL; map->gfn = gfn; - map->pfn = gfn_to_pfn(vcpu->kvm, gfn); + map->pfn = kvm_follow_pfn(&kfp); if (is_error_noslot_pfn(map->pfn)) return -EINVAL; @@ -3133,10 +3133,16 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) if (dirty) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); - kvm_release_pfn(map->pfn, dirty); + if (map->refcounted_page) { + if (dirty) + kvm_release_page_dirty(map->refcounted_page); + else + kvm_release_page_clean(map->refcounted_page); + } map->hva = NULL; map->page = NULL; + map->refcounted_page = NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); -- cgit v1.2.3 From 2bcb52a3602bf4cbc55d8fb4da00c930f83d7789 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:33 -0700 Subject: KVM: Pin (as in FOLL_PIN) pages during kvm_vcpu_map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pin, as in FOLL_PIN, pages when mapping them for direct access by KVM. As per Documentation/core-api/pin_user_pages.rst, writing to a page that was gotten via FOLL_GET is explicitly disallowed. Correct (uses FOLL_PIN calls): pin_user_pages() write to the data within the pages unpin_user_pages() INCORRECT (uses FOLL_GET calls): get_user_pages() write to the data within the pages put_page() Unfortunately, FOLL_PIN is a "private" flag, and so kvm_follow_pfn must use a one-off bool instead of being able to piggyback the "flags" field. Link: https://lwn.net/Articles/930667 Link: https://lore.kernel.org/all/cover.1683044162.git.lstoakes@gmail.com Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-32-seanjc@google.com> --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 54 ++++++++++++++++++++++++++++++++++-------------- virt/kvm/kvm_mm.h | 7 +++++++ 3 files changed, 47 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 35e1beb017dd..b4c541fa5a1f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -280,7 +280,7 @@ struct kvm_host_map { * can be used as guest memory but they are not managed by host * kernel). */ - struct page *refcounted_page; + struct page *pinned_page; struct page *page; void *hva; kvm_pfn_t pfn; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 842a5d5f3120..0d59f47f099e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2814,9 +2814,12 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, */ if (map) { pfn = map->pfn; - page = kvm_pfn_to_refcounted_page(pfn); - if (page && !get_page_unless_zero(page)) - return KVM_PFN_ERR_FAULT; + + if (!kfp->pin) { + page = kvm_pfn_to_refcounted_page(pfn); + if (page && !get_page_unless_zero(page)) + return KVM_PFN_ERR_FAULT; + } } else { pfn = page_to_pfn(page); } @@ -2834,16 +2837,24 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { struct page *page; + bool r; /* - * Fast pin a writable pfn only if it is a write fault request - * or the caller allows to map a writable pfn for a read fault - * request. + * Try the fast-only path when the caller wants to pin/get the page for + * writing. If the caller only wants to read the page, KVM must go + * down the full, slow path in order to avoid racing an operation that + * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing + * at the old, read-only page while mm/ points at a new, writable page. */ if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable)) return false; - if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page)) { + if (kfp->pin) + r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1; + else + r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page); + + if (r) { *pfn = kvm_resolve_pfn(kfp, page, NULL, true); return true; } @@ -2872,10 +2883,21 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) struct page *page, *wpage; int npages; - npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); + if (kfp->pin) + npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags); + else + npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); if (npages != 1) return npages; + /* + * Pinning is mutually exclusive with opportunistically mapping a read + * fault as writable, as KVM should never pin pages when mapping memory + * into the guest (pinning is only for direct accesses from KVM). + */ + if (WARN_ON_ONCE(kfp->map_writable && kfp->pin)) + goto out; + /* map read fault as writable if possible */ if (!(flags & FOLL_WRITE) && kfp->map_writable && get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { @@ -2884,6 +2906,7 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) flags |= FOLL_WRITE; } +out: *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE); return npages; } @@ -3093,10 +3116,11 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) .slot = gfn_to_memslot(vcpu->kvm, gfn), .gfn = gfn, .flags = FOLL_WRITE, - .refcounted_page = &map->refcounted_page, + .refcounted_page = &map->pinned_page, + .pin = true, }; - map->refcounted_page = NULL; + map->pinned_page = NULL; map->page = NULL; map->hva = NULL; map->gfn = gfn; @@ -3133,16 +3157,16 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) if (dirty) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); - if (map->refcounted_page) { + if (map->pinned_page) { if (dirty) - kvm_release_page_dirty(map->refcounted_page); - else - kvm_release_page_clean(map->refcounted_page); + kvm_set_page_dirty(map->pinned_page); + kvm_set_page_accessed(map->pinned_page); + unpin_user_page(map->pinned_page); } map->hva = NULL; map->page = NULL; - map->refcounted_page = NULL; + map->pinned_page = NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index d3ac1ba8ba66..acef3f5c582a 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -30,6 +30,13 @@ struct kvm_follow_pfn { /* FOLL_* flags modifying lookup behavior, e.g. FOLL_WRITE. */ unsigned int flags; + /* + * Pin the page (effectively FOLL_PIN, which is an mm/ internal flag). + * The page *must* be pinned if KVM will write to the page via a kernel + * mapping, e.g. via kmap(), mremap(), etc. + */ + bool pin; + /* * If non-NULL, try to get a writable mapping even for a read fault. * Set to true if a writable mapping was obtained. -- cgit v1.2.3 From 365e319208442a0807a96e9ea4d0b1fa338f1929 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:35 -0700 Subject: KVM: Pass in write/dirty to kvm_vcpu_map(), not kvm_vcpu_unmap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that all kvm_vcpu_{,un}map() users pass "true" for @dirty, have them pass "true" as a @writable param to kvm_vcpu_map(), and thus create a read-only mapping when possible. Note, creating read-only mappings can be theoretically slower, as they don't play nice with fast GUP due to the need to break CoW before mapping the underlying PFN. But practically speaking, creating a mapping isn't a super hot path, and getting a writable mapping for reading is weird and confusing. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-34-seanjc@google.com> --- arch/x86/kvm/svm/nested.c | 4 ++-- arch/x86/kvm/svm/sev.c | 2 +- arch/x86/kvm/svm/svm.c | 8 ++++---- arch/x86/kvm/vmx/nested.c | 16 ++++++++-------- include/linux/kvm_host.h | 20 ++++++++++++++++++-- virt/kvm/kvm_main.c | 12 +++++++----- 6 files changed, 40 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index cf84103ce38b..b708bdf7eaff 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -926,7 +926,7 @@ out_exit_err: nested_svm_vmexit(svm); out: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -1130,7 +1130,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_int_info_err, KVM_ISA_SVM); - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); nested_svm_transition_tlb_flush(vcpu); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0b851ef937f2..4557ff3804ae 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3468,7 +3468,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true); + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); svm->sev_es.ghcb = NULL; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9df3e1e5ae81..c1e29307826b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2299,7 +2299,7 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) svm_copy_vmloadsave_state(vmcb12, svm->vmcb); } - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -4714,7 +4714,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) svm_copy_vmrun_state(map_save.hva + 0x400, &svm->vmcb01.ptr->save); - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); return 0; } @@ -4774,9 +4774,9 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) svm->nested.nested_run_pending = 1; unmap_save: - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); unmap_map: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ff83b56fe2fa..259fe445e695 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -231,7 +231,7 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); vmx->nested.hv_evmcs = NULL; vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; @@ -318,9 +318,9 @@ static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); vmx->nested.pi_desc = NULL; } @@ -624,7 +624,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; - struct kvm_host_map msr_bitmap_map; + struct kvm_host_map map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || @@ -647,10 +647,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &msr_bitmap_map)) + if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) return false; - msr_bitmap_l1 = (unsigned long *)msr_bitmap_map.hva; + msr_bitmap_l1 = (unsigned long *)map.hva; /* * To keep the control flow simple, pay eight 8-byte writes (sixteen @@ -714,7 +714,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_FLUSH_CMD, MSR_TYPE_W); - kvm_vcpu_unmap(vcpu, &msr_bitmap_map, false); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b4c541fa5a1f..101dbf2be1ce 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -285,6 +285,7 @@ struct kvm_host_map { void *hva; kvm_pfn_t pfn; kvm_pfn_t gfn; + bool writable; }; /* @@ -1311,8 +1312,23 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); -int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); -void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); + +int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map, + bool writable); +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map); + +static inline int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, + struct kvm_host_map *map) +{ + return __kvm_vcpu_map(vcpu, gpa, map, true); +} + +static inline int kvm_vcpu_map_readonly(struct kvm_vcpu *vcpu, gpa_t gpa, + struct kvm_host_map *map) +{ + return __kvm_vcpu_map(vcpu, gpa, map, false); +} + unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0d59f47f099e..baa741c2b81c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3110,7 +3110,8 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_page); -int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) +int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, + bool writable) { struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(vcpu->kvm, gfn), @@ -3124,6 +3125,7 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) map->page = NULL; map->hva = NULL; map->gfn = gfn; + map->writable = writable; map->pfn = kvm_follow_pfn(&kfp); if (is_error_noslot_pfn(map->pfn)) @@ -3140,9 +3142,9 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) return map->hva ? 0 : -EFAULT; } -EXPORT_SYMBOL_GPL(kvm_vcpu_map); +EXPORT_SYMBOL_GPL(__kvm_vcpu_map); -void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) { if (!map->hva) return; @@ -3154,11 +3156,11 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) memunmap(map->hva); #endif - if (dirty) + if (map->writable) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); if (map->pinned_page) { - if (dirty) + if (map->writable) kvm_set_page_dirty(map->pinned_page); kvm_set_page_accessed(map->pinned_page); unpin_user_page(map->pinned_page); -- cgit v1.2.3 From 21dd877060d49f1f57901f929189653fc42ac37a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:44 -0700 Subject: KVM: Move declarations of memslot accessors up in kvm_host.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the memslot lookup helpers further up in kvm_host.h so that they can be used by inlined "to pfn" wrappers. No functional change intended. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-43-seanjc@google.com> --- include/linux/kvm_host.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 101dbf2be1ce..8d35a36e7707 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1168,6 +1168,10 @@ static inline bool kvm_memslot_iter_is_valid(struct kvm_memslot_iter *iter, gfn_ kvm_memslot_iter_is_valid(iter, end); \ kvm_memslot_iter_next(iter)) +struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); +struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); +struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); + /* * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: * - create a new memory slot @@ -1303,16 +1307,12 @@ int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, }) int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); -struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); -struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); -struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); - int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map, bool writable); void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map); -- cgit v1.2.3 From 1c7b627e930624dd64ee906df554c8f2bad628ff Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:45 -0700 Subject: KVM: Add kvm_faultin_pfn() to specifically service guest page faults MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new dedicated API, kvm_faultin_pfn(), for servicing guest page faults, i.e. for getting pages/pfns that will be mapped into the guest via an mmu_notifier-protected KVM MMU. Keep struct kvm_follow_pfn buried in internal code, as having __kvm_faultin_pfn() take "out" params is actually cleaner for several architectures, e.g. it allows the caller to have its own "page fault" structure without having to marshal data to/from kvm_follow_pfn. Long term, common KVM would ideally provide a kvm_page_fault structure, a la x86's struct of the same name. But all architectures need to be converted to a common API before that can happen. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-44-seanjc@google.com> --- include/linux/kvm_host.h | 12 ++++++++++++ virt/kvm/kvm_main.c | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8d35a36e7707..a63b0325d3e2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1231,6 +1231,18 @@ static inline void kvm_release_page_unused(struct page *page) void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); +kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, + unsigned int foll, bool *writable, + struct page **refcounted_page); + +static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, + bool write, bool *writable, + struct page **refcounted_page) +{ + return __kvm_faultin_pfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, + write ? FOLL_WRITE : 0, writable, refcounted_page); +} + kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, bool *writable); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index becf640e369c..f79745d6500c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3092,6 +3092,28 @@ kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_pfn); +kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, + unsigned int foll, bool *writable, + struct page **refcounted_page) +{ + struct kvm_follow_pfn kfp = { + .slot = slot, + .gfn = gfn, + .flags = foll, + .map_writable = writable, + .refcounted_page = refcounted_page, + }; + + if (WARN_ON_ONCE(!writable || !refcounted_page)) + return KVM_PFN_ERR_FAULT; + + *writable = false; + *refcounted_page = NULL; + + return kvm_follow_pfn(&kfp); +} +EXPORT_SYMBOL_GPL(__kvm_faultin_pfn); + int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages) { -- cgit v1.2.3 From 1fbee5b01a0fd27db571eed757682a7c20045107 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:48 -0700 Subject: KVM: guest_memfd: Provide "struct page" as output from kvm_gmem_get_pfn() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provide the "struct page" associated with a guest_memfd pfn as an output from __kvm_gmem_get_pfn() so that KVM guest page fault handlers can directly put the page instead of having to rely on kvm_pfn_to_refcounted_page(). Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-47-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/svm/sev.c | 10 ++++++---- include/linux/kvm_host.h | 6 ++++-- virt/kvm/guest_memfd.c | 8 ++++++-- 4 files changed, 17 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2bea2d20c571..c657c3c449c8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4407,7 +4407,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, } r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn, - &max_order); + &fault->refcounted_page, &max_order); if (r) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return r; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 4557ff3804ae..c6c852485900 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3849,6 +3849,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) { gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); struct kvm_memory_slot *slot; + struct page *page; kvm_pfn_t pfn; slot = gfn_to_memslot(vcpu->kvm, gfn); @@ -3859,7 +3860,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) * The new VMSA will be private memory guest memory, so * retrieve the PFN from the gmem backend. */ - if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL)) + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) return -EINVAL; /* @@ -3888,7 +3889,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) * changes then care should be taken to ensure * svm->sev_es.vmsa is pinned through some other means. */ - kvm_release_pfn_clean(pfn); + kvm_release_page_clean(page); } /* @@ -4688,6 +4689,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) struct kvm_memory_slot *slot; struct kvm *kvm = vcpu->kvm; int order, rmp_level, ret; + struct page *page; bool assigned; kvm_pfn_t pfn; gfn_t gfn; @@ -4714,7 +4716,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) return; } - ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order); + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order); if (ret) { pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", gpa); @@ -4772,7 +4774,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) out: trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); out_no_trace: - put_page(pfn_to_page(pfn)); + kvm_release_page_unused(page); } static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a63b0325d3e2..6efdc00b4254 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2487,11 +2487,13 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) #ifdef CONFIG_KVM_PRIVATE_MEM int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order); + gfn_t gfn, kvm_pfn_t *pfn, struct page **page, + int *max_order); #else static inline int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t *pfn, int *max_order) + kvm_pfn_t *pfn, struct page **page, + int *max_order) { KVM_BUG_ON(1, kvm); return -EIO; diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 8a878e57c5d4..47a9f68f7b24 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -594,7 +594,8 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file, } int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order) + gfn_t gfn, kvm_pfn_t *pfn, struct page **page, + int *max_order) { pgoff_t index = kvm_gmem_get_index(slot, gfn); struct file *file = kvm_gmem_get_file(slot); @@ -615,7 +616,10 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); folio_unlock(folio); - if (r < 0) + + if (!r) + *page = folio_file_page(folio, index); + else folio_put(folio); out: -- cgit v1.2.3 From dc06193532af4ba88ed20daeef88f22b053ebb91 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:51 -0700 Subject: KVM: Move x86's API to release a faultin page to common KVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move KVM x86's helper that "finishes" the faultin process to common KVM so that the logic can be shared across all architectures. Note, not all architectures implement a fast page fault path, but the gist of the comment applies to all architectures. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-50-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 24 ++---------------------- include/linux/kvm_host.h | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f3a4ed6afec2..0aae8e63566c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4377,28 +4377,8 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int r) { - lockdep_assert_once(lockdep_is_held(&vcpu->kvm->mmu_lock) || - r == RET_PF_RETRY); - - if (!fault->refcounted_page) - return; - - /* - * If the page that KVM got from the *primary MMU* is writable, and KVM - * installed or reused a SPTE, mark the page/folio dirty. Note, this - * may mark a folio dirty even if KVM created a read-only SPTE, e.g. if - * the GFN is write-protected. Folios can't be safely marked dirty - * outside of mmu_lock as doing so could race with writeback on the - * folio. As a result, KVM can't mark folios dirty in the fast page - * fault handler, and so KVM must (somewhat) speculatively mark the - * folio dirty if KVM could locklessly make the SPTE writable. - */ - if (r == RET_PF_RETRY) - kvm_release_page_unused(fault->refcounted_page); - else if (!fault->map_writable) - kvm_release_page_clean(fault->refcounted_page); - else - kvm_release_page_dirty(fault->refcounted_page); + kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page, + r == RET_PF_RETRY, fault->map_writable); } static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6efdc00b4254..3e06393e5f1e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1231,6 +1231,32 @@ static inline void kvm_release_page_unused(struct page *page) void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); +static inline void kvm_release_faultin_page(struct kvm *kvm, struct page *page, + bool unused, bool dirty) +{ + lockdep_assert_once(lockdep_is_held(&kvm->mmu_lock) || unused); + + if (!page) + return; + + /* + * If the page that KVM got from the *primary MMU* is writable, and KVM + * installed or reused a SPTE, mark the page/folio dirty. Note, this + * may mark a folio dirty even if KVM created a read-only SPTE, e.g. if + * the GFN is write-protected. Folios can't be safely marked dirty + * outside of mmu_lock as doing so could race with writeback on the + * folio. As a result, KVM can't mark folios dirty in the fast page + * fault handler, and so KVM must (somewhat) speculatively mark the + * folio dirty if KVM could locklessly make the SPTE writable. + */ + if (unused) + kvm_release_page_unused(page); + else if (dirty) + kvm_release_page_dirty(page); + else + kvm_release_page_clean(page); +} + kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, unsigned int foll, bool *writable, struct page **refcounted_page); -- cgit v1.2.3 From f42e289a2095f61755e6ca5fd1370d441bf589d5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:18 -0700 Subject: KVM: Add support for read-only usage of gfn_to_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework gfn_to_page() to support read-only accesses so that it can be used by arm64 to get MTE tags out of guest memory. Opportunistically rewrite the comment to be even more stern about using gfn_to_page(), as there are very few scenarios where requiring a struct page is actually the right thing to do (though there are such scenarios). Add a FIXME to call out that KVM probably should be pinning pages, not just getting pages. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-77-seanjc@google.com> --- include/linux/kvm_host.h | 7 ++++++- virt/kvm/kvm_main.c | 15 ++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3e06393e5f1e..96cf9e5660c3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1213,7 +1213,12 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, struct page **pages, int nr_pages); -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); +struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write); +static inline struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +{ + return __gfn_to_page(kvm, gfn, true); +} + unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable); unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5a424598610f..1c11a05a97af 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3132,25 +3132,26 @@ int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn, EXPORT_SYMBOL_GPL(kvm_prefetch_pages); /* - * Do not use this helper unless you are absolutely certain the gfn _must_ be - * backed by 'struct page'. A valid example is if the backing memslot is - * controlled by KVM. Note, if the returned page is valid, it's refcount has - * been elevated by gfn_to_pfn(). + * Don't use this API unless you are absolutely, positively certain that KVM + * needs to get a struct page, e.g. to pin the page for firmware DMA. + * + * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate + * its refcount. */ -struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) +struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write) { struct page *refcounted_page = NULL; struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(kvm, gfn), .gfn = gfn, - .flags = FOLL_WRITE, + .flags = write ? FOLL_WRITE : 0, .refcounted_page = &refcounted_page, }; (void)kvm_follow_pfn(&kfp); return refcounted_page; } -EXPORT_SYMBOL_GPL(gfn_to_page); +EXPORT_SYMBOL_GPL(__gfn_to_page); int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, bool writable) -- cgit v1.2.3 From 06cdaff80e50e3fb74e5e3101e1d5d7aa8b68da6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:21 -0700 Subject: KVM: Drop gfn_to_pfn() APIs now that all users are gone MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drop gfn_to_pfn() and all its variants now that all users are gone. No functional change intended. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-80-seanjc@google.com> --- include/linux/kvm_host.h | 8 -------- virt/kvm/kvm_main.c | 53 ------------------------------------------------ 2 files changed, 61 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 96cf9e5660c3..4a1eaa40a215 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1274,14 +1274,6 @@ static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, write ? FOLL_WRITE : 0, writable, refcounted_page); } -kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); -kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, - bool *writable); -kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn); -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool interruptible, bool no_wait, - bool write_fault, bool *writable); - void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 1c11a05a97af..b2c8d429442d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3039,59 +3039,6 @@ static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp) return hva_to_pfn(kfp); } -kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn, - bool interruptible, bool no_wait, - bool write_fault, bool *writable) -{ - struct kvm_follow_pfn kfp = { - .slot = slot, - .gfn = gfn, - .map_writable = writable, - }; - - if (write_fault) - kfp.flags |= FOLL_WRITE; - if (no_wait) - kfp.flags |= FOLL_NOWAIT; - if (interruptible) - kfp.flags |= FOLL_INTERRUPTIBLE; - - return kvm_follow_pfn(&kfp); -} -EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); - -kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, - bool *writable) -{ - struct kvm_follow_pfn kfp = { - .slot = gfn_to_memslot(kvm, gfn), - .gfn = gfn, - .flags = write_fault ? FOLL_WRITE : 0, - .map_writable = writable, - }; - - return kvm_follow_pfn(&kfp); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); - -kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn) -{ - struct kvm_follow_pfn kfp = { - .slot = slot, - .gfn = gfn, - .flags = FOLL_WRITE, - }; - - return kvm_follow_pfn(&kfp); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); - -kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) -{ - return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); -} -EXPORT_SYMBOL_GPL(gfn_to_pfn); - kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, unsigned int foll, bool *writable, struct page **refcounted_page) -- cgit v1.2.3 From 93b7da404f5b0b02a4211bbb784889f001d27953 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:26 -0700 Subject: KVM: Drop APIs that manipulate "struct page" via pfns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove all kvm_{release,set}_pfn_*() APIs now that all users are gone. No functional change intended. Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-85-seanjc@google.com> --- include/linux/kvm_host.h | 5 ----- virt/kvm/kvm_main.c | 55 ------------------------------------------------ 2 files changed, 60 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4a1eaa40a215..d045f8310a48 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1274,11 +1274,6 @@ static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, write ? FOLL_WRITE : 0, writable, refcounted_page); } -void kvm_release_pfn_clean(kvm_pfn_t pfn); -void kvm_release_pfn_dirty(kvm_pfn_t pfn); -void kvm_set_pfn_dirty(kvm_pfn_t pfn); -void kvm_set_pfn_accessed(kvm_pfn_t pfn); - int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a483da96f4be..396ca14f18f3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3164,61 +3164,6 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); -void kvm_release_pfn_clean(kvm_pfn_t pfn) -{ - struct page *page; - - if (is_error_noslot_pfn(pfn)) - return; - - page = kvm_pfn_to_refcounted_page(pfn); - if (!page) - return; - - kvm_release_page_clean(page); -} -EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); - -void kvm_release_pfn_dirty(kvm_pfn_t pfn) -{ - struct page *page; - - if (is_error_noslot_pfn(pfn)) - return; - - page = kvm_pfn_to_refcounted_page(pfn); - if (!page) - return; - - kvm_release_page_dirty(page); -} -EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); - -/* - * Note, checking for an error/noslot pfn is the caller's responsibility when - * directly marking a page dirty/accessed. Unlike the "release" helpers, the - * "set" helpers are not to be used when the pfn might point at garbage. - */ -void kvm_set_pfn_dirty(kvm_pfn_t pfn) -{ - if (WARN_ON(is_error_noslot_pfn(pfn))) - return; - - if (pfn_valid(pfn)) - kvm_set_page_dirty(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); - -void kvm_set_pfn_accessed(kvm_pfn_t pfn) -{ - if (WARN_ON(is_error_noslot_pfn(pfn))) - return; - - if (pfn_valid(pfn)) - kvm_set_page_accessed(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); - static int next_segment(unsigned long len, int offset) { if (len > PAGE_SIZE - offset) -- cgit v1.2.3 From 8b15c3764c05ed8766709711d2054d96349dee8e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:27 -0700 Subject: KVM: Don't grab reference on VM_MIXEDMAP pfns that have a "struct page" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that KVM no longer relies on an ugly heuristic to find its struct page references, i.e. now that KVM can't get false positives on VM_MIXEDMAP pfns, remove KVM's hack to elevate the refcount for pfns that happen to have a valid struct page. In addition to removing a long-standing wart in KVM, this allows KVM to map non-refcounted struct page memory into the guest, e.g. for exposing GPU TTM buffers to KVM guests. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-86-seanjc@google.com> --- include/linux/kvm_host.h | 3 -- virt/kvm/kvm_main.c | 75 ++---------------------------------------------- 2 files changed, 2 insertions(+), 76 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d045f8310a48..02f0206fd2dc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1730,9 +1730,6 @@ void kvm_arch_sync_events(struct kvm *kvm); int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); -struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn); -bool kvm_is_zone_device_page(struct page *page); - struct kvm_irq_ack_notifier { struct hlist_node link; unsigned gsi; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 396ca14f18f3..b1b10dc408a0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -160,52 +160,6 @@ __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { } -bool kvm_is_zone_device_page(struct page *page) -{ - /* - * The metadata used by is_zone_device_page() to determine whether or - * not a page is ZONE_DEVICE is guaranteed to be valid if and only if - * the device has been pinned, e.g. by get_user_pages(). WARN if the - * page_count() is zero to help detect bad usage of this helper. - */ - if (WARN_ON_ONCE(!page_count(page))) - return false; - - return is_zone_device_page(page); -} - -/* - * Returns a 'struct page' if the pfn is "valid" and backed by a refcounted - * page, NULL otherwise. Note, the list of refcounted PG_reserved page types - * is likely incomplete, it has been compiled purely through people wanting to - * back guest with a certain type of memory and encountering issues. - */ -struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn) -{ - struct page *page; - - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); - if (!PageReserved(page)) - return page; - - /* The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. */ - if (is_zero_pfn(pfn)) - return page; - - /* - * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting - * perspective they are "normal" pages, albeit with slightly different - * usage rules. - */ - if (kvm_is_zone_device_page(page)) - return page; - - return NULL; -} - /* * Switches to specified vcpu, until a matching vcpu_put() */ @@ -2804,35 +2758,10 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, if (kfp->map_writable) *kfp->map_writable = writable; - /* - * FIXME: Remove this once KVM no longer blindly calls put_page() on - * every pfn that points at a struct page. - * - * Get a reference for follow_pte() pfns if they happen to point at a - * struct page, as KVM will ultimately call kvm_release_pfn_clean() on - * the returned pfn, i.e. KVM expects to have a reference. - * - * Certain IO or PFNMAP mappings can be backed with valid struct pages, - * but be allocated without refcounting, e.g. tail pages of - * non-compound higher order allocations. Grabbing and putting a - * reference to such pages would cause KVM to prematurely free a page - * it doesn't own (KVM gets and puts the one and only reference). - * Don't allow those pages until the FIXME is resolved. - * - * Don't grab a reference for pins, callers that pin pages are required - * to check refcounted_page, i.e. must not blindly release the pfn. - */ - if (map) { + if (map) pfn = map->pfn; - - if (!kfp->pin) { - page = kvm_pfn_to_refcounted_page(pfn); - if (page && !get_page_unless_zero(page)) - return KVM_PFN_ERR_FAULT; - } - } else { + else pfn = page_to_pfn(page); - } *kfp->refcounted_page = page; -- cgit v1.2.3 From 3e7f43188ee227bcf0f07f60a00f1fd1aca10e6a Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Aug 2024 13:01:36 -0700 Subject: KVM: Protect vCPU's "last run PID" with rwlock, not RCU To avoid jitter on KVM_RUN due to synchronize_rcu(), use a rwlock instead of RCU to protect vcpu->pid, a.k.a. the pid of the task last used to a vCPU. When userspace is doing M:N scheduling of tasks to vCPUs, e.g. to run SEV migration helper vCPUs during post-copy, the synchronize_rcu() needed to change the PID associated with the vCPU can stall for hundreds of milliseconds, which is problematic for latency sensitive post-copy operations. In the directed yield path, do not acquire the lock if it's contended, i.e. if the associated PID is changing, as that means the vCPU's task is already running. Reported-by: Steve Rutherford Reviewed-by: Steve Rutherford Acked-by: Oliver Upton Link: https://lore.kernel.org/r/20240802200136.329973-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/include/asm/kvm_host.h | 2 +- include/linux/kvm_host.h | 3 ++- virt/kvm/kvm_main.c | 39 +++++++++++++++++++++++++-------------- 3 files changed, 28 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index bf64fed9820e..04febe60c88e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1140,7 +1140,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, void kvm_arm_halt_guest(struct kvm *kvm); void kvm_arm_resume_guest(struct kvm *kvm); -#define vcpu_has_run_once(vcpu) !!rcu_access_pointer((vcpu)->pid) +#define vcpu_has_run_once(vcpu) (!!READ_ONCE((vcpu)->pid)) #ifndef __KVM_NVHE_HYPERVISOR__ #define kvm_call_hyp_nvhe(f, ...) \ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 02f0206fd2dc..18a1672ffcbf 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -334,7 +334,8 @@ struct kvm_vcpu { #ifndef __KVM_HAVE_ARCH_WQP struct rcuwait wait; #endif - struct pid __rcu *pid; + struct pid *pid; + rwlock_t pid_lock; int sigset_active; sigset_t sigset; unsigned int halt_poll_ns; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 17048d9575e3..c1876b9ed213 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -447,6 +447,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->kvm = kvm; vcpu->vcpu_id = id; vcpu->pid = NULL; + rwlock_init(&vcpu->pid_lock); #ifndef __KVM_HAVE_ARCH_WQP rcuwait_init(&vcpu->wait); #endif @@ -474,7 +475,7 @@ static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) * the vcpu->pid pointer, and at destruction time all file descriptors * are already gone. */ - put_pid(rcu_dereference_protected(vcpu->pid, 1)); + put_pid(vcpu->pid); free_page((unsigned long)vcpu->run); kmem_cache_free(kvm_vcpu_cache, vcpu); @@ -3770,15 +3771,17 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_kick); int kvm_vcpu_yield_to(struct kvm_vcpu *target) { - struct pid *pid; struct task_struct *task = NULL; int ret; - rcu_read_lock(); - pid = rcu_dereference(target->pid); - if (pid) - task = get_pid_task(pid, PIDTYPE_PID); - rcu_read_unlock(); + if (!read_trylock(&target->pid_lock)) + return 0; + + if (target->pid) + task = get_pid_task(target->pid, PIDTYPE_PID); + + read_unlock(&target->pid_lock); + if (!task) return 0; ret = yield_to(task, 1); @@ -4030,9 +4033,9 @@ static int vcpu_get_pid(void *data, u64 *val) { struct kvm_vcpu *vcpu = data; - rcu_read_lock(); - *val = pid_nr(rcu_dereference(vcpu->pid)); - rcu_read_unlock(); + read_lock(&vcpu->pid_lock); + *val = pid_nr(vcpu->pid); + read_unlock(&vcpu->pid_lock); return 0; } @@ -4318,7 +4321,14 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -EINVAL; if (arg) goto out; - oldpid = rcu_access_pointer(vcpu->pid); + + /* + * Note, vcpu->pid is primarily protected by vcpu->mutex. The + * dedicated r/w lock allows other tasks, e.g. other vCPUs, to + * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield + * directly to this vCPU + */ + oldpid = vcpu->pid; if (unlikely(oldpid != task_pid(current))) { /* The thread running this VCPU changed. */ struct pid *newpid; @@ -4328,9 +4338,10 @@ static long kvm_vcpu_ioctl(struct file *filp, break; newpid = get_task_pid(current, PIDTYPE_PID); - rcu_assign_pointer(vcpu->pid, newpid); - if (oldpid) - synchronize_rcu(); + write_lock(&vcpu->pid_lock); + vcpu->pid = newpid; + write_unlock(&vcpu->pid_lock); + put_pid(oldpid); } vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe); -- cgit v1.2.3 From 948ccbd95016f50ce01df5eef9440eede3b8c713 Mon Sep 17 00:00:00 2001 From: Xianglai Li Date: Wed, 13 Nov 2024 16:18:26 +0800 Subject: LoongArch: KVM: Add iocsr and mmio bus simulation in kernel Add iocsr and mmio memory read and write simulation to the kernel. When the VM accesses the device address space through iocsr instructions or mmio, it does not need to return to the qemu user mode but can directly completes the access in the kernel mode. Signed-off-by: Tianrui Zhao Signed-off-by: Xianglai Li Signed-off-by: Huacai Chen --- arch/loongarch/kvm/exit.c | 82 ++++++++++++++++++++++++++++++++-------------- include/linux/kvm_host.h | 1 + include/trace/events/kvm.h | 35 ++++++++++++++++++++ 3 files changed, 94 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c index 90894f70ff4a..69f3e3782cc9 100644 --- a/arch/loongarch/kvm/exit.c +++ b/arch/loongarch/kvm/exit.c @@ -157,7 +157,7 @@ static int kvm_handle_csr(struct kvm_vcpu *vcpu, larch_inst inst) int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu) { int ret; - unsigned long val; + unsigned long *val; u32 addr, rd, rj, opcode; /* @@ -170,6 +170,7 @@ int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu) ret = EMULATE_DO_IOCSR; run->iocsr_io.phys_addr = addr; run->iocsr_io.is_write = 0; + val = &vcpu->arch.gprs[rd]; /* LoongArch is Little endian */ switch (opcode) { @@ -202,16 +203,25 @@ int kvm_emu_iocsr(larch_inst inst, struct kvm_run *run, struct kvm_vcpu *vcpu) run->iocsr_io.is_write = 1; break; default: - ret = EMULATE_FAIL; - break; + return EMULATE_FAIL; } - if (ret == EMULATE_DO_IOCSR) { - if (run->iocsr_io.is_write) { - val = vcpu->arch.gprs[rd]; - memcpy(run->iocsr_io.data, &val, run->iocsr_io.len); - } - vcpu->arch.io_gpr = rd; + if (run->iocsr_io.is_write) { + if (!kvm_io_bus_write(vcpu, KVM_IOCSR_BUS, addr, run->iocsr_io.len, val)) + ret = EMULATE_DONE; + else + /* Save data and let user space to write it */ + memcpy(run->iocsr_io.data, val, run->iocsr_io.len); + + trace_kvm_iocsr(KVM_TRACE_IOCSR_WRITE, run->iocsr_io.len, addr, val); + } else { + if (!kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, run->iocsr_io.len, val)) + ret = EMULATE_DONE; + else + /* Save register id for iocsr read completion */ + vcpu->arch.io_gpr = rd; + + trace_kvm_iocsr(KVM_TRACE_IOCSR_READ, run->iocsr_io.len, addr, NULL); } return ret; @@ -447,19 +457,33 @@ int kvm_emu_mmio_read(struct kvm_vcpu *vcpu, larch_inst inst) } if (ret == EMULATE_DO_MMIO) { + trace_kvm_mmio(KVM_TRACE_MMIO_READ, run->mmio.len, run->mmio.phys_addr, NULL); + + /* + * If mmio device such as PCH-PIC is emulated in KVM, + * it need not return to user space to handle the mmio + * exception. + */ + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, vcpu->arch.badv, + run->mmio.len, &vcpu->arch.gprs[rd]); + if (!ret) { + update_pc(&vcpu->arch); + vcpu->mmio_needed = 0; + return EMULATE_DONE; + } + /* Set for kvm_complete_mmio_read() use */ vcpu->arch.io_gpr = rd; run->mmio.is_write = 0; vcpu->mmio_is_write = 0; - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, run->mmio.len, - run->mmio.phys_addr, NULL); - } else { - kvm_err("Read not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", - inst.word, vcpu->arch.pc, vcpu->arch.badv); - kvm_arch_vcpu_dump_regs(vcpu); - vcpu->mmio_needed = 0; + return EMULATE_DO_MMIO; } + kvm_err("Read not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", + inst.word, vcpu->arch.pc, vcpu->arch.badv); + kvm_arch_vcpu_dump_regs(vcpu); + vcpu->mmio_needed = 0; + return ret; } @@ -600,19 +624,29 @@ int kvm_emu_mmio_write(struct kvm_vcpu *vcpu, larch_inst inst) } if (ret == EMULATE_DO_MMIO) { + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, run->mmio.len, run->mmio.phys_addr, data); + + /* + * If mmio device such as PCH-PIC is emulated in KVM, + * it need not return to user space to handle the mmio + * exception. + */ + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, vcpu->arch.badv, run->mmio.len, data); + if (!ret) + return EMULATE_DONE; + run->mmio.is_write = 1; vcpu->mmio_needed = 1; vcpu->mmio_is_write = 1; - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, run->mmio.len, - run->mmio.phys_addr, data); - } else { - vcpu->arch.pc = curr_pc; - kvm_err("Write not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", - inst.word, vcpu->arch.pc, vcpu->arch.badv); - kvm_arch_vcpu_dump_regs(vcpu); - /* Rollback PC if emulation was unsuccessful */ + return EMULATE_DO_MMIO; } + vcpu->arch.pc = curr_pc; + kvm_err("Write not supported Inst=0x%08x @%lx BadVaddr:%#lx\n", + inst.word, vcpu->arch.pc, vcpu->arch.badv); + kvm_arch_vcpu_dump_regs(vcpu); + /* Rollback PC if emulation was unsuccessful */ + return ret; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 45be36e5285f..164d9725cb08 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -219,6 +219,7 @@ enum kvm_bus { KVM_PIO_BUS, KVM_VIRTIO_CCW_NOTIFY_BUS, KVM_FAST_MMIO_BUS, + KVM_IOCSR_BUS, KVM_NR_BUSES }; diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index 74e40d5d4af4..fc7d0f8ff078 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -236,6 +236,41 @@ TRACE_EVENT(kvm_mmio, __entry->len, __entry->gpa, __entry->val) ); +#define KVM_TRACE_IOCSR_READ_UNSATISFIED 0 +#define KVM_TRACE_IOCSR_READ 1 +#define KVM_TRACE_IOCSR_WRITE 2 + +#define kvm_trace_symbol_iocsr \ + { KVM_TRACE_IOCSR_READ_UNSATISFIED, "unsatisfied-read" }, \ + { KVM_TRACE_IOCSR_READ, "read" }, \ + { KVM_TRACE_IOCSR_WRITE, "write" } + +TRACE_EVENT(kvm_iocsr, + TP_PROTO(int type, int len, u64 gpa, void *val), + TP_ARGS(type, len, gpa, val), + + TP_STRUCT__entry( + __field( u32, type ) + __field( u32, len ) + __field( u64, gpa ) + __field( u64, val ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->len = len; + __entry->gpa = gpa; + __entry->val = 0; + if (val) + memcpy(&__entry->val, val, + min_t(u32, sizeof(__entry->val), len)); + ), + + TP_printk("iocsr %s len %u gpa 0x%llx val 0x%llx", + __print_symbolic(__entry->type, kvm_trace_symbol_iocsr), + __entry->len, __entry->gpa, __entry->val) +); + #define kvm_fpu_load_symbol \ {0, "unload"}, \ {1, "load"} -- cgit v1.2.3 From d96c77bd4eeba469bddbbb14323d2191684da82a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 8 Nov 2024 04:56:31 -0500 Subject: KVM: x86: switch hugepage recovery thread to vhost_task kvm_vm_create_worker_thread() is meant to be used for kthreads that can consume significant amounts of CPU time on behalf of a VM or in response to how the VM behaves (for example how it accesses its memory). Therefore it wants to charge the CPU time consumed by that work to the VM's container. However, because of these threads, cgroups which have kvm instances inside never complete freezing. This can be trivially reproduced: root@test ~# mkdir /sys/fs/cgroup/test root@test ~# echo $$ > /sys/fs/cgroup/test/cgroup.procs root@test ~# qemu-system-x86_64 -nographic -enable-kvm and in another terminal: root@test ~# echo 1 > /sys/fs/cgroup/test/cgroup.freeze root@test ~# cat /sys/fs/cgroup/test/cgroup.events populated 1 frozen 0 The cgroup freezing happens in the signal delivery path but kvm_nx_huge_page_recovery_worker, while joining non-root cgroups, never calls into the signal delivery path and thus never gets frozen. Because the cgroup freezer determines whether a given cgroup is frozen by comparing the number of frozen threads to the total number of threads in the cgroup, the cgroup never becomes frozen and users waiting for the state transition may hang indefinitely. Since the worker kthread is tied to a user process, it's better if it behaves similarly to user tasks as much as possible, including being able to send SIGSTOP and SIGCONT. In fact, vhost_task is all that kvm_vm_create_worker_thread() wanted to be and more: not only it inherits the userspace process's cgroups, it has other niceties like being parented properly in the process tree. Use it instead of the homegrown alternative. Incidentally, the new code is also better behaved when you flip recovery back and forth to disabled and back to enabled. If your recovery period is 1 minute, it will run the next recovery after 1 minute independent of how many times you flipped the parameter. (Commit message based on emails from Tejun). Reported-by: Tejun Heo Reported-by: Luca Boccassi Acked-by: Tejun Heo Tested-by: Luca Boccassi Cc: stable@vger.kernel.org Reviewed-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 +- arch/x86/kvm/Kconfig | 1 + arch/x86/kvm/mmu/mmu.c | 68 ++++++++++++-------------- include/linux/kvm_host.h | 6 --- virt/kvm/kvm_main.c | 103 ---------------------------------------- 5 files changed, 35 insertions(+), 147 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3e8afc82ae2f..e159e44a6a1b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -1442,7 +1443,8 @@ struct kvm_arch { bool sgx_provisioning_allowed; struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter; - struct task_struct *nx_huge_page_recovery_thread; + struct vhost_task *nx_huge_page_recovery_thread; + u64 nx_huge_page_last; #ifdef CONFIG_X86_64 /* The number of TDP MMU pages across all roots. */ diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 1ed1e4f5d51c..d93af5390341 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -30,6 +30,7 @@ config KVM_X86 select HAVE_KVM_IRQ_BYPASS select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_READONLY_MEM + select VHOST_TASK select KVM_ASYNC_PF select USER_RETURN_NOTIFIER select KVM_MMIO diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d7b391fe2c23..22e7ad235123 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7162,7 +7162,7 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) kvm_mmu_zap_all_fast(kvm); mutex_unlock(&kvm->slots_lock); - wake_up_process(kvm->arch.nx_huge_page_recovery_thread); + vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); } mutex_unlock(&kvm_lock); } @@ -7291,7 +7291,7 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) - wake_up_process(kvm->arch.nx_huge_page_recovery_thread); + vhost_task_wake(kvm->arch.nx_huge_page_recovery_thread); mutex_unlock(&kvm_lock); } @@ -7394,62 +7394,56 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, rcu_idx); } -static long get_nx_huge_page_recovery_timeout(u64 start_time) +static void kvm_nx_huge_page_recovery_worker_kill(void *data) { - bool enabled; - uint period; - - enabled = calc_nx_huge_pages_recovery_period(&period); - - return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64() - : MAX_SCHEDULE_TIMEOUT; } -static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data) +static bool kvm_nx_huge_page_recovery_worker(void *data) { - u64 start_time; + struct kvm *kvm = data; + bool enabled; + uint period; long remaining_time; - while (true) { - start_time = get_jiffies_64(); - remaining_time = get_nx_huge_page_recovery_timeout(start_time); - - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop() && remaining_time > 0) { - schedule_timeout(remaining_time); - remaining_time = get_nx_huge_page_recovery_timeout(start_time); - set_current_state(TASK_INTERRUPTIBLE); - } - - set_current_state(TASK_RUNNING); - - if (kthread_should_stop()) - return 0; + enabled = calc_nx_huge_pages_recovery_period(&period); + if (!enabled) + return false; - kvm_recover_nx_huge_pages(kvm); + remaining_time = kvm->arch.nx_huge_page_last + msecs_to_jiffies(period) + - get_jiffies_64(); + if (remaining_time > 0) { + schedule_timeout(remaining_time); + /* check for signals and come back */ + return true; } + + __set_current_state(TASK_RUNNING); + kvm_recover_nx_huge_pages(kvm); + kvm->arch.nx_huge_page_last = get_jiffies_64(); + return true; } int kvm_mmu_post_init_vm(struct kvm *kvm) { - int err; - if (nx_hugepage_mitigation_hard_disabled) return 0; - err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, - "kvm-nx-lpage-recovery", - &kvm->arch.nx_huge_page_recovery_thread); - if (!err) - kthread_unpark(kvm->arch.nx_huge_page_recovery_thread); + kvm->arch.nx_huge_page_last = get_jiffies_64(); + kvm->arch.nx_huge_page_recovery_thread = vhost_task_create( + kvm_nx_huge_page_recovery_worker, kvm_nx_huge_page_recovery_worker_kill, + kvm, "kvm-nx-lpage-recovery"); - return err; + if (!kvm->arch.nx_huge_page_recovery_thread) + return -ENOMEM; + + vhost_task_start(kvm->arch.nx_huge_page_recovery_thread); + return 0; } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) { if (kvm->arch.nx_huge_page_recovery_thread) - kthread_stop(kvm->arch.nx_huge_page_recovery_thread); + vhost_task_stop(kvm->arch.nx_huge_page_recovery_thread); } #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 03e4d26e3bcc..401439bb21e3 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2425,12 +2425,6 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */ -typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data); - -int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, - uintptr_t data, const char *name, - struct task_struct **thread_ptr); - #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 27186b06518a..de2c11dae231 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6426,106 +6426,3 @@ void kvm_exit(void) kvm_irqfd_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); - -struct kvm_vm_worker_thread_context { - struct kvm *kvm; - struct task_struct *parent; - struct completion init_done; - kvm_vm_thread_fn_t thread_fn; - uintptr_t data; - int err; -}; - -static int kvm_vm_worker_thread(void *context) -{ - /* - * The init_context is allocated on the stack of the parent thread, so - * we have to locally copy anything that is needed beyond initialization - */ - struct kvm_vm_worker_thread_context *init_context = context; - struct task_struct *parent; - struct kvm *kvm = init_context->kvm; - kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; - uintptr_t data = init_context->data; - int err; - - err = kthread_park(current); - /* kthread_park(current) is never supposed to return an error */ - WARN_ON(err != 0); - if (err) - goto init_complete; - - err = cgroup_attach_task_all(init_context->parent, current); - if (err) { - kvm_err("%s: cgroup_attach_task_all failed with err %d\n", - __func__, err); - goto init_complete; - } - - set_user_nice(current, task_nice(init_context->parent)); - -init_complete: - init_context->err = err; - complete(&init_context->init_done); - init_context = NULL; - - if (err) - goto out; - - /* Wait to be woken up by the spawner before proceeding. */ - kthread_parkme(); - - if (!kthread_should_stop()) - err = thread_fn(kvm, data); - -out: - /* - * Move kthread back to its original cgroup to prevent it lingering in - * the cgroup of the VM process, after the latter finishes its - * execution. - * - * kthread_stop() waits on the 'exited' completion condition which is - * set in exit_mm(), via mm_release(), in do_exit(). However, the - * kthread is removed from the cgroup in the cgroup_exit() which is - * called after the exit_mm(). This causes the kthread_stop() to return - * before the kthread actually quits the cgroup. - */ - rcu_read_lock(); - parent = rcu_dereference(current->real_parent); - get_task_struct(parent); - rcu_read_unlock(); - cgroup_attach_task_all(parent, current); - put_task_struct(parent); - - return err; -} - -int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, - uintptr_t data, const char *name, - struct task_struct **thread_ptr) -{ - struct kvm_vm_worker_thread_context init_context = {}; - struct task_struct *thread; - - *thread_ptr = NULL; - init_context.kvm = kvm; - init_context.parent = current; - init_context.thread_fn = thread_fn; - init_context.data = data; - init_completion(&init_context.init_done); - - thread = kthread_run(kvm_vm_worker_thread, &init_context, - "%s-%d", name, task_pid_nr(current)); - if (IS_ERR(thread)) - return PTR_ERR(thread); - - /* kthread_run is never supposed to return NULL */ - WARN_ON(thread == NULL); - - wait_for_completion(&init_context.init_done); - - if (!init_context.err) - *thread_ptr = thread; - - return init_context.err; -} -- cgit v1.2.3