From 9259607ec7100118cc5c608d97c9d406501e861e Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Fri, 5 Sep 2025 11:11:39 +0200 Subject: KVM: Explicitly allocate/setup irqfd cleanup as per-CPU workqueue Explicitly request the use of per-CPU queues for the irqfd cleanup workqueue in preparation for changing the default behavior of alloc_workqueue() from per-CPU to unbound, which will in turn allow for the removal of WQ_UNBOUND. See commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") for details. No functional change intended. Suggested-by: Tejun Heo Signed-off-by: Marco Crivellari Link: https://lore.kernel.org/r/20250905091139.110677-2-marco.crivellari@suse.com [sean: rewrite changelog to tailor it to the KVM] Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index a7794ffdb976..0e8b5277be3b 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -707,7 +707,7 @@ bool kvm_notify_irqfd_resampler(struct kvm *kvm, */ int kvm_irqfd_init(void) { - irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0); + irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", WQ_PERCPU, 0); if (!irqfd_cleanup_wq) return -ENOMEM; -- cgit v1.2.3 From 049e560d4f47960c31c06f3de7712e4d2c5d16a2 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Sun, 12 Oct 2025 07:16:06 +0000 Subject: KVM: guest_memfd: move kvm_gmem_get_index() and use in kvm_gmem_prepare_folio() Move kvm_gmem_get_index() to the top of the file so that it can be used in kvm_gmem_prepare_folio() to replace the open-coded calculation. No functional change intended. Reviewed-by: David Hildenbrand Signed-off-by: Shivank Garg Link: https://lore.kernel.org/r/20251012071607.17646-1-shivankg@amd.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'virt') diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index fbca8c0972da..22dacf49a04d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -25,6 +25,11 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index) return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1)); } +static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return gfn - slot->base_gfn + slot->gmem.pgoff; +} + static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, pgoff_t index, struct folio *folio) { @@ -78,7 +83,7 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, * checked when creating memslots. */ WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio))); - index = gfn - slot->base_gfn + slot->gmem.pgoff; + index = kvm_gmem_get_index(slot, gfn); index = ALIGN_DOWN(index, 1 << folio_order(folio)); r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); if (!r) @@ -335,11 +340,6 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) return get_file_active(&slot->gmem.file); } -static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) -{ - return gfn - slot->base_gfn + slot->gmem.pgoff; -} - static bool kvm_gmem_supports_mmap(struct inode *inode) { const u64 flags = (u64)inode->i_private; -- cgit v1.2.3 From 3f1078a445d9038532a572ff643a826ed9335259 Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Sun, 12 Oct 2025 07:16:07 +0000 Subject: KVM: guest_memfd: remove redundant gmem variable initialization Remove redundant initialization of gmem in __kvm_gmem_get_pfn() as it is already initialized at the top of the function. No functional change intended. Reviewed-by: David Hildenbrand Signed-off-by: Shivank Garg Link: https://lore.kernel.org/r/20251012071607.17646-2-shivankg@amd.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 22dacf49a04d..caa87efc8f7a 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -668,7 +668,6 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file, return ERR_PTR(-EFAULT); } - gmem = file->private_data; if (xa_load(&gmem->bindings, index) != slot) { WARN_ON_ONCE(xa_load(&gmem->bindings, index)); return ERR_PTR(-EIO); -- cgit v1.2.3 From 765fcd7c0753cc62d0a839cbd8355cfaf57a7eb6 Mon Sep 17 00:00:00 2001 From: Pedro Demarchi Gomes Date: Sat, 4 Oct 2025 00:02:10 -0300 Subject: KVM: guest_memfd: use folio_nr_pages() instead of shift operation folio_nr_pages() is a faster helper function to get the number of pages when NR_PAGES_IN_LARGE_FOLIO is enabled. Signed-off-by: Pedro Demarchi Gomes Link: https://lore.kernel.org/r/20251004030210.49080-1-pedrodemargomes@gmail.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index caa87efc8f7a..9017e4d77c53 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -82,9 +82,9 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, * The order will be passed when creating the guest_memfd, and * checked when creating memslots. */ - WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio))); + WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio))); index = kvm_gmem_get_index(slot, gfn); - index = ALIGN_DOWN(index, 1 << folio_order(folio)); + index = ALIGN_DOWN(index, folio_nr_pages(folio)); r = __kvm_gmem_prepare_folio(kvm, slot, index, folio); if (!r) kvm_gmem_mark_prepared(folio); -- cgit v1.2.3 From 5f3e10797ab883e9dcc256f21094f039c2bb3143 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Oct 2025 15:27:33 -0700 Subject: KVM: guest_memfd: Drop a superfluous local var in kvm_gmem_fault_user_mapping() Drop the local "int err" that's buried in the middle guest_memfd's user fault handler to avoid the potential for variable shadowing, e.g. if an "err" variable were also declared at function scope. No functional change intended. Link: https://lore.kernel.org/r/20251007222733.349460-1-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'virt') diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 9017e4d77c53..1e4af29159ea 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -361,12 +361,10 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) folio = kvm_gmem_get_folio(inode, vmf->pgoff); if (IS_ERR(folio)) { - int err = PTR_ERR(folio); - - if (err == -EAGAIN) + if (PTR_ERR(folio) == -EAGAIN) return VM_FAULT_RETRY; - return vmf_error(err); + return vmf_error(PTR_ERR(folio)); } if (WARN_ON_ONCE(folio_test_large(folio))) { -- cgit v1.2.3 From 497b1dfbcacf4e45c9cd3f594959918ca0e4536b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 10:28:42 -0700 Subject: KVM: guest_memfd: Rename "struct kvm_gmem" to "struct gmem_file" Rename the "kvm_gmem" structure to "gmem_file" in anticipation of using dedicated guest_memfd inodes instead of anonyomous inodes, at which point the "kvm_gmem" nomenclature becomes quite misleading. In guest_memfd, inodes are effectively the raw underlying physical storage, and will be used to track properties of the physical memory, while each gmem file is effectively a single VM's view of that storage, and is used to track assets specific to its associated VM, e.g. memslots=>gmem bindings. Using "kvm_gmem" suggests that the per-VM/per-file structures are _the_ guest_memfd instance, which almost the exact opposite of reality. Opportunistically rename local variables from "gmem" to "f", again to avoid confusion once guest_memfd specific inodes come along. No functional change intended. Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: Shivank Garg Tested-by: Shivank Garg Link: https://lore.kernel.org/r/20251016172853.52451-2-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 98 +++++++++++++++++++++++++++----------------------- 1 file changed, 53 insertions(+), 45 deletions(-) (limited to 'virt') diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 1e4af29159ea..2989c5fe426f 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -7,7 +7,16 @@ #include "kvm_mm.h" -struct kvm_gmem { +/* + * A guest_memfd instance can be associated multiple VMs, each with its own + * "view" of the underlying physical memory. + * + * The gmem's inode is effectively the raw underlying physical storage, and is + * used to track properties of the physical memory, while each gmem file is + * effectively a single VM's view of that storage, and is used to track assets + * specific to its associated VM, e.g. memslots=>gmem bindings. + */ +struct gmem_file { struct kvm *kvm; struct xarray bindings; struct list_head entry; @@ -115,16 +124,16 @@ static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *in return KVM_FILTER_PRIVATE; } -static void __kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start, +static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start, pgoff_t end, enum kvm_gfn_range_filter attr_filter) { bool flush = false, found_memslot = false; struct kvm_memory_slot *slot; - struct kvm *kvm = gmem->kvm; + struct kvm *kvm = f->kvm; unsigned long index; - xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) { + xa_for_each_range(&f->bindings, index, slot, start, end - 1) { pgoff_t pgoff = slot->gmem.pgoff; struct kvm_gfn_range gfn_range = { @@ -157,20 +166,20 @@ static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start, { struct list_head *gmem_list = &inode->i_mapping->i_private_list; enum kvm_gfn_range_filter attr_filter; - struct kvm_gmem *gmem; + struct gmem_file *f; attr_filter = kvm_gmem_get_invalidate_filter(inode); - list_for_each_entry(gmem, gmem_list, entry) - __kvm_gmem_invalidate_begin(gmem, start, end, attr_filter); + list_for_each_entry(f, gmem_list, entry) + __kvm_gmem_invalidate_begin(f, start, end, attr_filter); } -static void __kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start, +static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start, pgoff_t end) { - struct kvm *kvm = gmem->kvm; + struct kvm *kvm = f->kvm; - if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { + if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { KVM_MMU_LOCK(kvm); kvm_mmu_invalidate_end(kvm); KVM_MMU_UNLOCK(kvm); @@ -181,10 +190,10 @@ static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start, pgoff_t end) { struct list_head *gmem_list = &inode->i_mapping->i_private_list; - struct kvm_gmem *gmem; + struct gmem_file *f; - list_for_each_entry(gmem, gmem_list, entry) - __kvm_gmem_invalidate_end(gmem, start, end); + list_for_each_entry(f, gmem_list, entry) + __kvm_gmem_invalidate_end(f, start, end); } static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len) @@ -282,9 +291,9 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset, static int kvm_gmem_release(struct inode *inode, struct file *file) { - struct kvm_gmem *gmem = file->private_data; + struct gmem_file *f = file->private_data; struct kvm_memory_slot *slot; - struct kvm *kvm = gmem->kvm; + struct kvm *kvm = f->kvm; unsigned long index; /* @@ -304,7 +313,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file) filemap_invalidate_lock(inode->i_mapping); - xa_for_each(&gmem->bindings, index, slot) + xa_for_each(&f->bindings, index, slot) WRITE_ONCE(slot->gmem.file, NULL); /* @@ -312,18 +321,18 @@ static int kvm_gmem_release(struct inode *inode, struct file *file) * Zap all SPTEs pointed at by this file. Do not free the backing * memory, as its lifetime is associated with the inode, not the file. */ - __kvm_gmem_invalidate_begin(gmem, 0, -1ul, + __kvm_gmem_invalidate_begin(f, 0, -1ul, kvm_gmem_get_invalidate_filter(inode)); - __kvm_gmem_invalidate_end(gmem, 0, -1ul); + __kvm_gmem_invalidate_end(f, 0, -1ul); - list_del(&gmem->entry); + list_del(&f->entry); filemap_invalidate_unlock(inode->i_mapping); mutex_unlock(&kvm->slots_lock); - xa_destroy(&gmem->bindings); - kfree(gmem); + xa_destroy(&f->bindings); + kfree(f); kvm_put_kvm(kvm); @@ -491,7 +500,7 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm) static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) { const char *anon_name = "[kvm-gmem]"; - struct kvm_gmem *gmem; + struct gmem_file *f; struct inode *inode; struct file *file; int fd, err; @@ -500,14 +509,13 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) if (fd < 0) return fd; - gmem = kzalloc(sizeof(*gmem), GFP_KERNEL); - if (!gmem) { + f = kzalloc(sizeof(*f), GFP_KERNEL); + if (!f) { err = -ENOMEM; goto err_fd; } - file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem, - O_RDWR, NULL); + file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, f, O_RDWR, NULL); if (IS_ERR(file)) { err = PTR_ERR(file); goto err_gmem; @@ -529,15 +537,15 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); kvm_get_kvm(kvm); - gmem->kvm = kvm; - xa_init(&gmem->bindings); - list_add(&gmem->entry, &inode->i_mapping->i_private_list); + f->kvm = kvm; + xa_init(&f->bindings); + list_add(&f->entry, &inode->i_mapping->i_private_list); fd_install(fd, file); return fd; err_gmem: - kfree(gmem); + kfree(f); err_fd: put_unused_fd(fd); return err; @@ -562,7 +570,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, { loff_t size = slot->npages << PAGE_SHIFT; unsigned long start, end; - struct kvm_gmem *gmem; + struct gmem_file *f; struct inode *inode; struct file *file; int r = -EINVAL; @@ -576,8 +584,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, if (file->f_op != &kvm_gmem_fops) goto err; - gmem = file->private_data; - if (gmem->kvm != kvm) + f = file->private_data; + if (f->kvm != kvm) goto err; inode = file_inode(file); @@ -591,8 +599,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, start = offset >> PAGE_SHIFT; end = start + slot->npages; - if (!xa_empty(&gmem->bindings) && - xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) { + if (!xa_empty(&f->bindings) && + xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) { filemap_invalidate_unlock(inode->i_mapping); goto err; } @@ -607,7 +615,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, if (kvm_gmem_supports_mmap(inode)) slot->flags |= KVM_MEMSLOT_GMEM_ONLY; - xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL); + xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL); filemap_invalidate_unlock(inode->i_mapping); /* @@ -625,7 +633,7 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) { unsigned long start = slot->gmem.pgoff; unsigned long end = start + slot->npages; - struct kvm_gmem *gmem; + struct gmem_file *f; struct file *file; /* @@ -636,10 +644,10 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) if (!file) return; - gmem = file->private_data; + f = file->private_data; filemap_invalidate_lock(file->f_mapping); - xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL); + xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL); /* * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn() @@ -657,17 +665,17 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file, pgoff_t index, kvm_pfn_t *pfn, bool *is_prepared, int *max_order) { - struct file *gmem_file = READ_ONCE(slot->gmem.file); - struct kvm_gmem *gmem = file->private_data; + struct file *slot_file = READ_ONCE(slot->gmem.file); + struct gmem_file *f = file->private_data; struct folio *folio; - if (file != gmem_file) { - WARN_ON_ONCE(gmem_file); + if (file != slot_file) { + WARN_ON_ONCE(slot_file); return ERR_PTR(-EFAULT); } - if (xa_load(&gmem->bindings, index) != slot) { - WARN_ON_ONCE(xa_load(&gmem->bindings, index)); + if (xa_load(&f->bindings, index) != slot) { + WARN_ON_ONCE(xa_load(&f->bindings, index)); return ERR_PTR(-EIO); } -- cgit v1.2.3 From 392dd9d9488a8a81c2d58f9f4eee99c5b7b8e1c7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 10:28:43 -0700 Subject: KVM: guest_memfd: Add macro to iterate over gmem_files for a mapping/inode Add a kvm_gmem_for_each_file() to make it more obvious that KVM is iterating over guest_memfd _files_, not guest_memfd instances, as could be assumed given the name "gmem_list". No functional change intended. Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Reviewed-by: Shivank Garg Tested-by: Shivank Garg Link: https://lore.kernel.org/r/20251016172853.52451-3-seanjc@google.com [sean: drop .clang-format change] Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'virt') diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 2989c5fe426f..5cce20ff418d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -22,6 +22,9 @@ struct gmem_file { struct list_head entry; }; +#define kvm_gmem_for_each_file(f, mapping) \ + list_for_each_entry(f, &(mapping)->i_private_list, entry) + /** * folio_file_pfn - like folio_file_page, but return a pfn. * @folio: The folio which contains this index. @@ -164,13 +167,12 @@ static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start, static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start, pgoff_t end) { - struct list_head *gmem_list = &inode->i_mapping->i_private_list; enum kvm_gfn_range_filter attr_filter; struct gmem_file *f; attr_filter = kvm_gmem_get_invalidate_filter(inode); - list_for_each_entry(f, gmem_list, entry) + kvm_gmem_for_each_file(f, inode->i_mapping) __kvm_gmem_invalidate_begin(f, start, end, attr_filter); } @@ -189,10 +191,9 @@ static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start, static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start, pgoff_t end) { - struct list_head *gmem_list = &inode->i_mapping->i_private_list; struct gmem_file *f; - list_for_each_entry(f, gmem_list, entry) + kvm_gmem_for_each_file(f, inode->i_mapping) __kvm_gmem_invalidate_end(f, start, end); } -- cgit v1.2.3 From a63ca4236e6799cf4343f9aec9d92afdfa582446 Mon Sep 17 00:00:00 2001 From: Ackerley Tng Date: Thu, 16 Oct 2025 10:28:44 -0700 Subject: KVM: guest_memfd: Use guest mem inodes instead of anonymous inodes guest_memfd's inode represents memory the guest_memfd is providing. guest_memfd's file represents a struct kvm's view of that memory. Using a custom inode allows customization of the inode teardown process via callbacks. For example, ->evict_inode() allows customization of the truncation process on file close, and ->destroy_inode() and ->free_inode() allow customization of the inode freeing process. Customizing the truncation process allows flexibility in management of guest_memfd memory and customization of the inode freeing process allows proper cleanup of memory metadata stored on the inode. Memory metadata is more appropriately stored on the inode (as opposed to the file), since the metadata is for the memory and is not unique to a specific binding and struct kvm. Acked-by: David Hildenbrand Co-developed-by: Fuad Tabba Signed-off-by: Fuad Tabba Signed-off-by: Ackerley Tng Signed-off-by: Shivank Garg Tested-by: Ashish Kalra [sean: drop helpers, open code logic in __kvm_gmem_create()] Link: https://lore.kernel.org/r/20251016172853.52451-4-seanjc@google.com Signed-off-by: Sean Christopherson --- include/uapi/linux/magic.h | 1 + virt/kvm/guest_memfd.c | 82 ++++++++++++++++++++++++++++++++++++++-------- virt/kvm/kvm_main.c | 7 +++- virt/kvm/kvm_mm.h | 9 ++--- 4 files changed, 80 insertions(+), 19 deletions(-) (limited to 'virt') diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index bb575f3ab45e..638ca21b7a90 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -103,5 +103,6 @@ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #define PID_FS_MAGIC 0x50494446 /* "PIDF" */ +#define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 5cce20ff418d..ce04fc85e631 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -1,12 +1,16 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include +#include #include +#include #include -#include #include "kvm_mm.h" +static struct vfsmount *kvm_gmem_mnt; + /* * A guest_memfd instance can be associated multiple VMs, each with its own * "view" of the underlying physical memory. @@ -424,11 +428,6 @@ static struct file_operations kvm_gmem_fops = { .fallocate = kvm_gmem_fallocate, }; -void kvm_gmem_init(struct module *module) -{ - kvm_gmem_fops.owner = module; -} - static int kvm_gmem_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) @@ -500,7 +499,7 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm) static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) { - const char *anon_name = "[kvm-gmem]"; + static const char *name = "[kvm-gmem]"; struct gmem_file *f; struct inode *inode; struct file *file; @@ -516,16 +515,17 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) goto err_fd; } - file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, f, O_RDWR, NULL); - if (IS_ERR(file)) { - err = PTR_ERR(file); + /* __fput() will take care of fops_put(). */ + if (!fops_get(&kvm_gmem_fops)) { + err = -ENOENT; goto err_gmem; } - file->f_flags |= O_LARGEFILE; - - inode = file->f_inode; - WARN_ON(file->f_mapping != inode->i_mapping); + inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto err_fops; + } inode->i_private = (void *)(unsigned long)flags; inode->i_op = &kvm_gmem_iops; @@ -537,6 +537,15 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) /* Unmovable mappings are supposed to be marked unevictable as well. */ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto err_inode; + } + + file->f_flags |= O_LARGEFILE; + file->private_data = f; + kvm_get_kvm(kvm); f->kvm = kvm; xa_init(&f->bindings); @@ -545,6 +554,10 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) fd_install(fd, file); return fd; +err_inode: + iput(inode); +err_fops: + fops_put(&kvm_gmem_fops); err_gmem: kfree(f); err_fd: @@ -816,3 +829,44 @@ put_folio_and_exit: } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); #endif + +static int kvm_gmem_init_fs_context(struct fs_context *fc) +{ + if (!init_pseudo(fc, GUEST_MEMFD_MAGIC)) + return -ENOMEM; + + fc->s_iflags |= SB_I_NOEXEC; + fc->s_iflags |= SB_I_NODEV; + + return 0; +} + +static struct file_system_type kvm_gmem_fs = { + .name = "guest_memfd", + .init_fs_context = kvm_gmem_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int kvm_gmem_init_mount(void) +{ + kvm_gmem_mnt = kern_mount(&kvm_gmem_fs); + + if (IS_ERR(kvm_gmem_mnt)) + return PTR_ERR(kvm_gmem_mnt); + + kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC; + return 0; +} + +int kvm_gmem_init(struct module *module) +{ + kvm_gmem_fops.owner = module; + + return kvm_gmem_init_mount(); +} + +void kvm_gmem_exit(void) +{ + kern_unmount(kvm_gmem_mnt); + kvm_gmem_mnt = NULL; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..4845e5739436 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -6517,7 +6517,9 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) if (WARN_ON_ONCE(r)) goto err_vfio; - kvm_gmem_init(module); + r = kvm_gmem_init(module); + if (r) + goto err_gmem; r = kvm_init_virtualization(); if (r) @@ -6538,6 +6540,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module) err_register: kvm_uninit_virtualization(); err_virt: + kvm_gmem_exit(); +err_gmem: kvm_vfio_ops_exit(); err_vfio: kvm_async_pf_deinit(); @@ -6569,6 +6573,7 @@ void kvm_exit(void) for_each_possible_cpu(cpu) free_cpumask_var(per_cpu(cpu_kick_mask, cpu)); kmem_cache_destroy(kvm_vcpu_cache); + kvm_gmem_exit(); kvm_vfio_ops_exit(); kvm_async_pf_deinit(); kvm_irqfd_exit(); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index 31defb08ccba..9fcc5d5b7f8d 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -68,17 +68,18 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, #endif /* HAVE_KVM_PFNCACHE */ #ifdef CONFIG_KVM_GUEST_MEMFD -void kvm_gmem_init(struct module *module); +int kvm_gmem_init(struct module *module); +void kvm_gmem_exit(void); int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args); int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned int fd, loff_t offset); void kvm_gmem_unbind(struct kvm_memory_slot *slot); #else -static inline void kvm_gmem_init(struct module *module) +static inline int kvm_gmem_init(struct module *module) { - + return 0; } - +static inline void kvm_gmem_exit(void) {}; static inline int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned int fd, loff_t offset) -- cgit v1.2.3 From f609e89ae8936dbba8992ada83d27ece5cb8393a Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Thu, 16 Oct 2025 10:28:45 -0700 Subject: KVM: guest_memfd: Add slab-allocated inode cache Add a dedicated gmem_inode structure and a slab-allocated inode cache for guest memory backing, similar to how shmem handles inodes. This adds the necessary allocation/destruction functions and prepares for upcoming guest_memfd NUMA policy support changes. Using a dedicated structure will also allow for additional cleanups, e.g. to track flags in gmem_inode instead of i_private. Signed-off-by: Shivank Garg Tested-by: Ashish Kalra [sean: s/kvm_gmem_inode_info/gmem_inode, name init_once()] Reviewed-by: Ackerley Tng Tested-by: Ackerley Tng Link: https://lore.kernel.org/r/20251016172853.52451-5-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index ce04fc85e631..88fd812f0f31 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -26,6 +26,15 @@ struct gmem_file { struct list_head entry; }; +struct gmem_inode { + struct inode vfs_inode; +}; + +static __always_inline struct gmem_inode *GMEM_I(struct inode *inode) +{ + return container_of(inode, struct gmem_inode, vfs_inode); +} + #define kvm_gmem_for_each_file(f, mapping) \ list_for_each_entry(f, &(mapping)->i_private_list, entry) @@ -830,13 +839,61 @@ put_folio_and_exit: EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); #endif +static struct kmem_cache *kvm_gmem_inode_cachep; + +static void kvm_gmem_init_inode_once(void *__gi) +{ + struct gmem_inode *gi = __gi; + + /* + * Note! Don't initialize the inode with anything specific to the + * guest_memfd instance, or that might be specific to how the inode is + * used (from the VFS-layer's perspective). This hook is called only + * during the initial slab allocation, i.e. only fields/state that are + * idempotent across _all_ use of the inode _object_ can be initialized + * at this time! + */ + inode_init_once(&gi->vfs_inode); +} + +static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) +{ + struct gmem_inode *gi; + + gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL); + if (!gi) + return NULL; + + return &gi->vfs_inode; +} + +static void kvm_gmem_destroy_inode(struct inode *inode) +{ +} + +static void kvm_gmem_free_inode(struct inode *inode) +{ + kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode)); +} + +static const struct super_operations kvm_gmem_super_operations = { + .statfs = simple_statfs, + .alloc_inode = kvm_gmem_alloc_inode, + .destroy_inode = kvm_gmem_destroy_inode, + .free_inode = kvm_gmem_free_inode, +}; + static int kvm_gmem_init_fs_context(struct fs_context *fc) { + struct pseudo_fs_context *ctx; + if (!init_pseudo(fc, GUEST_MEMFD_MAGIC)) return -ENOMEM; fc->s_iflags |= SB_I_NOEXEC; fc->s_iflags |= SB_I_NODEV; + ctx = fc->fs_private; + ctx->ops = &kvm_gmem_super_operations; return 0; } @@ -860,13 +917,31 @@ static int kvm_gmem_init_mount(void) int kvm_gmem_init(struct module *module) { + struct kmem_cache_args args = { + .align = 0, + .ctor = kvm_gmem_init_inode_once, + }; + int ret; + kvm_gmem_fops.owner = module; + kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache", + sizeof(struct gmem_inode), + &args, SLAB_ACCOUNT); + if (!kvm_gmem_inode_cachep) + return -ENOMEM; - return kvm_gmem_init_mount(); + ret = kvm_gmem_init_mount(); + if (ret) { + kmem_cache_destroy(kvm_gmem_inode_cachep); + return ret; + } + return 0; } void kvm_gmem_exit(void) { kern_unmount(kvm_gmem_mnt); kvm_gmem_mnt = NULL; + rcu_barrier(); + kmem_cache_destroy(kvm_gmem_inode_cachep); } -- cgit v1.2.3 From ed1ffa810bd600ae40c794d87e4ca587dee5fa3a Mon Sep 17 00:00:00 2001 From: Shivank Garg Date: Thu, 16 Oct 2025 10:28:46 -0700 Subject: KVM: guest_memfd: Enforce NUMA mempolicy using shared policy Previously, guest-memfd allocations followed local NUMA node id in absence of process mempolicy, resulting in arbitrary memory allocation. Moreover, mbind() couldn't be used by the VMM as guest memory wasn't mapped into userspace when allocation occurred. Enable NUMA policy support by implementing vm_ops for guest-memfd mmap operation. This allows the VMM to use mmap()+mbind() to set the desired NUMA policy for a range of memory, and provides fine-grained control over guest memory allocation across NUMA nodes. Note, using mmap()+mbind() works even for PRIVATE memory, as mbind() doesn't require the memory to be faulted in. However, get_mempolicy() and other paths that require the userspace page tables to be populated may return incorrect information for PRIVATE memory (though under the hood, KVM+guest_memfd will still behave correctly). Store the policy in the inode structure, gmem_inode, as a shared memory policy, so that the policy is a property of the physical memory itself, i.e. not bound to the VMA. In guest_memfd, KVM is the primary MMU and any VMAs are secondary, i.e. using mbind() on a VMA to set policy is a means to an end, e.g. to avoid having to add a file-based equivalent to mbind(). Similarly, retrieve the policy via mpol_shared_policy_lookup(), not get_vma_policy(), even when allocating to fault in memory for userspace mappings, so that the policy stored in gmem_inode is always the source of true. Apply policy changes only to future allocations, i.e. do not migrate existing memory in the guest_memfd instance. This matches mbind(2)'s default behavior, which affects only new allocations unless overridden with MPOL_MF_MOVE/MPOL_MF_MOVE_ALL flags (which are not supported by guest_memfd as guest_memfd memory is unmovable). Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Signed-off-by: Shivank Garg Tested-by: Ashish Kalra Link: https://lore.kernel.org/all/e9d43abc-bcdb-4f9f-9ad7-5644f714de19@amd.com [sean: fold in fixup (see Link above), massage changelog] Link: https://lore.kernel.org/r/20251016172853.52451-6-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) (limited to 'virt') diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 88fd812f0f31..4463643bd0a2 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ struct gmem_file { }; struct gmem_inode { + struct shared_policy policy; struct inode vfs_inode; }; @@ -129,7 +131,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot, static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) { /* TODO: Support huge pages. */ - return filemap_grab_folio(inode->i_mapping, index); + struct mempolicy *policy; + struct folio *folio; + + /* + * Fast-path: See if folio is already present in mapping to avoid + * policy_lookup. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED, 0); + if (!IS_ERR(folio)) + return folio; + + policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index); + folio = __filemap_get_folio_mpol(inode->i_mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(inode->i_mapping), policy); + mpol_cond_put(policy); + + return folio; } static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode) @@ -411,8 +431,40 @@ out_folio: return ret; } +#ifdef CONFIG_NUMA +static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) +{ + struct inode *inode = file_inode(vma->vm_file); + + return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol); +} + +static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma, + unsigned long addr, pgoff_t *pgoff) +{ + struct inode *inode = file_inode(vma->vm_file); + + *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT); + + /* + * Return the memory policy for this index, or NULL if none is set. + * + * Returning NULL, e.g. instead of the current task's memory policy, is + * important for the .get_policy kernel ABI: it indicates that no + * explicit policy has been set via mbind() for this memory. The caller + * can then replace NULL with the default memory policy instead of the + * current task's memory policy. + */ + return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff); +} +#endif /* CONFIG_NUMA */ + static const struct vm_operations_struct kvm_gmem_vm_ops = { - .fault = kvm_gmem_fault_user_mapping, + .fault = kvm_gmem_fault_user_mapping, +#ifdef CONFIG_NUMA + .get_policy = kvm_gmem_get_policy, + .set_policy = kvm_gmem_set_policy, +#endif }; static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma) @@ -864,11 +916,13 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) if (!gi) return NULL; + mpol_shared_policy_init(&gi->policy, NULL); return &gi->vfs_inode; } static void kvm_gmem_destroy_inode(struct inode *inode) { + mpol_free_shared_policy(&GMEM_I(inode)->policy); } static void kvm_gmem_free_inode(struct inode *inode) -- cgit v1.2.3 From e66438bb81c4ca773b51292c27e6b5baa34f9a5e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 16 Oct 2025 10:28:53 -0700 Subject: KVM: guest_memfd: Add gmem_inode.flags field instead of using i_private Track a guest_memfd instance's flags in gmem_inode instead of burying them in i_private. Burning an extra 8 bytes per inode is well worth the added clarity provided by explicit tracking. Reviewed-by: Shivank Garg Tested-by: Shivank Garg Link: https://lore.kernel.org/r/20251016172853.52451-13-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'virt') diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 4463643bd0a2..20f6e7fab58d 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -30,6 +30,8 @@ struct gmem_file { struct gmem_inode { struct shared_policy policy; struct inode vfs_inode; + + u64 flags; }; static __always_inline struct gmem_inode *GMEM_I(struct inode *inode) @@ -154,7 +156,7 @@ static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index) static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode) { - if ((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED) + if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED) return KVM_FILTER_SHARED; return KVM_FILTER_PRIVATE; @@ -385,9 +387,7 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) static bool kvm_gmem_supports_mmap(struct inode *inode) { - const u64 flags = (u64)inode->i_private; - - return flags & GUEST_MEMFD_FLAG_MMAP; + return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP; } static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) @@ -399,7 +399,7 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf) if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; - if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED)) + if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)) return VM_FAULT_SIGBUS; folio = kvm_gmem_get_folio(inode, vmf->pgoff); @@ -588,7 +588,6 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) goto err_fops; } - inode->i_private = (void *)(unsigned long)flags; inode->i_op = &kvm_gmem_iops; inode->i_mapping->a_ops = &kvm_gmem_aops; inode->i_mode |= S_IFREG; @@ -598,6 +597,8 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags) /* Unmovable mappings are supposed to be marked unevictable as well. */ WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping)); + GMEM_I(inode)->flags = flags; + file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops); if (IS_ERR(file)) { err = PTR_ERR(file); @@ -917,6 +918,8 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb) return NULL; mpol_shared_policy_init(&gi->policy, NULL); + + gi->flags = 0; return &gi->vfs_inode; } -- cgit v1.2.3 From 0bb4d9c39b76b7453040ec8fb27f69f8437d6fe1 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 7 Oct 2025 15:23:56 -0700 Subject: KVM: guest_memfd: Define a CLASS to get+put guest_memfd file from a memslot Add a CLASS to handle getting and putting a guest_memfd file given a memslot to reduce the amount of related boilerplate, and more importantly to minimize the chances of forgetting to put the file (thankfully the bug that prompted this didn't escape initial testing). Define a CLASS instead of using __free(fput) as _free() comes with subtle caveats related to FILO ordering (objects are freed in the order in which they are declared), and the recommended solution/workaround (declare file pointers exactly when they are initialized) is visually jarring relative to KVM's (and the kernel's) overall strict adherence to not mixing declarations and code. E.g. the use in kvm_gmem_populate() would be: slot = gfn_to_memslot(kvm, start_gfn); if (!kvm_slot_has_gmem(slot)) return -EINVAL; struct file *file __free(fput) = kvm_gmem_get_file(slot; if (!file) return -EFAULT; filemap_invalidate_lock(file->f_mapping); Note, using CLASS() still declares variables in the middle of code, but the syntactic sugar obfuscates the declaration, i.e. hides the anomaly to a large extent. No functional change intended. Link: https://lore.kernel.org/r/20251007222356.348349-1-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/guest_memfd.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'virt') diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 20f6e7fab58d..427c0acee9d7 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -385,6 +385,9 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) return get_file_active(&slot->gmem.file); } +DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T), + kvm_gmem_get_file(slot), struct kvm_memory_slot *slot); + static bool kvm_gmem_supports_mmap(struct inode *inode) { return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP; @@ -710,13 +713,12 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) unsigned long start = slot->gmem.pgoff; unsigned long end = start + slot->npages; struct gmem_file *f; - struct file *file; /* * Nothing to do if the underlying file was already closed (or is being * closed right now), kvm_gmem_release() invalidates all bindings. */ - file = kvm_gmem_get_file(slot); + CLASS(gmem_get_file, file)(slot); if (!file) return; @@ -731,8 +733,6 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) */ WRITE_ONCE(slot->gmem.file, NULL); filemap_invalidate_unlock(file->f_mapping); - - fput(file); } /* Returns a locked folio on success. */ @@ -778,19 +778,17 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, int *max_order) { pgoff_t index = kvm_gmem_get_index(slot, gfn); - struct file *file = kvm_gmem_get_file(slot); struct folio *folio; bool is_prepared = false; int r = 0; + CLASS(gmem_get_file, file)(slot); if (!file) return -EFAULT; folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order); - if (IS_ERR(folio)) { - r = PTR_ERR(folio); - goto out; - } + if (IS_ERR(folio)) + return PTR_ERR(folio); if (!is_prepared) r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); @@ -802,8 +800,6 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, else folio_put(folio); -out: - fput(file); return r; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); @@ -812,7 +808,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn); long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages, kvm_gmem_populate_cb post_populate, void *opaque) { - struct file *file; struct kvm_memory_slot *slot; void __user *p; @@ -828,7 +823,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long if (!kvm_slot_has_gmem(slot)) return -EINVAL; - file = kvm_gmem_get_file(slot); + CLASS(gmem_get_file, file)(slot); if (!file) return -EFAULT; @@ -886,7 +881,6 @@ put_folio_and_exit: filemap_invalidate_unlock(file->f_mapping); - fput(file); return ret && !i ? ret : i; } EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate); -- cgit v1.2.3 From 0a0da3f92118950862700497bc7917f0fbf6a6e8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:24 -0700 Subject: KVM: Make support for kvm_arch_vcpu_async_ioctl() mandatory Implement kvm_arch_vcpu_async_ioctl() "natively" in x86 and arm64 instead of relying on an #ifdef'd stub, and drop HAVE_KVM_VCPU_ASYNC_IOCTL in anticipation of using the API on x86. Once x86 uses the API, providing a stub for one architecture and having all other architectures opt-in requires more code than simply implementing the API in the lone holdout. Eliminating the Kconfig will also reduce churn if the API is renamed in the future (spoiler alert). No functional change intended. Acked-by: Claudio Imbrenda Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 6 ++++++ arch/loongarch/kvm/Kconfig | 1 - arch/mips/kvm/Kconfig | 1 - arch/powerpc/kvm/Kconfig | 1 - arch/riscv/kvm/Kconfig | 1 - arch/s390/kvm/Kconfig | 1 - arch/x86/kvm/x86.c | 6 ++++++ include/linux/kvm_host.h | 10 ---------- virt/kvm/Kconfig | 3 --- 9 files changed, 12 insertions(+), 18 deletions(-) (limited to 'virt') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 870953b4a8a7..ef5bf57f79b7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1835,6 +1835,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } +long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOIOCTLCMD; +} + void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig index ae64bbdf83a7..ed4f724db774 100644 --- a/arch/loongarch/kvm/Kconfig +++ b/arch/loongarch/kvm/Kconfig @@ -25,7 +25,6 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_MSI select HAVE_KVM_READONLY_MEM - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index ab57221fa4dd..cc13cc35f208 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -22,7 +22,6 @@ config KVM select EXPORT_UASM select KVM_COMMON select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_MMIO select KVM_GENERIC_MMU_NOTIFIER select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 2f2702c867f7..c9a2d50ff1b0 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -20,7 +20,6 @@ if VIRTUALIZATION config KVM bool select KVM_COMMON - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_VFIO select HAVE_KVM_IRQ_BYPASS diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index c50328212917..77379f77840a 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -23,7 +23,6 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING select HAVE_KVM_MSI - select HAVE_KVM_VCPU_ASYNC_IOCTL select HAVE_KVM_READONLY_MEM select HAVE_KVM_DIRTY_RING_ACQ_REL select KVM_COMMON diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index cae908d64550..96d16028e8b7 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -20,7 +20,6 @@ config KVM def_tristate y prompt "Kernel-based Virtual Machine (KVM) support" select HAVE_KVM_CPU_RELAX_INTERCEPT - select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_ASYNC_PF select KVM_ASYNC_PF_SYNC select KVM_COMMON diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b4b5d2d09634..ca5ba2caf314 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7240,6 +7240,12 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } +long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) +{ + return -ENOIOCTLCMD; +} + int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct kvm *kvm = filp->private_data; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5bd76cf394fa..7186b2ae4b57 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2437,18 +2437,8 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); -#else -static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, - unsigned long arg) -{ - return -ENOIOCTLCMD; -} -#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 5f0015c5dd95..267c7369c765 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -78,9 +78,6 @@ config HAVE_KVM_IRQ_BYPASS tristate select IRQ_BYPASS_MANAGER -config HAVE_KVM_VCPU_ASYNC_IOCTL - bool - config HAVE_KVM_VCPU_RUN_PID_CHANGE bool -- cgit v1.2.3 From 50efc2340a598da4bafa40bc01e18f8cf73a4ae3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 30 Oct 2025 13:09:25 -0700 Subject: KVM: Rename kvm_arch_vcpu_async_ioctl() to kvm_arch_vcpu_unlocked_ioctl() Rename the "async" ioctl API to "unlocked" so that upcoming usage in x86's TDX code doesn't result in a massive misnomer. To avoid having to retry SEAMCALLs, TDX needs to acquire kvm->lock *and* all vcpu->mutex locks, and acquiring all of those locks after/inside the current vCPU's mutex is a non-starter. However, TDX also needs to acquire the vCPU's mutex and load the vCPU, i.e. the handling is very much not async to the vCPU. No functional change intended. Acked-by: Claudio Imbrenda Reviewed-by: Yan Zhao Tested-by: Yan Zhao Tested-by: Kai Huang Link: https://patch.msgid.link/20251030200951.3402865-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/arm64/kvm/arm.c | 4 ++-- arch/loongarch/kvm/vcpu.c | 4 ++-- arch/mips/kvm/mips.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 4 ++-- arch/riscv/kvm/vcpu.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 6 +++--- 9 files changed, 19 insertions(+), 19 deletions(-) (limited to 'virt') diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index ef5bf57f79b7..cf23f6b07ec7 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1835,8 +1835,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { return -ENOIOCTLCMD; } diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 30e3b089a596..9a5844e85fd3 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -1471,8 +1471,8 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) return 0; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { void __user *argp = (void __user *)arg; struct kvm_vcpu *vcpu = filp->private_data; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index a75587018f44..b0fb92fda4d4 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -895,8 +895,8 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 2ba057171ebe..9a89a6d98f97 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -2028,8 +2028,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return -EINVAL; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index bccb919ca615..a4bd6077eecc 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -238,8 +238,8 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 16ba04062854..8c4caa5f2fcd 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -5730,8 +5730,8 @@ static long kvm_s390_vcpu_memsida_op(struct kvm_vcpu *vcpu, return r; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca5ba2caf314..b85cb213a336 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7240,8 +7240,8 @@ static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) return 0; } -long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, - unsigned long arg) +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl, + unsigned long arg) { return -ENOIOCTLCMD; } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7186b2ae4b57..d93f75b05ae2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1557,6 +1557,8 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); +long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg); vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf); int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext); @@ -2437,8 +2439,6 @@ static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_NO_POLL */ -long kvm_arch_vcpu_async_ioctl(struct file *filp, - unsigned int ioctl, unsigned long arg); void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..b7db1d5f71a8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4434,10 +4434,10 @@ static long kvm_vcpu_ioctl(struct file *filp, return r; /* - * Some architectures have vcpu ioctls that are asynchronous to vcpu - * execution; mutex_lock() would break them. + * Let arch code handle select vCPU ioctls without holding vcpu->mutex, + * e.g. to support ioctls that can run asynchronous to vCPU execution. */ - r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); + r = kvm_arch_vcpu_unlocked_ioctl(filp, ioctl, arg); if (r != -ENOIOCTLCMD) return r; -- cgit v1.2.3 From 32bd348be3fa07b26c5ea6b818a161c142dcc2f2 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 10 Nov 2025 11:32:27 +0800 Subject: KVM: Fix last_boosted_vcpu index assignment bug In kvm_vcpu_on_spin(), the loop counter 'i' is incorrectly written to last_boosted_vcpu instead of the actual vCPU index 'idx'. This causes last_boosted_vcpu to store the loop iteration count rather than the vCPU index, leading to incorrect round-robin behavior in subsequent directed yield operations. Fix this by using 'idx' instead of 'i' in the assignment. Signed-off-by: Wanpeng Li Reviewed-by: Sean Christopherson Message-ID: <20251110033232.12538-7-kernellwp@gmail.com> Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'virt') diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b7a0ae2a7b20..cde1eddbaa91 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4026,7 +4026,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) yielded = kvm_vcpu_yield_to(vcpu); if (yielded > 0) { - WRITE_ONCE(kvm->last_boosted_vcpu, i); + WRITE_ONCE(kvm->last_boosted_vcpu, idx); break; } else if (yielded < 0 && !--try) { break; -- cgit v1.2.3