diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2021-05-27 08:09:15 -0400 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2021-08-03 03:44:03 -0400 |
commit | 52ac8b358b0cb7e91c966225fca61be5d1c984bc (patch) | |
tree | 3525b8de9f4e63f163ba3da4bba26c30801ca78c /virt | |
parent | db105fab8d141fc0d9179600c51eba0d168dad34 (diff) | |
download | lwn-52ac8b358b0cb7e91c966225fca61be5d1c984bc.tar.gz lwn-52ac8b358b0cb7e91c966225fca61be5d1c984bc.zip |
KVM: Block memslot updates across range_start() and range_end()
We would like to avoid taking mmu_lock for .invalidate_range_{start,end}()
notifications that are unrelated to KVM. Because mmu_notifier_count
must be modified while holding mmu_lock for write, and must always
be paired across start->end to stay balanced, lock elision must
happen in both or none. Therefore, in preparation for this change,
this patch prevents memslot updates across range_start() and range_end().
Note, technically flag-only memslot updates could be allowed in parallel,
but stalling a memslot update for a relatively short amount of time is
not a scalability issue, and this is all more than complex enough.
A long note on the locking: a previous version of the patch used an rwsem
to block the memslot update while the MMU notifier run, but this resulted
in the following deadlock involving the pseudo-lock tagged as
"mmu_notifier_invalidate_range_start".
======================================================
WARNING: possible circular locking dependency detected
5.12.0-rc3+ #6 Tainted: G OE
------------------------------------------------------
qemu-system-x86/3069 is trying to acquire lock:
ffffffff9c775ca0 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at: __mmu_notifier_invalidate_range_end+0x5/0x190
but task is already holding lock:
ffffaff7410a9160 (&kvm->mmu_notifier_slots_lock){.+.+}-{3:3}, at: kvm_mmu_notifier_invalidate_range_start+0x36d/0x4f0 [kvm]
which lock already depends on the new lock.
This corresponds to the following MMU notifier logic:
invalidate_range_start
take pseudo lock
down_read() (*)
release pseudo lock
invalidate_range_end
take pseudo lock (**)
up_read()
release pseudo lock
At point (*) we take the mmu_notifiers_slots_lock inside the pseudo lock;
at point (**) we take the pseudo lock inside the mmu_notifiers_slots_lock.
This could cause a deadlock (ignoring for a second that the pseudo lock
is not a lock):
- invalidate_range_start waits on down_read(), because the rwsem is
held by install_new_memslots
- install_new_memslots waits on down_write(), because the rwsem is
held till (another) invalidate_range_end finishes
- invalidate_range_end sits waits on the pseudo lock, held by
invalidate_range_start.
Removing the fairness of the rwsem breaks the cycle (in lockdep terms,
it would change the *shared* rwsem readers into *shared recursive*
readers), so open-code the wait using a readers count and a
spinlock. This also allows handling blockable and non-blockable
critical section in the same way.
Losing the rwsem fairness does theoretically allow MMU notifiers to
block install_new_memslots forever. Note that mm/mmu_notifier.c's own
retry scheme in mmu_interval_read_begin also uses wait/wake_up
and is likewise not fair.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Diffstat (limited to 'virt')
-rw-r--r-- | virt/kvm/kvm_main.c | 58 |
1 files changed, 54 insertions, 4 deletions
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5cc79373827f..8f9024d65866 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -604,11 +604,9 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, trace_kvm_set_spte_hva(address); /* - * .change_pte() must be surrounded by .invalidate_range_{start,end}(), - * and so always runs with an elevated notifier count. This obviates - * the need to bump the sequence count. + * .change_pte() must be surrounded by .invalidate_range_{start,end}(). */ - WARN_ON_ONCE(!kvm->mmu_notifier_count); + WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); } @@ -658,6 +656,18 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, trace_kvm_unmap_hva_range(range->start, range->end); + /* + * Prevent memslot modification between range_start() and range_end() + * so that conditionally locking provides the same result in both + * functions. Without that guarantee, the mmu_notifier_count + * adjustments will be imbalanced. + * + * Pairs with the decrement in range_end(). + */ + spin_lock(&kvm->mn_invalidate_lock); + kvm->mn_active_invalidate_count++; + spin_unlock(&kvm->mn_invalidate_lock); + __kvm_handle_hva_range(kvm, &hva_range); return 0; @@ -694,9 +704,22 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), }; + bool wake; __kvm_handle_hva_range(kvm, &hva_range); + /* Pairs with the increment in range_start(). */ + spin_lock(&kvm->mn_invalidate_lock); + wake = (--kvm->mn_active_invalidate_count == 0); + spin_unlock(&kvm->mn_invalidate_lock); + + /* + * There can only be one waiter, since the wait happens under + * slots_lock. + */ + if (wake) + rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); + BUG_ON(kvm->mmu_notifier_count < 0); } @@ -977,6 +1000,9 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); mutex_init(&kvm->slots_arch_lock); + spin_lock_init(&kvm->mn_invalidate_lock); + rcuwait_init(&kvm->mn_memslots_update_rcuwait); + INIT_LIST_HEAD(&kvm->devices); BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); @@ -1099,6 +1125,16 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_coalesced_mmio_free(kvm); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); + /* + * At this point, pending calls to invalidate_range_start() + * have completed but no more MMU notifiers will run, so + * mn_active_invalidate_count may remain unbalanced. + * No threads can be waiting in install_new_memslots as the + * last reference on KVM has been dropped, but freeing + * memslots would deadlock without this manual intervention. + */ + WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); + kvm->mn_active_invalidate_count = 0; #else kvm_arch_flush_shadow_all(kvm); #endif @@ -1360,7 +1396,21 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; + /* + * Do not store the new memslots while there are invalidations in + * progress (preparatory change for the next commit). + */ + spin_lock(&kvm->mn_invalidate_lock); + prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait); + while (kvm->mn_active_invalidate_count) { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&kvm->mn_invalidate_lock); + schedule(); + spin_lock(&kvm->mn_invalidate_lock); + } + finish_rcuwait(&kvm->mn_memslots_update_rcuwait); rcu_assign_pointer(kvm->memslots[as_id], slots); + spin_unlock(&kvm->mn_invalidate_lock); /* * Acquired in kvm_set_memslot. Must be released before synchronize |