From ba5c20a60eaf2ea2fc6f6fbf6d72e67b8bc2eef3 Mon Sep 17 00:00:00 2001
From: David Woodhouse
Date: Sat, 3 Aug 2024 00:03:41 +0100
Subject: [PATCH] KVM: Move gfn_to_pfn_cache invalidation to invalidate_range_end hook

The existing retry loop in hva_to_pfn_retry() is extremely pessimistic.
If there are any concurrent invalidations running, it's effectively just
a complex busy wait loop because its local mmu_notifier_retry_cache()
function will always return true.

Since multiple invalidations can be running in parallel, this can result
in a situation where hva_to_pfn_retry() just backs off and keeps
retrying forever, not making any progress.

Solve this by being a bit more selective about when to retry. Introduce
a separate 'needs invalidation' flag in the GPC, which allows it to be
marked invalid even while hva_to_pfn_retry() has dropped the lock to map
the newly-found PFN. This allows the invalidation to be moved to the
range_end hook, and the retry loop only occurs for a given GPC if its
particular uHVA is affected.

However, the contract for invalidate_range_{start,end} is not like a
simple TLB; the pages may have been freed by the time the end hook is
called. A "device" may not create new mappings after the _start_ hook is
called. To meet this requirement, hva_to_pfn_retry() now waits until no
invalidations which may affect its uHVA are still running, before
finally setting the ->valid flag and returning.

Signed-off-by: David Woodhouse
---
 include/linux/kvm_host.h  |   3 ++
 include/linux/kvm_types.h |   1 +
 virt/kvm/kvm_main.c       |  54 +++++++++++++------
 virt/kvm/kvm_mm.h         |  12 ++---
 virt/kvm/pfncache.c       | 106 ++++++++++++++++++++++++++------------
 5 files changed, 122 insertions(+), 54 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 79a6b1a63027a..a0739c343da5d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -770,6 +770,9 @@ struct kvm {
         /* For management / invalidation of gfn_to_pfn_caches */
         spinlock_t gpc_lock;
         struct list_head gpc_list;
+        u64 mmu_gpc_invalidate_range_start;
+        u64 mmu_gpc_invalidate_range_end;
+        wait_queue_head_t gpc_invalidate_wq;
 
         /*
          * created_vcpus is protected by kvm->lock, and is incremented
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 827ecc0b7e10a..4d8fbd87c3209 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -69,6 +69,7 @@ struct gfn_to_pfn_cache {
         void *khva;
         kvm_pfn_t pfn;
         bool active;
+        bool needs_invalidation;
         bool valid;
 };
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 92901656a0d41..01cde62841802 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -775,20 +775,26 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
          */
         spin_lock(&kvm->mn_invalidate_lock);
         kvm->mn_active_invalidate_count++;
+        if (likely(kvm->mn_active_invalidate_count == 1)) {
+                kvm->mmu_gpc_invalidate_range_start = range->start;
+                kvm->mmu_gpc_invalidate_range_end = range->end;
+        } else {
+                /*
+                 * Fully tracking multiple concurrent ranges has diminishing
+                 * returns. Keep things simple and just find the minimal range
+                 * which includes the current and new ranges. As there won't be
+                 * enough information to subtract a range after its invalidate
+                 * completes, any ranges invalidated concurrently will
+                 * accumulate and persist until all outstanding invalidates
+                 * complete.
+                 */
+                kvm->mmu_gpc_invalidate_range_start =
+                        min(kvm->mmu_gpc_invalidate_range_start, range->start);
+                kvm->mmu_gpc_invalidate_range_end =
+                        max(kvm->mmu_gpc_invalidate_range_end, range->end);
+        }
         spin_unlock(&kvm->mn_invalidate_lock);
 
-        /*
-         * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
-         * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
-         * each cache's lock. There are relatively few caches in existence at
-         * any given time, and the caches themselves can check for hva overlap,
-         * i.e. don't need to rely on memslot overlap checks for performance.
-         * Because this runs without holding mmu_lock, the pfn caches must use
-         * mn_active_invalidate_count (see above) instead of
-         * mmu_invalidate_in_progress.
-         */
-        gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
-
         /*
          * If one or more memslots were found and thus zapped, notify arch code
          * that guest memory has been reclaimed. This needs to be done *after*
@@ -842,19 +848,33 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 
         __kvm_handle_hva_range(kvm, &hva_range);
 
+        /*
+         * It's safe to invalidate the gfn_to_pfn_caches from this 'end'
+         * hook, because hva_to_pfn_retry() will wait until no active
+         * invalidations could be affecting the corresponding uHVA
+         * before allowing a newly mapped GPC to become valid.
+         */
+        gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
+
         /* Pairs with the increment in range_start(). */
         spin_lock(&kvm->mn_invalidate_lock);
         if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
                 --kvm->mn_active_invalidate_count;
         wake = !kvm->mn_active_invalidate_count;
+        if (wake) {
+                kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+                kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
+        }
         spin_unlock(&kvm->mn_invalidate_lock);
 
         /*
          * There can only be one waiter, since the wait happens under
          * slots_lock.
          */
-        if (wake)
+        if (wake) {
+                wake_up(&kvm->gpc_invalidate_wq);
                 rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+        }
 }
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -1164,7 +1184,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
         INIT_LIST_HEAD(&kvm->gpc_list);
         spin_lock_init(&kvm->gpc_lock);
-
+        init_waitqueue_head(&kvm->gpc_invalidate_wq);
+        kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+        kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
         INIT_LIST_HEAD(&kvm->devices);
         kvm->max_vcpus = KVM_MAX_VCPUS;
 
@@ -1340,8 +1362,10 @@ static void kvm_destroy_vm(struct kvm *kvm)
          * in-progress invalidations.
          */
         WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
-        if (kvm->mn_active_invalidate_count)
+        if (kvm->mn_active_invalidate_count) {
                 kvm->mn_active_invalidate_count = 0;
+                wake_up(&kvm->gpc_invalidate_wq);
+        }
         else
                 WARN_ON(kvm->mmu_invalidate_in_progress);
 #else
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 715f19669d01f..34e4e67f09f81 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -24,13 +24,13 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
                      bool *async, bool write_fault, bool *writable);
 
 #ifdef CONFIG_HAVE_KVM_PFNCACHE
-void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
-                                       unsigned long start,
-                                       unsigned long end);
+void gfn_to_pfn_cache_invalidate(struct kvm *kvm,
+                                 unsigned long start,
+                                 unsigned long end);
 #else
-static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
-                                                     unsigned long start,
-                                                     unsigned long end)
+static inline void gfn_to_pfn_cache_invalidate(struct kvm *kvm,
+                                               unsigned long start,
+                                               unsigned long end)
 {
 }
 #endif /* HAVE_KVM_PFNCACHE */
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c
index f0039efb9e1e3..3f48df8cd6e54 100644
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -20,10 +20,15 @@
 #include "kvm_mm.h"
 
 /*
- * MMU notifier 'invalidate_range_start' hook.
+ * MMU notifier 'invalidate_range_end' hook. The hva_to_pfn_retry() function
+ * below may look up a PFN just before it is zapped, and may be mapping it
+ * concurrently with the actual invalidation (with the GPC lock dropped). By
+ * using a separate 'needs_invalidation' flag, the concurrent invalidation
+ * can handle this case, causing hva_to_pfn_retry() to drop its result and
+ * retry correctly.
  */
-void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
-                                       unsigned long end)
+void gfn_to_pfn_cache_invalidate(struct kvm *kvm, unsigned long start,
+                                 unsigned long end)
 {
         struct gfn_to_pfn_cache *gpc;
 
@@ -32,7 +37,7 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
                 read_lock_irq(&gpc->lock);
 
                 /* Only a single page so no need to care about length */
-                if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+                if (gpc->needs_invalidation && !is_error_noslot_pfn(gpc->pfn) &&
                     gpc->uhva >= start && gpc->uhva < end) {
                         read_unlock_irq(&gpc->lock);
 
@@ -45,9 +50,11 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
                          */
 
                         write_lock_irq(&gpc->lock);
-                        if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
-                            gpc->uhva >= start && gpc->uhva < end)
+                        if (gpc->needs_invalidation && !is_error_noslot_pfn(gpc->pfn) &&
+                            gpc->uhva >= start && gpc->uhva < end) {
+                                gpc->needs_invalidation = false;
                                 gpc->valid = false;
+                        }
                         write_unlock_irq(&gpc->lock);
                         continue;
                 }
@@ -93,6 +100,9 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
         if (!gpc->valid)
                 return false;
 
+        /* If it's valid, it needs invalidation! */
+        WARN_ON_ONCE(!gpc->needs_invalidation);
+
         return true;
 }
 
@@ -124,32 +134,37 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
-static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
 {
         /*
-         * mn_active_invalidate_count acts for all intents and purposes
-         * like mmu_invalidate_in_progress here; but the latter cannot
-         * be used here because the invalidation of caches in the
-         * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
-         * is elevated.
-         *
-         * Note, it does not matter that mn_active_invalidate_count
-         * is not protected by gpc->lock. It is guaranteed to
-         * be elevated before the mmu_notifier acquires gpc->lock, and
-         * isn't dropped until after mmu_invalidate_seq is updated.
+         * No need for locking on GPC here because these fields are protected
+         * by gpc->refresh_lock.
          */
-        if (kvm->mn_active_invalidate_count)
-                return true;
+        return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+                (gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+                (gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
+}
 
-        /*
-         * Ensure mn_active_invalidate_count is read before
-         * mmu_invalidate_seq. This pairs with the smp_wmb() in
-         * mmu_notifier_invalidate_range_end() to guarantee either the
-         * old (non-zero) value of mn_active_invalidate_count or the
-         * new (incremented) value of mmu_invalidate_seq is observed.
-         */
-        smp_rmb();
-        return kvm->mmu_invalidate_seq != mmu_seq;
+static void gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc)
+{
+        spin_lock(&gpc->kvm->mn_invalidate_lock);
+        if (gpc_invalidations_pending(gpc)) {
+                DEFINE_WAIT(wait);
+
+                for (;;) {
+                        prepare_to_wait(&gpc->kvm->gpc_invalidate_wq, &wait,
+                                        TASK_UNINTERRUPTIBLE);
+
+                        if (!gpc_invalidations_pending(gpc))
+                                break;
+
+                        spin_unlock(&gpc->kvm->mn_invalidate_lock);
+                        schedule();
+                        spin_lock(&gpc->kvm->mn_invalidate_lock);
+                }
+                finish_wait(&gpc->kvm->gpc_invalidate_wq, &wait);
+        }
+        spin_unlock(&gpc->kvm->mn_invalidate_lock);
 }
 
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
@@ -158,7 +173,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
         void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
         kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
         void *new_khva = NULL;
-        unsigned long mmu_seq;
 
         lockdep_assert_held(&gpc->refresh_lock);
 
@@ -172,8 +186,16 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
         gpc->valid = false;
 
         do {
-                mmu_seq = gpc->kvm->mmu_invalidate_seq;
-                smp_rmb();
+                /*
+                 * The translation made by hva_to_pfn() below could be made
+                 * invalid as soon as it's mapped. But the uhva is already
+                 * known and that's all that gfn_to_pfn_cache_invalidate()
+                 * looks at. So set the 'needs_invalidation' flag to allow
+                 * the GPC to be marked invalid from the moment the lock is
+                 * dropped, before the corresponding PFN is even found (and,
+                 * more to the point, immediately afterwards).
+                 */
+                gpc->needs_invalidation = true;
 
                 write_unlock_irq(&gpc->lock);
 
@@ -217,6 +239,15 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
                         goto out_error;
                 }
 
+                /*
+                 * Avoid populating a GPC with a user address which is already
+                 * being invalidated, if the invalidate_range_start() notifier
+                 * has already been called. The actual invalidation happens in
+                 * the invalidate_range_end() callback, so wait until there are
+                 * no active invalidations (for the relevant HVA).
+                 */
+                gpc_wait_for_invalidations(gpc);
+
                 write_lock_irq(&gpc->lock);
 
                 /*
@@ -224,7 +255,14 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
                  * attempting to refresh.
                  */
                 WARN_ON_ONCE(gpc->valid);
-        } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+
+                /*
+                 * Since gfn_to_pfn_cache_invalidate() is called from the
+                 * kvm_mmu_notifier_invalidate_range_end() callback, it can
+                 * invalidate the GPC the moment after hva_to_pfn() returned
+                 * a valid PFN. If that happens, retry.
+ */ + } while (!gpc->needs_invalidation); gpc->valid = true; gpc->pfn = new_pfn; @@ -339,6 +377,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l */ if (ret) { gpc->valid = false; + gpc->needs_invalidation = false; gpc->pfn = KVM_PFN_ERR_FAULT; gpc->khva = NULL; } @@ -383,7 +422,7 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm) gpc->pfn = KVM_PFN_ERR_FAULT; gpc->gpa = INVALID_GPA; gpc->uhva = KVM_HVA_ERR_BAD; - gpc->active = gpc->valid = false; + gpc->active = gpc->valid = gpc->needs_invalidation = false; } static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva, @@ -453,6 +492,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc) write_lock_irq(&gpc->lock); gpc->active = false; gpc->valid = false; + gpc->needs_invalidation = false; /* * Leave the GPA => uHVA cache intact, it's protected by the -- 2.49.0