KVM: Move gfn_to_pfn_cache invalidation to invalidate_range_end hook
author David Woodhouse <dwmw@amazon.co.uk>
Fri, 2 Aug 2024 23:03:41 +0000 (00:03 +0100)
committer David Woodhouse <dwmw@amazon.co.uk>
Tue, 20 Aug 2024 19:48:42 +0000 (20:48 +0100)
The existing retry loop in hva_to_pfn_retry() is extremely pessimistic.
If there are any concurrent invalidations running, it's effectively just
a complex busy wait loop because its local mmu_notifier_retry_cache()
function will always return true.

Since multiple invalidations can be running in parallel, this can result
in a situation where hva_to_pfn_retry() just backs off and keeps retrying
forever, never making any progress.

Solve this by being a bit more selective about when to retry.

Introduce a separate 'needs invalidation' flag in the GPC, which allows
it to be marked invalid even while hva_to_pfn_retry() has dropped the
lock to map the newly-found PFN. This allows the invalidation to be moved
to the range_end hook, and the retry loop occurs for a given GPC only if
its particular uHVA is affected.
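
A condensed user-space model of that handshake may help to illustrate the
flag's lifecycle. It is only a sketch, with a pthread mutex standing in for
the GPC's locks, a fake PFN lookup and hypothetical names; the real code is
in the pfncache.c hunks below:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct gpc_model {
    pthread_mutex_t lock;      /* stands in for gpc->lock */
    unsigned long uhva;
    unsigned long pfn;
    bool valid;
    bool needs_invalidation;
};

/* Stand-in for hva_to_pfn(); must run with the lock dropped. */
static unsigned long fake_hva_to_pfn(unsigned long uhva)
{
    return uhva >> 12;
}

/* Models gfn_to_pfn_cache_invalidate(), called from invalidate_range_end(). */
static void model_invalidate(struct gpc_model *gpc, unsigned long start,
                             unsigned long end)
{
    pthread_mutex_lock(&gpc->lock);
    if (gpc->needs_invalidation && gpc->uhva >= start && gpc->uhva < end) {
        gpc->needs_invalidation = false;  /* tells the mapper to retry */
        gpc->valid = false;
    }
    pthread_mutex_unlock(&gpc->lock);
}

/* Models the hva_to_pfn_retry() loop; called and returns with the lock held. */
static void model_refresh(struct gpc_model *gpc)
{
    unsigned long new_pfn;

    do {
        /* Invalidation may hit from the moment the lock is dropped. */
        gpc->needs_invalidation = true;
        pthread_mutex_unlock(&gpc->lock);

        new_pfn = fake_hva_to_pfn(gpc->uhva);

        pthread_mutex_lock(&gpc->lock);
        /* Flag cleared => an invalidation covered this uHVA; retry. */
    } while (!gpc->needs_invalidation);

    gpc->pfn = new_pfn;
    gpc->valid = true;
}

int main(void)
{
    struct gpc_model gpc = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .uhva = 0x3000UL,
    };

    pthread_mutex_lock(&gpc.lock);
    model_refresh(&gpc);
    pthread_mutex_unlock(&gpc.lock);

    /* A later invalidation covering the uHVA marks the cache invalid again. */
    model_invalidate(&gpc, 0x2000UL, 0x4000UL);
    printf("pfn=%lu valid=%d\n", gpc.pfn, gpc.valid);
    return 0;
}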

However, the contract for invalidate_range_{start,end} is not like a
simple TLB; the pages may have been freed by the time the end hook is
called. A "device" may not create new mappings after the _start_ hook is
called. To meet this requirement, hva_to_pfn_retry() now waits until no
invalidations which may affect its uHVA are still running, before finally
setting the ->valid flag and returning.
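
Again as a sketch rather than kernel code, that waiting step can be modelled
in user space with a condition variable standing in for the new
gpc_invalidate_wq waitqueue and a plain mutex for mn_invalidate_lock; the
names mirror the patch but the helpers here are hypothetical:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Model of the per-VM bookkeeping this patch adds to struct kvm. */
struct vm_model {
    pthread_mutex_t mn_invalidate_lock;
    pthread_cond_t  gpc_invalidate_wq;      /* stands in for the waitqueue */
    int             mn_active_invalidate_count;
    unsigned long   range_start, range_end; /* union of in-flight ranges */
};

static struct vm_model vm = {
    .mn_invalidate_lock = PTHREAD_MUTEX_INITIALIZER,
    .gpc_invalidate_wq  = PTHREAD_COND_INITIALIZER,
};

/* Like gpc_invalidations_pending(): does any in-flight range cover uhva? */
static bool invalidation_pending(unsigned long uhva)
{
    return vm.mn_active_invalidate_count &&
           vm.range_start <= uhva && uhva < vm.range_end;
}

/* invalidate_range_start(): grow the union of in-flight ranges. */
static void model_range_start(unsigned long start, unsigned long end)
{
    pthread_mutex_lock(&vm.mn_invalidate_lock);
    if (vm.mn_active_invalidate_count++ == 0) {
        vm.range_start = start;
        vm.range_end = end;
    } else {
        if (start < vm.range_start)
            vm.range_start = start;
        if (end > vm.range_end)
            vm.range_end = end;
    }
    pthread_mutex_unlock(&vm.mn_invalidate_lock);
}

/* invalidate_range_end(): when the last one finishes, reset the range and wake. */
static void model_range_end(void)
{
    pthread_mutex_lock(&vm.mn_invalidate_lock);
    if (--vm.mn_active_invalidate_count == 0) {
        vm.range_start = ~0UL;  /* mirrors resetting to KVM_HVA_ERR_BAD */
        vm.range_end = 0;
        pthread_cond_broadcast(&vm.gpc_invalidate_wq);
    }
    pthread_mutex_unlock(&vm.mn_invalidate_lock);
}

/* hva_to_pfn_retry() step: sleep until no in-flight invalidation covers uhva. */
static void model_wait_for_invalidations(unsigned long uhva)
{
    pthread_mutex_lock(&vm.mn_invalidate_lock);
    while (invalidation_pending(uhva))
        pthread_cond_wait(&vm.gpc_invalidate_wq, &vm.mn_invalidate_lock);
    pthread_mutex_unlock(&vm.mn_invalidate_lock);
}

static void *mapper(void *arg)
{
    (void)arg;
    model_wait_for_invalidations(0x1800UL);
    puts("no overlapping invalidation in flight; GPC may now become valid");
    return NULL;
}

int main(void)
{
    pthread_t t;

    model_range_start(0x1000UL, 0x2000UL);  /* covers the mapper's uHVA (0x1800) */
    pthread_create(&t, NULL, mapper, NULL);
    sleep(1);                               /* mapper stays blocked meanwhile */
    model_range_end();                      /* last invalidation ends: wakes the mapper */
    pthread_join(t, NULL);
    return 0;
}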

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
include/linux/kvm_host.h
include/linux/kvm_types.h
virt/kvm/kvm_main.c
virt/kvm/kvm_mm.h
virt/kvm/pfncache.c

index 79a6b1a63027a410559187bfc0d0722516be9316..a0739c343da5deacecaf843d946edcfcf5fbea58 100644 (file)
@@ -770,6 +770,9 @@ struct kvm {
        /* For management / invalidation of gfn_to_pfn_caches */
        spinlock_t gpc_lock;
        struct list_head gpc_list;
+       u64 mmu_gpc_invalidate_range_start;
+       u64 mmu_gpc_invalidate_range_end;
+       wait_queue_head_t gpc_invalidate_wq;
 
        /*
         * created_vcpus is protected by kvm->lock, and is incremented
index 827ecc0b7e10a8ebfb2f27fbbc43f96127abf247..4d8fbd87c3209dc871ed10d66398be931d5739cc 100644 (file)
@@ -69,6 +69,7 @@ struct gfn_to_pfn_cache {
        void *khva;
        kvm_pfn_t pfn;
        bool active;
+       bool needs_invalidation;
        bool valid;
 };
 
index 92901656a0d413651741ebd069d335f8f5f161be..01cde62841802d00fd8355786cf32c57558c1a82 100644 (file)
@@ -775,20 +775,26 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
         */
        spin_lock(&kvm->mn_invalidate_lock);
        kvm->mn_active_invalidate_count++;
+       if (likely(kvm->mn_active_invalidate_count == 1)) {
+               kvm->mmu_gpc_invalidate_range_start = range->start;
+               kvm->mmu_gpc_invalidate_range_end = range->end;
+       } else {
+               /*
+                * Fully tracking multiple concurrent ranges has diminishing
+                * returns. Keep things simple and just find the minimal range
+                * which includes the current and new ranges. As there won't be
+                * enough information to subtract a range after its invalidate
+                * completes, any ranges invalidated concurrently will
+                * accumulate and persist until all outstanding invalidates
+                * complete.
+                */
+               kvm->mmu_gpc_invalidate_range_start =
+                       min(kvm->mmu_gpc_invalidate_range_start, range->start);
+               kvm->mmu_gpc_invalidate_range_end =
+                       max(kvm->mmu_gpc_invalidate_range_end, range->end);
+       }
        spin_unlock(&kvm->mn_invalidate_lock);
 
-       /*
-        * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
-        * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
-        * each cache's lock.  There are relatively few caches in existence at
-        * any given time, and the caches themselves can check for hva overlap,
-        * i.e. don't need to rely on memslot overlap checks for performance.
-        * Because this runs without holding mmu_lock, the pfn caches must use
-        * mn_active_invalidate_count (see above) instead of
-        * mmu_invalidate_in_progress.
-        */
-       gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
-
        /*
         * If one or more memslots were found and thus zapped, notify arch code
         * that guest memory has been reclaimed.  This needs to be done *after*
@@ -842,19 +848,33 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 
        __kvm_handle_hva_range(kvm, &hva_range);
 
+       /*
+        * It's safe to invalidate the gfn_to_pfn_caches from this 'end'
+        * hook, because hva_to_pfn_retry() will wait until no active
+        * invalidations could be affecting the corresponding uHVA
+        * before allowing a newly mapped GPC to become valid.
+        */
+       gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
+
        /* Pairs with the increment in range_start(). */
        spin_lock(&kvm->mn_invalidate_lock);
        if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
                --kvm->mn_active_invalidate_count;
        wake = !kvm->mn_active_invalidate_count;
+       if (wake) {
+               kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+               kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
+       }
        spin_unlock(&kvm->mn_invalidate_lock);
 
        /*
         * There can only be one waiter, since the wait happens under
         * slots_lock.
         */
-       if (wake)
+       if (wake) {
+               wake_up(&kvm->gpc_invalidate_wq);
                rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+       }
 }
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -1164,7 +1184,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
        INIT_LIST_HEAD(&kvm->gpc_list);
        spin_lock_init(&kvm->gpc_lock);
-
+       init_waitqueue_head(&kvm->gpc_invalidate_wq);
+       kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+       kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
        INIT_LIST_HEAD(&kvm->devices);
        kvm->max_vcpus = KVM_MAX_VCPUS;
 
@@ -1340,8 +1362,10 @@ static void kvm_destroy_vm(struct kvm *kvm)
         * in-progress invalidations.
         */
        WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
-       if (kvm->mn_active_invalidate_count)
+       if (kvm->mn_active_invalidate_count) {
                kvm->mn_active_invalidate_count = 0;
+               wake_up(&kvm->gpc_invalidate_wq);
+       }
        else
                WARN_ON(kvm->mmu_invalidate_in_progress);
 #else
index 715f19669d01f72912af9b7393ccd01f65c0527e..34e4e67f09f81267e2976c7f9fab5339e59aa50a 100644 (file)
@@ -24,13 +24,13 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
                     bool *async, bool write_fault, bool *writable);
 
 #ifdef CONFIG_HAVE_KVM_PFNCACHE
-void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
-                                      unsigned long start,
-                                      unsigned long end);
+void gfn_to_pfn_cache_invalidate(struct kvm *kvm,
+                                unsigned long start,
+                                unsigned long end);
 #else
-static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
-                                                    unsigned long start,
-                                                    unsigned long end)
+static inline void gfn_to_pfn_cache_invalidate(struct kvm *kvm,
+                                              unsigned long start,
+                                              unsigned long end)
 {
 }
 #endif /* HAVE_KVM_PFNCACHE */
index f0039efb9e1e34315450bcb589429a8cecb37a73..3f48df8cd6e549b098e04a7dcde5a2441c86b4e0 100644 (file)
 #include "kvm_mm.h"
 
 /*
- * MMU notifier 'invalidate_range_start' hook.
+ * MMU notifier 'invalidate_range_end' hook. The hva_to_pfn_retry() function
+ * below may look up a PFN just before it is zapped, and may be mapping it
+ * concurrently with the actual invalidation (with the GPC lock dropped). By
+ * using a separate 'needs_invalidation' flag, the concurrent invalidation
+ * can handle this case, causing hva_to_pfn_retry() to drop its result and
+ * retry correctly.
  */
-void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
-                                      unsigned long end)
+void gfn_to_pfn_cache_invalidate(struct kvm *kvm, unsigned long start,
+                                unsigned long end)
 {
        struct gfn_to_pfn_cache *gpc;
 
@@ -32,7 +37,7 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
                read_lock_irq(&gpc->lock);
 
                /* Only a single page so no need to care about length */
-               if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+               if (gpc->needs_invalidation && !is_error_noslot_pfn(gpc->pfn) &&
                    gpc->uhva >= start && gpc->uhva < end) {
                        read_unlock_irq(&gpc->lock);
 
@@ -45,9 +50,11 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
                         */
 
                        write_lock_irq(&gpc->lock);
-                       if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
-                           gpc->uhva >= start && gpc->uhva < end)
+                       if (gpc->needs_invalidation && !is_error_noslot_pfn(gpc->pfn) &&
+                           gpc->uhva >= start && gpc->uhva < end) {
+                               gpc->needs_invalidation = false;
                                gpc->valid = false;
+                       }
                        write_unlock_irq(&gpc->lock);
                        continue;
                }
@@ -93,6 +100,9 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
        if (!gpc->valid)
                return false;
 
+       /* If it's valid, it needs invalidation! */
+       WARN_ON_ONCE(!gpc->needs_invalidation);
+
        return true;
 }
 
@@ -124,32 +134,37 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
-static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
 {
        /*
-        * mn_active_invalidate_count acts for all intents and purposes
-        * like mmu_invalidate_in_progress here; but the latter cannot
-        * be used here because the invalidation of caches in the
-        * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
-        * is elevated.
-        *
-        * Note, it does not matter that mn_active_invalidate_count
-        * is not protected by gpc->lock.  It is guaranteed to
-        * be elevated before the mmu_notifier acquires gpc->lock, and
-        * isn't dropped until after mmu_invalidate_seq is updated.
+        * No need for locking on GPC here because these fields are protected
+        * by gpc->refresh_lock.
         */
-       if (kvm->mn_active_invalidate_count)
-               return true;
+       return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+               (gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+               (gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
+}
 
-       /*
-        * Ensure mn_active_invalidate_count is read before
-        * mmu_invalidate_seq.  This pairs with the smp_wmb() in
-        * mmu_notifier_invalidate_range_end() to guarantee either the
-        * old (non-zero) value of mn_active_invalidate_count or the
-        * new (incremented) value of mmu_invalidate_seq is observed.
-        */
-       smp_rmb();
-       return kvm->mmu_invalidate_seq != mmu_seq;
+static void gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc)
+{
+       spin_lock(&gpc->kvm->mn_invalidate_lock);
+       if (gpc_invalidations_pending(gpc)) {
+               DEFINE_WAIT(wait);
+
+               for (;;) {
+                       prepare_to_wait(&gpc->kvm->gpc_invalidate_wq, &wait,
+                                       TASK_UNINTERRUPTIBLE);
+
+                       if (!gpc_invalidations_pending(gpc))
+                               break;
+
+                       spin_unlock(&gpc->kvm->mn_invalidate_lock);
+                       schedule();
+                       spin_lock(&gpc->kvm->mn_invalidate_lock);
+               }
+               finish_wait(&gpc->kvm->gpc_invalidate_wq, &wait);
+       }
+       spin_unlock(&gpc->kvm->mn_invalidate_lock);
 }
 
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
@@ -158,7 +173,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
        void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
        kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
        void *new_khva = NULL;
-       unsigned long mmu_seq;
 
        lockdep_assert_held(&gpc->refresh_lock);
 
@@ -172,8 +186,16 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
        gpc->valid = false;
 
        do {
-               mmu_seq = gpc->kvm->mmu_invalidate_seq;
-               smp_rmb();
+               /*
+                * The translation made by hva_to_pfn() below could be made
+                * invalid as soon as it's mapped. But the uhva is already
+                * known and that's all that gfn_to_pfn_cache_invalidate()
+                * looks at. So set the 'needs_invalidation' flag to allow
+                * the GPC to be marked invalid from the moment the lock is
+                * dropped, before the corresponding PFN is even found (and,
+                * more to the point, immediately afterwards).
+                */
+               gpc->needs_invalidation = true;
 
                write_unlock_irq(&gpc->lock);
 
@@ -217,6 +239,15 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
                        goto out_error;
                }
 
+               /*
+                * Avoid populating a GPC with a user address which is already
+                * being invalidated, if the invalidate_range_start() notifier
+                * has already been called. The actual invalidation happens in
+                * the invalidate_range_end() callback, so wait until there are
+                * no active invalidations (for the relevant HVA).
+                */
+               gpc_wait_for_invalidations(gpc);
+
                write_lock_irq(&gpc->lock);
 
                /*
@@ -224,7 +255,14 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
                 * attempting to refresh.
                 */
                WARN_ON_ONCE(gpc->valid);
-       } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+
+               /*
+                * Since gfn_to_pfn_cache_invalidate() is called from the
+                * kvm_mmu_notifier_invalidate_range_end() callback, it can
+                * invalidate the GPC the moment after hva_to_pfn() returned
+                * a valid PFN. If that happens, retry.
+                */
+       } while (!gpc->needs_invalidation);
 
        gpc->valid = true;
        gpc->pfn = new_pfn;
@@ -339,6 +377,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
         */
        if (ret) {
                gpc->valid = false;
+               gpc->needs_invalidation = false;
                gpc->pfn = KVM_PFN_ERR_FAULT;
                gpc->khva = NULL;
        }
@@ -383,7 +422,7 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
        gpc->pfn = KVM_PFN_ERR_FAULT;
        gpc->gpa = INVALID_GPA;
        gpc->uhva = KVM_HVA_ERR_BAD;
-       gpc->active = gpc->valid = false;
+       gpc->active = gpc->valid = gpc->needs_invalidation = false;
 }
 
 static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva,
@@ -453,6 +492,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
                write_lock_irq(&gpc->lock);
                gpc->active = false;
                gpc->valid = false;
+               gpc->needs_invalidation = false;
 
                /*
                 * Leave the GPA => uHVA cache intact, it's protected by the