KVM: Move gfn_to_pfn_cache invalidation to invalidate_range_end hook
author David Woodhouse <dwmw@amazon.co.uk>
Fri, 2 Aug 2024 23:03:41 +0000 (00:03 +0100)
committer David Woodhouse <dwmw@amazon.co.uk>
Tue, 20 Aug 2024 19:48:42 +0000 (20:48 +0100)
The existing retry loop in hva_to_pfn_retry() is extremely pessimistic.
If there are any concurrent invalidations running, it's effectively just
a complex busy wait loop because its local mmu_notifier_retry_cache()
function will always return true.

Since multiple invalidations can be running in parallel, this can result
in a situation where hva_to_pfn_retry() just backs off and keeps retrying
forever, never making any progress.

Solve this by being a bit more selective about when to retry.

Introduce a separate 'needs invalidation' flag in the GPC, which allows
it to be marked invalid even while hva_to_pfn_retry() has dropped the
lock to map the newly-found PFN. This allows the invalidation to be moved
to the range_end hook, and the retry loop occurs for a given GPC only if
its particular uHVA is affected.
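
A condensed user-space model of that handshake may help to illustrate the
flag's lifecycle. It is only a sketch, with a pthread mutex standing in for
the GPC's locks, a fake PFN lookup and hypothetical names; the real code is
in the pfncache.c hunks below:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct gpc_model {
    pthread_mutex_t lock;      /* stands in for gpc->lock */
    unsigned long uhva;
    unsigned long pfn;
    bool valid;
    bool needs_invalidation;
};

/* Stand-in for hva_to_pfn(); must run with the lock dropped. */
static unsigned long fake_hva_to_pfn(unsigned long uhva)
{
    return uhva >> 12;
}

/* Models gfn_to_pfn_cache_invalidate(), called from invalidate_range_end(). */
static void model_invalidate(struct gpc_model *gpc, unsigned long start,
                             unsigned long end)
{
    pthread_mutex_lock(&gpc->lock);
    if (gpc->needs_invalidation && gpc->uhva >= start && gpc->uhva < end) {
        gpc->needs_invalidation = false;  /* tells the mapper to retry */
        gpc->valid = false;
    }
    pthread_mutex_unlock(&gpc->lock);
}

/* Models the hva_to_pfn_retry() loop; called and returns with the lock held. */
static void model_refresh(struct gpc_model *gpc)
{
    unsigned long new_pfn;

    do {
        /* Invalidation may hit from the moment the lock is dropped. */
        gpc->needs_invalidation = true;
        pthread_mutex_unlock(&gpc->lock);

        new_pfn = fake_hva_to_pfn(gpc->uhva);

        pthread_mutex_lock(&gpc->lock);
        /* Flag cleared => an invalidation covered this uHVA; retry. */
    } while (!gpc->needs_invalidation);

    gpc->pfn = new_pfn;
    gpc->valid = true;
}

int main(void)
{
    struct gpc_model gpc = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .uhva = 0x3000UL,
    };

    pthread_mutex_lock(&gpc.lock);
    model_refresh(&gpc);
    pthread_mutex_unlock(&gpc.lock);

    /* A later invalidation covering the uHVA marks the cache invalid again. */
    model_invalidate(&gpc, 0x2000UL, 0x4000UL);
    printf("pfn=%lu valid=%d\n", gpc.pfn, gpc.valid);
    return 0;
}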

However, the contract for invalidate_range_{start,end} is not like a
simple TLB; the pages may have been freed by the time the end hook is
called. A "device" may not create new mappings after the _start_ hook is
called. To meet this requirement, hva_to_pfn_retry() now waits until no
invalidations which may affect its uHVA are still running, before finally
setting the ->valid flag and returning.
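
Again as a sketch rather than kernel code, that waiting step can be modelled
in user space with a condition variable standing in for the new
gpc_invalidate_wq waitqueue and a plain mutex for mn_invalidate_lock; the
names mirror the patch but the helpers here are hypothetical:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Model of the per-VM bookkeeping this patch adds to struct kvm. */
struct vm_model {
    pthread_mutex_t mn_invalidate_lock;
    pthread_cond_t  gpc_invalidate_wq;      /* stands in for the waitqueue */
    int             mn_active_invalidate_count;
    unsigned long   range_start, range_end; /* union of in-flight ranges */
};

static struct vm_model vm = {
    .mn_invalidate_lock = PTHREAD_MUTEX_INITIALIZER,
    .gpc_invalidate_wq  = PTHREAD_COND_INITIALIZER,
};

/* Like gpc_invalidations_pending(): does any in-flight range cover uhva? */
static bool invalidation_pending(unsigned long uhva)
{
    return vm.mn_active_invalidate_count &&
           vm.range_start <= uhva && uhva < vm.range_end;
}

/* invalidate_range_start(): grow the union of in-flight ranges. */
static void model_range_start(unsigned long start, unsigned long end)
{
    pthread_mutex_lock(&vm.mn_invalidate_lock);
    if (vm.mn_active_invalidate_count++ == 0) {
        vm.range_start = start;
        vm.range_end = end;
    } else {
        if (start < vm.range_start)
            vm.range_start = start;
        if (end > vm.range_end)
            vm.range_end = end;
    }
    pthread_mutex_unlock(&vm.mn_invalidate_lock);
}

/* invalidate_range_end(): when the last one finishes, reset the range and wake. */
static void model_range_end(void)
{
    pthread_mutex_lock(&vm.mn_invalidate_lock);
    if (--vm.mn_active_invalidate_count == 0) {
        vm.range_start = ~0UL;  /* mirrors resetting to KVM_HVA_ERR_BAD */
        vm.range_end = 0;
        pthread_cond_broadcast(&vm.gpc_invalidate_wq);
    }
    pthread_mutex_unlock(&vm.mn_invalidate_lock);
}

/* hva_to_pfn_retry() step: sleep until no in-flight invalidation covers uhva. */
static void model_wait_for_invalidations(unsigned long uhva)
{
    pthread_mutex_lock(&vm.mn_invalidate_lock);
    while (invalidation_pending(uhva))
        pthread_cond_wait(&vm.gpc_invalidate_wq, &vm.mn_invalidate_lock);
    pthread_mutex_unlock(&vm.mn_invalidate_lock);
}

static void *mapper(void *arg)
{
    (void)arg;
    model_wait_for_invalidations(0x1800UL);
    puts("no overlapping invalidation in flight; GPC may now become valid");
    return NULL;
}

int main(void)
{
    pthread_t t;

    model_range_start(0x1000UL, 0x2000UL);  /* covers the mapper's uHVA (0x1800) */
    pthread_create(&t, NULL, mapper, NULL);
    sleep(1);                               /* mapper stays blocked meanwhile */
    model_range_end();                      /* last invalidation ends: wakes the mapper */
    pthread_join(t, NULL);
    return 0;
}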

Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
include/linux/kvm_host.h
include/linux/kvm_types.h
virt/kvm/kvm_main.c
virt/kvm/kvm_mm.h
virt/kvm/pfncache.c

index 79a6b1a63027a410559187bfc0d0722516be9316..a0739c343da5deacecaf843d946edcfcf5fbea58 100644 (file)
@@ -770,6 +770,9 @@ struct kvm {
        /* For management / invalidation of gfn_to_pfn_caches */
        spinlock_t gpc_lock;
        struct list_head gpc_list;
+       u64 mmu_gpc_invalidate_range_start;
+       u64 mmu_gpc_invalidate_range_end;
+       wait_queue_head_t gpc_invalidate_wq;
 
        /*
         * created_vcpus is protected by kvm->lock, and is incremented
index 827ecc0b7e10a8ebfb2f27fbbc43f96127abf247..4d8fbd87c3209dc871ed10d66398be931d5739cc 100644 (file)
@@ -69,6 +69,7 @@ struct gfn_to_pfn_cache {
        void *khva;
        kvm_pfn_t pfn;
        bool active;
+       bool needs_invalidation;
        bool valid;
 };
 
index 92901656a0d413651741ebd069d335f8f5f161be..01cde62841802d00fd8355786cf32c57558c1a82 100644 (file)
@@ -775,20 +775,26 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
         */
        spin_lock(&kvm->mn_invalidate_lock);
        kvm->mn_active_invalidate_count++;
+       if (likely(kvm->mn_active_invalidate_count == 1)) {
+               kvm->mmu_gpc_invalidate_range_start = range->start;
+               kvm->mmu_gpc_invalidate_range_end = range->end;
+       } else {
+               /*
+                * Fully tracking multiple concurrent ranges has diminishing
+                * returns. Keep things simple and just find the minimal range
+                * which includes the current and new ranges. As there won't be
+                * enough information to subtract a range after its invalidate
+                * completes, any ranges invalidated concurrently will
+                * accumulate and persist until all outstanding invalidates
+                * complete.
+                */
+               kvm->mmu_gpc_invalidate_range_start =
+                       min(kvm->mmu_gpc_invalidate_range_start, range->start);
+               kvm->mmu_gpc_invalidate_range_end =
+                       max(kvm->mmu_gpc_invalidate_range_end, range->end);
+       }
        spin_unlock(&kvm->mn_invalidate_lock);
 
-       /*
-        * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
-        * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
-        * each cache's lock.  There are relatively few caches in existence at
-        * any given time, and the caches themselves can check for hva overlap,
-        * i.e. don't need to rely on memslot overlap checks for performance.
-        * Because this runs without holding mmu_lock, the pfn caches must use
-        * mn_active_invalidate_count (see above) instead of
-        * mmu_invalidate_in_progress.
-        */
-       gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
-
        /*
         * If one or more memslots were found and thus zapped, notify arch code
         * that guest memory has been reclaimed.  This needs to be done *after*
@@ -842,19 +848,33 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
 
        __kvm_handle_hva_range(kvm, &hva_range);
 
+       /*
+        * It's safe to invalidate the gfn_to_pfn_caches from this 'end'
+        * hook, because hva_to_pfn_retry() will wait until no active
+        * invalidations could be affecting the corresponding uHVA
+        * before allowing a newly mapped GPC to become valid.
+        */
+       gfn_to_pfn_cache_invalidate(kvm, range->start, range->end);
+
        /* Pairs with the increment in range_start(). */
        spin_lock(&kvm->mn_invalidate_lock);
        if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
                --kvm->mn_active_invalidate_count;
        wake = !kvm->mn_active_invalidate_count;
+       if (wake) {
+               kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+               kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
+       }
        spin_unlock(&kvm->mn_invalidate_lock);
 
        /*
         * There can only be one waiter, since the wait happens under
         * slots_lock.
         */
-       if (wake)
+       if (wake) {
+               wake_up(&kvm->gpc_invalidate_wq);
                rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
+       }
 }
 
 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
@@ -1164,7 +1184,9 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 
        INIT_LIST_HEAD(&kvm->gpc_list);
        spin_lock_init(&kvm->gpc_lock);
-
+       init_waitqueue_head(&kvm->gpc_invalidate_wq);
+       kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+       kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
        INIT_LIST_HEAD(&kvm->devices);
        kvm->max_vcpus = KVM_MAX_VCPUS;
 
@@ -1340,8 +1362,10 @@ static void kvm_destroy_vm(struct kvm *kvm)
         * in-progress invalidations.
         */
        WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
-       if (kvm->mn_active_invalidate_count)
+       if (kvm->mn_active_invalidate_count) {
                kvm->mn_active_invalidate_count = 0;
+               wake_up(&kvm->gpc_invalidate_wq);
+       }
        else
                WARN_ON(kvm->mmu_invalidate_in_progress);
 #else
index 715f19669d01f72912af9b7393ccd01f65c0527e..34e4e67f09f81267e2976c7f9fab5339e59aa50a 100644 (file)
@@ -24,13 +24,13 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
                     bool *async, bool write_fault, bool *writable);
 
 #ifdef CONFIG_HAVE_KVM_PFNCACHE
-void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
-                                      unsigned long start,
-                                      unsigned long end);
+void gfn_to_pfn_cache_invalidate(struct kvm *kvm,
+                                unsigned long start,
+                                unsigned long end);
 #else
-static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
-                                                    unsigned long start,
-                                                    unsigned long end)
+static inline void gfn_to_pfn_cache_invalidate(struct kvm *kvm,
+                                              unsigned long start,
+                                              unsigned long end)
 {
 }
 #endif /* HAVE_KVM_PFNCACHE */
index f0039efb9e1e34315450bcb589429a8cecb37a73..3f48df8cd6e549b098e04a7dcde5a2441c86b4e0 100644 (file)
 #include "kvm_mm.h"
 
 /*
- * MMU notifier 'invalidate_range_start' hook.
+ * MMU notifier 'invalidate_range_end' hook. The hva_to_pfn_retry() function
+ * below may look up a PFN just before it is zapped, and may be mapping it
+ * concurrently with the actual invalidation (with the GPC lock dropped). By
+ * using a separate 'needs_invalidation' flag, the concurrent invalidation
+ * can handle this case, causing hva_to_pfn_retry() to drop its result and
+ * retry correctly.
  */
-void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
-                                      unsigned long end)
+void gfn_to_pfn_cache_invalidate(struct kvm *kvm, unsigned long start,
+                                unsigned long end)
 {
        struct gfn_to_pfn_cache *gpc;
 
@@ -32,7 +37,7 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
                read_lock_irq(&gpc->lock);
 
                /* Only a single page so no need to care about length */
-               if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
+               if (gpc->needs_invalidation && !is_error_noslot_pfn(gpc->pfn) &&
                    gpc->uhva >= start && gpc->uhva < end) {
                        read_unlock_irq(&gpc->lock);
 
@@ -45,9 +50,11 @@ void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm, unsigned long start,
                         */
 
                        write_lock_irq(&gpc->lock);
-                       if (gpc->valid && !is_error_noslot_pfn(gpc->pfn) &&
-                           gpc->uhva >= start && gpc->uhva < end)
+                       if (gpc->needs_invalidation && !is_error_noslot_pfn(gpc->pfn) &&
+                           gpc->uhva >= start && gpc->uhva < end) {
+                               gpc->needs_invalidation = false;
                                gpc->valid = false;
+                       }
                        write_unlock_irq(&gpc->lock);
                        continue;
                }
@@ -93,6 +100,9 @@ bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len)
        if (!gpc->valid)
                return false;
 
+       /* If it's valid, it needs invalidation! */
+       WARN_ON_ONCE(!gpc->needs_invalidation);
+
        return true;
 }
 
@@ -124,32 +134,37 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
 #endif
 }
 
-static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
 {
        /*
-        * mn_active_invalidate_count acts for all intents and purposes
-        * like mmu_invalidate_in_progress here; but the latter cannot
-        * be used here because the invalidation of caches in the
-        * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
-        * is elevated.
-        *
-        * Note, it does not matter that mn_active_invalidate_count
-        * is not protected by gpc->lock.  It is guaranteed to
-        * be elevated before the mmu_notifier acquires gpc->lock, and
-        * isn't dropped until after mmu_invalidate_seq is updated.
+        * No need for locking on GPC here because these fields are protected
+        * by gpc->refresh_lock.
         */
-       if (kvm->mn_active_invalidate_count)
-               return true;
+       return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+               (gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+               (gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
+}
 
-       /*
-        * Ensure mn_active_invalidate_count is read before
-        * mmu_invalidate_seq.  This pairs with the smp_wmb() in
-        * mmu_notifier_invalidate_range_end() to guarantee either the
-        * old (non-zero) value of mn_active_invalidate_count or the
-        * new (incremented) value of mmu_invalidate_seq is observed.
-        */
-       smp_rmb();
-       return kvm->mmu_invalidate_seq != mmu_seq;
+static void gpc_wait_for_invalidations(struct gfn_to_pfn_cache *gpc)
+{
+       spin_lock(&gpc->kvm->mn_invalidate_lock);
+       if (gpc_invalidations_pending(gpc)) {
+               DEFINE_WAIT(wait);
+
+               for (;;) {
+                       prepare_to_wait(&gpc->kvm->gpc_invalidate_wq, &wait,
+                                       TASK_UNINTERRUPTIBLE);
+
+                       if (!gpc_invalidations_pending(gpc))
+                               break;
+
+                       spin_unlock(&gpc->kvm->mn_invalidate_lock);
+                       schedule();
+                       spin_lock(&gpc->kvm->mn_invalidate_lock);
+               }
+               finish_wait(&gpc->kvm->gpc_invalidate_wq, &wait);
+       }
+       spin_unlock(&gpc->kvm->mn_invalidate_lock);
 }
 
 static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
@@ -158,7 +173,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
        void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
        kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
        void *new_khva = NULL;
-       unsigned long mmu_seq;
 
        lockdep_assert_held(&gpc->refresh_lock);
 
@@ -172,8 +186,16 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
        gpc->valid = false;
 
        do {
-               mmu_seq = gpc->kvm->mmu_invalidate_seq;
-               smp_rmb();
+               /*
+                * The translation made by hva_to_pfn() below could be made
+                * invalid as soon as it's mapped. But the uhva is already
+                * known and that's all that gfn_to_pfn_cache_invalidate()
+                * looks at. So set the 'needs_invalidation' flag to allow
+                * the GPC to be marked invalid from the moment the lock is
+                * dropped, before the corresponding PFN is even found (and,
+                * more to the point, immediately afterwards).
+                */
+               gpc->needs_invalidation = true;
 
                write_unlock_irq(&gpc->lock);
 
@@ -217,6 +239,15 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
                        goto out_error;
                }
 
+               /*
+                * Avoid populating a GPC with a user address which is already
+                * being invalidated, if the invalidate_range_start() notifier
+                * has already been called. The actual invalidation happens in
+                * the invalidate_range_end() callback, so wait until there are
+                * no active invalidations (for the relevant HVA).
+                */
+               gpc_wait_for_invalidations(gpc);
+
                write_lock_irq(&gpc->lock);
 
                /*
@@ -224,7 +255,14 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
                 * attempting to refresh.
                 */
                WARN_ON_ONCE(gpc->valid);
-       } while (mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+
+               /*
+                * Since gfn_to_pfn_cache_invalidate() is called from the
+                * kvm_mmu_notifier_invalidate_range_end() callback, it can
+                * invalidate the GPC the moment after hva_to_pfn() returned
+                * a valid PFN. If that happens, retry.
+                */
+       } while (!gpc->needs_invalidation);
 
        gpc->valid = true;
        gpc->pfn = new_pfn;
@@ -339,6 +377,7 @@ static int __kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned l
         */
        if (ret) {
                gpc->valid = false;
+               gpc->needs_invalidation = false;
                gpc->pfn = KVM_PFN_ERR_FAULT;
                gpc->khva = NULL;
        }
@@ -383,7 +422,7 @@ void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm)
        gpc->pfn = KVM_PFN_ERR_FAULT;
        gpc->gpa = INVALID_GPA;
        gpc->uhva = KVM_HVA_ERR_BAD;
-       gpc->active = gpc->valid = false;
+       gpc->active = gpc->valid = gpc->needs_invalidation = false;
 }
 
 static int __kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long uhva,
@@ -453,6 +492,7 @@ void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc)
                write_lock_irq(&gpc->lock);
                gpc->active = false;
                gpc->valid = false;
+               gpc->needs_invalidation = false;
 
                /*
                 * Leave the GPA => uHVA cache intact, it's protected by the