KVM: pfncache: Implement range-based invalidation check for hva_to_pfn_retry()

author David Woodhouse <dwmw@amazon.co.uk>

Wed, 21 Aug 2024 19:08:17 +0000 (20:08 +0100)

committer David Woodhouse <dwmw@amazon.co.uk>

Wed, 21 Aug 2024 19:59:41 +0000 (20:59 +0100)
author David Woodhouse <dwmw@amazon.co.uk>
Wed, 21 Aug 2024 19:08:17 +0000 (20:08 +0100)
committer David Woodhouse <dwmw@amazon.co.uk>
Wed, 21 Aug 2024 19:59:41 +0000 (20:59 +0100)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h

index 79a6b1a63027a410559187bfc0d0722516be9316..1bfe2e8d52cdc6155b37c0c9f16b5d11dae777b2 100644 (file)
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -770,6 +770,8 @@ struct kvm {
         /* For management / invalidation of gfn_to_pfn_caches */
         spinlock_t gpc_lock;
         struct list_head gpc_list;
+       u64 mmu_gpc_invalidate_range_start;
+       u64 mmu_gpc_invalidate_range_end;
  
         /*
          * created_vcpus is protected by kvm->lock, and is incremented
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c

index 92901656a0d413651741ebd069d335f8f5f161be..84eb1ebb6f47dffc21971b2c3ad1c048b52fe266 100644 (file)
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -775,6 +775,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
          */
         spin_lock(&kvm->mn_invalidate_lock);
         kvm->mn_active_invalidate_count++;
+       if (likely(kvm->mn_active_invalidate_count == 1)) {
+               kvm->mmu_gpc_invalidate_range_start = range->start;
+               kvm->mmu_gpc_invalidate_range_end = range->end;
+       } else {
+               /*
+                * Fully tracking multiple concurrent ranges has diminishing
+                * returns. Keep things simple and just find the minimal range
+                * which includes the current and new ranges. As there won't be
+                * enough information to subtract a range after its invalidate
+                * completes, any ranges invalidated concurrently will
+                * accumulate and persist until all outstanding invalidates
+                * complete.
+                */
+               kvm->mmu_gpc_invalidate_range_start =
+                       min(kvm->mmu_gpc_invalidate_range_start, range->start);
+               kvm->mmu_gpc_invalidate_range_end =
+                       max(kvm->mmu_gpc_invalidate_range_end, range->end);
+       }
         spin_unlock(&kvm->mn_invalidate_lock);
  
         /*
@@ -1164,6 +1182,8 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
  
         INIT_LIST_HEAD(&kvm->gpc_list);
         spin_lock_init(&kvm->gpc_lock);
+       kvm->mmu_gpc_invalidate_range_start = KVM_HVA_ERR_BAD;
+       kvm->mmu_gpc_invalidate_range_end = KVM_HVA_ERR_BAD;
  
         INIT_LIST_HEAD(&kvm->devices);
         kvm->max_vcpus = KVM_MAX_VCPUS;
diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c

index 7007d32d197a1954ad787028d5d38ad20bcbd626..eeb9bf43c04a1bc050481b2f39db022deb68427b 100644 (file)
--- a/virt/kvm/pfncache.c
+++ b/virt/kvm/pfncache.c
@@ -129,32 +129,17 @@ static void gpc_unmap(kvm_pfn_t pfn, void *khva)
  #endif
  }
  
-static inline bool mmu_notifier_retry_cache(struct kvm *kvm, unsigned long mmu_seq)
+static bool gpc_invalidations_pending(struct gfn_to_pfn_cache *gpc)
  {
         /*
-        * mn_active_invalidate_count acts for all intents and purposes
-        * like mmu_invalidate_in_progress here; but the latter cannot
-        * be used here because the invalidation of caches in the
-        * mmu_notifier event occurs _before_ mmu_invalidate_in_progress
-        * is elevated.
-        *
-        * Note, it does not matter that mn_active_invalidate_count
-        * is not protected by gpc->lock.  It is guaranteed to
-        * be elevated before the mmu_notifier acquires gpc->lock, and
-        * isn't dropped until after mmu_invalidate_seq is updated.
+        * No need for locking on GPC here because these fields are protected
+        * by gpc->refresh_lock.
          */
-       if (kvm->mn_active_invalidate_count)
-               return true;
+       guard(spinlock)(&gpc->kvm->mn_invalidate_lock);
  
-       /*
-        * Ensure mn_active_invalidate_count is read before
-        * mmu_invalidate_seq.  This pairs with the smp_wmb() in
-        * mmu_notifier_invalidate_range_end() to guarantee either the
-        * old (non-zero) value of mn_active_invalidate_count or the
-        * new (incremented) value of mmu_invalidate_seq is observed.
-        */
-       smp_rmb();
-       return kvm->mmu_invalidate_seq != mmu_seq;
+       return unlikely(gpc->kvm->mn_active_invalidate_count) &&
+               (gpc->kvm->mmu_gpc_invalidate_range_start <= gpc->uhva) &&
+               (gpc->kvm->mmu_gpc_invalidate_range_end > gpc->uhva);
  }
  
  static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
@@ -163,7 +148,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
         void *old_khva = (void *)PAGE_ALIGN_DOWN((uintptr_t)gpc->khva);
         kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT;
         void *new_khva = NULL;
-       unsigned long mmu_seq;
  
         lockdep_assert_held(&gpc->refresh_lock);
  
@@ -177,9 +161,6 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
         gpc->valid = false;
  
         do {
-               mmu_seq = gpc->kvm->mmu_invalidate_seq;
-               smp_rmb();
-
                 /*
                  * The translation made by hva_to_pfn() below could be made
                  * invalid as soon as it's mapped. But the uhva is already
@@ -193,6 +174,29 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
  
                 write_unlock_irq(&gpc->lock);
  
+               /*
+                * Invalidation occurs from the invalidate_range_start() hook,
+                * which could already have happened before __kvm_gpc_refresh()
+                * (or the previous turn around this loop) took gpc->lock().
+                * If so, and if the corresponding invalidate_range_end() hook
+                * hasn't happened yet, hva_to_pfn() could return a mapping
+                * which is about to be stale and which should not be used. So
+                * check if there are any currently-running invalidations which
+                * affect the uHVA of this GPC, and retry if there are. Any
+                * invalidation which starts after gpc->needs_invalidation is
+                * set is fine, because it will clear that flag and trigger a
+                * retry. And any invalidation which *completes* by having its
+                * invalidate_range_end() hook called immediately prior to this
+                * check is also fine, because the page tables are guaranteed
+                * to have been changted already, so hva_to_pfn() won't return
+                * a stale mapping in that case anyway.
+                */
+               while (gpc_invalidations_pending(gpc)) {
+                       cond_resched();
+                       write_lock_irq(&gpc->lock);
+                       continue;
+               }
+
                 /*
                  * If the previous iteration "failed" due to an mmu_notifier
                  * event, release the pfn and unmap the kernel virtual address
@@ -233,6 +237,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
                         goto out_error;
                 }
  
+
                 write_lock_irq(&gpc->lock);
  
                 /*
@@ -240,8 +245,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc)
                  * attempting to refresh.
                  */
                 WARN_ON_ONCE(gpc->valid);
-       } while (!gpc->needs_invalidation ||
-                mmu_notifier_retry_cache(gpc->kvm, mmu_seq));
+       } while (!gpc->needs_invalidation);
  
         gpc->valid = true;
         gpc->pfn = new_pfn;
author	David Woodhouse <dwmw@amazon.co.uk>
	Wed, 21 Aug 2024 19:08:17 +0000 (20:08 +0100)
committer	David Woodhouse <dwmw@amazon.co.uk>
	Wed, 21 Aug 2024 19:59:41 +0000 (20:59 +0100)
include/linux/kvm_host.h		patch \| blob \| history
virt/kvm/kvm_main.c		patch \| blob \| history
virt/kvm/pfncache.c		patch \| blob \| history