From efaaabc6c6d3cd673ffc9b6ac1da17186a238cb6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:25 -0700 Subject: [PATCH 01/16] KVM: nVMX: Rely on kvm_vcpu_unmap() to track validity of eVMCS mapping MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Remove the explicit evmptr12 validity check when deciding whether or not to unmap the eVMCS pointer, and instead rely on kvm_vcpu_unmap() to play nice with a NULL map->hva, i.e. to do nothing if the map is invalid. Note, vmx->nested.hv_evmcs_map is zero-allocated along with the rest of vcpu_vmx, i.e. the map starts out invalid/NULL. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-24-seanjc@google.com> --- arch/x86/kvm/vmx/nested.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a8e7bc04d9bf..e94a25373a59 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -231,11 +231,8 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - if (nested_vmx_is_evmptr12_valid(vmx)) { - kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); - vmx->nested.hv_evmcs = NULL; - } - + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); + vmx->nested.hv_evmcs = NULL; vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; if (hv_vcpu) { -- 2.51.0 From 2e34f942a5f251f426f240edf68197817935cd31 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:26 -0700 Subject: [PATCH 02/16] KVM: nVMX: Drop pointless msr_bitmap_map field from struct nested_vmx MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Remove vcpu_vmx.msr_bitmap_map and instead use an on-stack structure in the one function that uses the map, nested_vmx_prepare_msr_bitmap(). Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-25-seanjc@google.com> --- arch/x86/kvm/vmx/nested.c | 8 ++++---- arch/x86/kvm/vmx/vmx.h | 2 -- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e94a25373a59..fb37658b62c9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -621,7 +621,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; - struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; + struct kvm_host_map msr_bitmap_map; /* Nothing to do if the MSR bitmap is not in use. 
*/ if (!cpu_has_vmx_msr_bitmap() || @@ -644,10 +644,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map)) + if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &msr_bitmap_map)) return false; - msr_bitmap_l1 = (unsigned long *)map->hva; + msr_bitmap_l1 = (unsigned long *)msr_bitmap_map.hva; /* * To keep the control flow simple, pay eight 8-byte writes (sixteen @@ -711,7 +711,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_FLUSH_CMD, MSR_TYPE_W); - kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); + kvm_vcpu_unmap(vcpu, &msr_bitmap_map, false); vmx->nested.force_msr_bitmap_recalc = false; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 2325f773a20b..40303b43da6c 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -200,8 +200,6 @@ struct nested_vmx { struct kvm_host_map virtual_apic_map; struct kvm_host_map pi_desc_map; - struct kvm_host_map msr_bitmap_map; - struct pi_desc *pi_desc; bool pi_pending; u16 posted_intr_nv; -- 2.51.0 From a629ef9518f5a59f79908d049144721cb2dd5b74 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:27 -0700 Subject: [PATCH 03/16] KVM: nVMX: Add helper to put (unmap) vmcs12 pages MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add a helper to dedup unmapping the vmcs12 pages. This will reduce the amount of churn when a future patch refactors the kvm_vcpu_unmap() API. No functional change intended. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-26-seanjc@google.com> --- arch/x86/kvm/vmx/nested.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index fb37658b62c9..81865db18e12 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -314,6 +314,21 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) vcpu->arch.regs_dirty = 0; } +static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * Unpin physical memory we referred to in the vmcs02. The APIC access + * page's backing page (yeah, confusing) shouldn't actually be accessed, + * and if it is written, the contents are irrelevant. + */ + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); + vmx->nested.pi_desc = NULL; +} + /* * Free whatever needs to be freed from vmx->nested when L1 goes down, or * just stops using VMX. @@ -346,15 +361,8 @@ static void free_nested(struct kvm_vcpu *vcpu) vmx->nested.cached_vmcs12 = NULL; kfree(vmx->nested.cached_shadow_vmcs12); vmx->nested.cached_shadow_vmcs12 = NULL; - /* - * Unpin physical memory we referred to in the vmcs02. The APIC access - * page's backing page (yeah, confusing) shouldn't actually be accessed, - * and if it is written, the contents are irrelevant. 
- */ - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); - vmx->nested.pi_desc = NULL; + + nested_put_vmcs12_pages(vcpu); kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); @@ -5010,11 +5018,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, vmx_update_cpu_dirty_logging(vcpu); } - /* Unpin physical memory we referred to in vmcs02 */ - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); - vmx->nested.pi_desc = NULL; + nested_put_vmcs12_pages(vcpu); if (vmx->nested.reload_vmcs01_apic_access_page) { vmx->nested.reload_vmcs01_apic_access_page = false; -- 2.51.0 From 12fac89950996e556a99eb16f0359307ed4c2566 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:28 -0700 Subject: [PATCH 04/16] KVM: Use plain "struct page" pointer instead of single-entry array MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Use a single pointer instead of a single-entry array for the struct page pointer in hva_to_pfn_fast(). Using an array makes the code unnecessarily annoying to read and update. No functional change intended. Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-27-seanjc@google.com> --- virt/kvm/kvm_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c20386a8aa3e..32f0858da7f2 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2752,7 +2752,7 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w */ static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { - struct page *page[1]; + struct page *page; /* * Fast pin a writable pfn only if it is a write fault request @@ -2762,8 +2762,8 @@ static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable)) return false; - if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, page)) { - *pfn = page_to_pfn(page[0]); + if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page)) { + *pfn = page_to_pfn(page); if (kfp->map_writable) *kfp->map_writable = true; return true; -- 2.51.0 From 3dd48ecfac7fdbcba397a6378ab93c3a9b3cb802 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:29 -0700 Subject: [PATCH 05/16] KVM: Provide refcounted page as output field in struct kvm_follow_pfn MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add kvm_follow_pfn.refcounted_page as an output for the "to pfn" APIs to "return" the struct page that is associated with the returned pfn (if KVM acquired a reference to the page). This will eventually allow removing KVM's hacky kvm_pfn_to_refcounted_page() code, which is error prone and can't detect pfns that are valid, but aren't (currently) refcounted. 
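For illustration, a minimal caller-side sketch (modeled on the pfncache conversion later in this series; slot, gfn and hva are placeholders for values the caller has already resolved):

    struct page *refcounted_page;
    struct kvm_follow_pfn kfp = {
        .slot = slot,
        .gfn = gfn,
        .flags = FOLL_WRITE,
        .hva = hva,
        .refcounted_page = &refcounted_page,
    };
    kvm_pfn_t pfn;

    pfn = hva_to_pfn(&kfp);
    if (is_error_noslot_pfn(pfn))
        return -EFAULT;

    /* ... access the pfn ... */

    /* refcounted_page is NULL if the pfn has no refcounted struct page. */
    if (refcounted_page)
        kvm_release_page_clean(refcounted_page);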
Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-28-seanjc@google.com> --- virt/kvm/kvm_main.c | 99 +++++++++++++++++++++------------------------ virt/kvm/kvm_mm.h | 9 +++++ 2 files changed, 56 insertions(+), 52 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 32f0858da7f2..b3b4b374dddc 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2746,6 +2746,46 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w return gfn_to_hva_memslot_prot(slot, gfn, writable); } +static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, + struct follow_pfnmap_args *map, bool writable) +{ + kvm_pfn_t pfn; + + WARN_ON_ONCE(!!page == !!map); + + if (kfp->map_writable) + *kfp->map_writable = writable; + + /* + * FIXME: Remove this once KVM no longer blindly calls put_page() on + * every pfn that points at a struct page. + * + * Get a reference for follow_pte() pfns if they happen to point at a + * struct page, as KVM will ultimately call kvm_release_pfn_clean() on + * the returned pfn, i.e. KVM expects to have a reference. + * + * Certain IO or PFNMAP mappings can be backed with valid struct pages, + * but be allocated without refcounting, e.g. tail pages of + * non-compound higher order allocations. Grabbing and putting a + * reference to such pages would cause KVM to prematurely free a page + * it doesn't own (KVM gets and puts the one and only reference). + * Don't allow those pages until the FIXME is resolved. + */ + if (map) { + pfn = map->pfn; + page = kvm_pfn_to_refcounted_page(pfn); + if (page && !get_page_unless_zero(page)) + return KVM_PFN_ERR_FAULT; + } else { + pfn = page_to_pfn(page); + } + + if (kfp->refcounted_page) + *kfp->refcounted_page = page; + + return pfn; +} + /* * The fast path to get the writable pfn which will be stored in @pfn, * true indicates success, otherwise false is returned. 
@@ -2763,9 +2803,7 @@ static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) return false; if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page)) { - *pfn = page_to_pfn(page); - if (kfp->map_writable) - *kfp->map_writable = true; + *pfn = kvm_resolve_pfn(kfp, page, NULL, true); return true; } @@ -2797,23 +2835,15 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) if (npages != 1) return npages; - if (!kfp->map_writable) - goto out; - - if (kfp->flags & FOLL_WRITE) { - *kfp->map_writable = true; - goto out; - } - /* map read fault as writable if possible */ - if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { - *kfp->map_writable = true; + if (!(flags & FOLL_WRITE) && kfp->map_writable && + get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { put_page(page); page = wpage; + flags |= FOLL_WRITE; } -out: - *pfn = page_to_pfn(page); + *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE); return npages; } @@ -2828,22 +2858,11 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) return true; } -static int kvm_try_get_pfn(kvm_pfn_t pfn) -{ - struct page *page = kvm_pfn_to_refcounted_page(pfn); - - if (!page) - return 1; - - return get_page_unless_zero(page); -} - static int hva_to_pfn_remapped(struct vm_area_struct *vma, struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn) { struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva }; bool write_fault = kfp->flags & FOLL_WRITE; - kvm_pfn_t pfn; int r; r = follow_pfnmap_start(&args); @@ -2867,37 +2886,13 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, } if (write_fault && !args.writable) { - pfn = KVM_PFN_ERR_RO_FAULT; + *p_pfn = KVM_PFN_ERR_RO_FAULT; goto out; } - if (kfp->map_writable) - *kfp->map_writable = args.writable; - pfn = args.pfn; - - /* - * Get a reference here because callers of *hva_to_pfn* and - * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the - * returned pfn. This is only needed if the VMA has VM_MIXEDMAP - * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will - * simply do nothing for reserved pfns. - * - * Whoever called remap_pfn_range is also going to call e.g. - * unmap_mapping_range before the underlying pages are freed, - * causing a call to our MMU notifier. - * - * Certain IO or PFNMAP mappings can be backed with valid - * struct pages, but be allocated without refcounting e.g., - * tail pages of non-compound higher order allocations, which - * would then underflow the refcount when the caller does the - * required put_page. Don't allow those pages here. - */ - if (!kvm_try_get_pfn(pfn)) - r = -EFAULT; + *p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable); out: follow_pfnmap_end(&args); - *p_pfn = pfn; - return r; } diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index d5a215958f06..d3ac1ba8ba66 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -35,6 +35,15 @@ struct kvm_follow_pfn { * Set to true if a writable mapping was obtained. */ bool *map_writable; + + /* + * Optional output. Set to a valid "struct page" if the returned pfn + * is for a refcounted or pinned struct page, NULL if the returned pfn + * has no struct page or if the struct page is not being refcounted + * (e.g. tail pages of non-compound higher order allocations from + * IO/PFNMAP mappings). 
+ */ + struct page **refcounted_page; }; kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp); -- 2.51.0 From 775e3ff7bf4919285da0456d0ebbfa2874ee7572 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:30 -0700 Subject: [PATCH 06/16] KVM: Move kvm_{set,release}_page_{clean,dirty}() helpers up in kvm_main.c MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Hoist the kvm_{set,release}_page_{clean,dirty}() APIs further up in kvm_main.c so that they can be used by the kvm_follow_pfn family of APIs. No functional change intended. Reviewed-by: Alex Bennée Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-29-seanjc@google.com> --- virt/kvm/kvm_main.c | 82 ++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b3b4b374dddc..64888257e301 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2746,6 +2746,47 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w return gfn_to_hva_memslot_prot(slot, gfn, writable); } +static bool kvm_is_ad_tracked_page(struct page *page) +{ + /* + * Per page-flags.h, pages tagged PG_reserved "should in general not be + * touched (e.g. set dirty) except by its owner". + */ + return !PageReserved(page); +} + +static void kvm_set_page_dirty(struct page *page) +{ + if (kvm_is_ad_tracked_page(page)) + SetPageDirty(page); +} + +static void kvm_set_page_accessed(struct page *page) +{ + if (kvm_is_ad_tracked_page(page)) + mark_page_accessed(page); +} + +void kvm_release_page_clean(struct page *page) +{ + if (!page) + return; + + kvm_set_page_accessed(page); + put_page(page); +} +EXPORT_SYMBOL_GPL(kvm_release_page_clean); + +void kvm_release_page_dirty(struct page *page) +{ + if (!page) + return; + + kvm_set_page_dirty(page); + kvm_release_page_clean(page); +} +EXPORT_SYMBOL_GPL(kvm_release_page_dirty); + static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, struct follow_pfnmap_args *map, bool writable) { @@ -3099,37 +3140,6 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); -static bool kvm_is_ad_tracked_page(struct page *page) -{ - /* - * Per page-flags.h, pages tagged PG_reserved "should in general not be - * touched (e.g. set dirty) except by its owner". 
- */ - return !PageReserved(page); -} - -static void kvm_set_page_dirty(struct page *page) -{ - if (kvm_is_ad_tracked_page(page)) - SetPageDirty(page); -} - -static void kvm_set_page_accessed(struct page *page) -{ - if (kvm_is_ad_tracked_page(page)) - mark_page_accessed(page); -} - -void kvm_release_page_clean(struct page *page) -{ - if (!page) - return; - - kvm_set_page_accessed(page); - put_page(page); -} -EXPORT_SYMBOL_GPL(kvm_release_page_clean); - void kvm_release_pfn_clean(kvm_pfn_t pfn) { struct page *page; @@ -3145,16 +3155,6 @@ void kvm_release_pfn_clean(kvm_pfn_t pfn) } EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); -void kvm_release_page_dirty(struct page *page) -{ - if (!page) - return; - - kvm_set_page_dirty(page); - kvm_release_page_clean(page); -} -EXPORT_SYMBOL_GPL(kvm_release_page_dirty); - void kvm_release_pfn_dirty(kvm_pfn_t pfn) { struct page *page; -- 2.51.0 From 3154ddcb6a9016f314a2f5f1293808ad07c5fab9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:31 -0700 Subject: [PATCH 07/16] KVM: pfncache: Precisely track refcounted pages MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Track refcounted struct page memory using kvm_follow_pfn.refcounted_page instead of relying on kvm_release_pfn_clean() to correctly detect that the pfn is associated with a struct page. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-30-seanjc@google.com> --- virt/kvm/pfncache.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/virt/kvm/pfncache.c b/virt/kvm/pfncache.c index 067daf9ad6ef..728d2c1b488a 100644 --- a/virt/kvm/pfncache.c +++ b/virt/kvm/pfncache.c @@ -159,11 +159,14 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) kvm_pfn_t new_pfn = KVM_PFN_ERR_FAULT; void *new_khva = NULL; unsigned long mmu_seq; + struct page *page; + struct kvm_follow_pfn kfp = { .slot = gpc->memslot, .gfn = gpa_to_gfn(gpc->gpa), .flags = FOLL_WRITE, .hva = gpc->uhva, + .refcounted_page = &page, }; lockdep_assert_held(&gpc->refresh_lock); @@ -198,7 +201,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) if (new_khva != old_khva) gpc_unmap(new_pfn, new_khva); - kvm_release_pfn_clean(new_pfn); + kvm_release_page_unused(page); cond_resched(); } @@ -218,7 +221,7 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) new_khva = gpc_map(new_pfn); if (!new_khva) { - kvm_release_pfn_clean(new_pfn); + kvm_release_page_unused(page); goto out_error; } @@ -236,11 +239,11 @@ static kvm_pfn_t hva_to_pfn_retry(struct gfn_to_pfn_cache *gpc) gpc->khva = new_khva + offset_in_page(gpc->uhva); /* - * Put the reference to the _new_ pfn. The pfn is now tracked by the + * Put the reference to the _new_ page. The page is now tracked by the * cache and can be safely migrated, swapped, etc... as the cache will * invalidate any mappings in response to relevant mmu_notifier events. */ - kvm_release_pfn_clean(new_pfn); + kvm_release_page_clean(page); return 0; -- 2.51.0 From 2ff072ba7ad2deb1c3b2d231faa0ac3a25e2451b Mon Sep 17 00:00:00 2001 From: David Stevens Date: Thu, 10 Oct 2024 11:23:32 -0700 Subject: [PATCH 08/16] KVM: Migrate kvm_vcpu_map() to kvm_follow_pfn() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Migrate kvm_vcpu_map() to kvm_follow_pfn(), and have it track whether or not the map holds a refcounted struct page. 
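Concretely, the unmap side (see the kvm_vcpu_unmap() hunk below) releases the page if and only if a refcounted page was grabbed at map time:

    if (map->refcounted_page) {
        if (dirty)
            kvm_release_page_dirty(map->refcounted_page);
        else
            kvm_release_page_clean(map->refcounted_page);
    }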
Precisely tracking struct page references will eventually allow removing kvm_pfn_to_refcounted_page() and its various wrappers. Signed-off-by: David Stevens [sean: use a pointer instead of a boolean] Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-31-seanjc@google.com> --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 26 ++++++++++++++++---------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cd6f5cc1930f..35e1beb017dd 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -280,6 +280,7 @@ struct kvm_host_map { * can be used as guest memory but they are not managed by host * kernel). */ + struct page *refcounted_page; struct page *page; void *hva; kvm_pfn_t pfn; @@ -1238,7 +1239,6 @@ void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 64888257e301..842a5d5f3120 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3087,21 +3087,21 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_page); -void kvm_release_pfn(kvm_pfn_t pfn, bool dirty) -{ - if (dirty) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); -} - int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) { + struct kvm_follow_pfn kfp = { + .slot = gfn_to_memslot(vcpu->kvm, gfn), + .gfn = gfn, + .flags = FOLL_WRITE, + .refcounted_page = &map->refcounted_page, + }; + + map->refcounted_page = NULL; map->page = NULL; map->hva = NULL; map->gfn = gfn; - map->pfn = gfn_to_pfn(vcpu->kvm, gfn); + map->pfn = kvm_follow_pfn(&kfp); if (is_error_noslot_pfn(map->pfn)) return -EINVAL; @@ -3133,10 +3133,16 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) if (dirty) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); - kvm_release_pfn(map->pfn, dirty); + if (map->refcounted_page) { + if (dirty) + kvm_release_page_dirty(map->refcounted_page); + else + kvm_release_page_clean(map->refcounted_page); + } map->hva = NULL; map->page = NULL; + map->refcounted_page = NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); -- 2.51.0 From 2bcb52a3602bf4cbc55d8fb4da00c930f83d7789 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:33 -0700 Subject: [PATCH 09/16] KVM: Pin (as in FOLL_PIN) pages during kvm_vcpu_map() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Pin, as in FOLL_PIN, pages when mapping them for direct access by KVM. As per Documentation/core-api/pin_user_pages.rst, writing to a page that was gotten via FOLL_GET is explicitly disallowed. Correct (uses FOLL_PIN calls): pin_user_pages() write to the data within the pages unpin_user_pages() INCORRECT (uses FOLL_GET calls): get_user_pages() write to the data within the pages put_page() Unfortunately, FOLL_PIN is a "private" flag, and so kvm_follow_pfn must use a one-off bool instead of being able to piggyback the "flags" field. 
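In kvm_follow_pfn terms, the conversion amounts to setting the new flag when building the request (as in the kvm_vcpu_map() hunk below), with the release side switching from put_page() to unpin_user_page():

    struct kvm_follow_pfn kfp = {
        .slot = gfn_to_memslot(vcpu->kvm, gfn),
        .gfn = gfn,
        .flags = FOLL_WRITE,
        .refcounted_page = &map->pinned_page,
        .pin = true,
    };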
Link: https://lwn.net/Articles/930667 Link: https://lore.kernel.org/all/cover.1683044162.git.lstoakes@gmail.com Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-32-seanjc@google.com> --- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 54 +++++++++++++++++++++++++++++----------- virt/kvm/kvm_mm.h | 7 ++++++ 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 35e1beb017dd..b4c541fa5a1f 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -280,7 +280,7 @@ struct kvm_host_map { * can be used as guest memory but they are not managed by host * kernel). */ - struct page *refcounted_page; + struct page *pinned_page; struct page *page; void *hva; kvm_pfn_t pfn; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 842a5d5f3120..0d59f47f099e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2814,9 +2814,12 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, */ if (map) { pfn = map->pfn; - page = kvm_pfn_to_refcounted_page(pfn); - if (page && !get_page_unless_zero(page)) - return KVM_PFN_ERR_FAULT; + + if (!kfp->pin) { + page = kvm_pfn_to_refcounted_page(pfn); + if (page && !get_page_unless_zero(page)) + return KVM_PFN_ERR_FAULT; + } } else { pfn = page_to_pfn(page); } @@ -2834,16 +2837,24 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) { struct page *page; + bool r; /* - * Fast pin a writable pfn only if it is a write fault request - * or the caller allows to map a writable pfn for a read fault - * request. + * Try the fast-only path when the caller wants to pin/get the page for + * writing. If the caller only wants to read the page, KVM must go + * down the full, slow path in order to avoid racing an operation that + * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing + * at the old, read-only page while mm/ points at a new, writable page. */ if (!((kfp->flags & FOLL_WRITE) || kfp->map_writable)) return false; - if (get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page)) { + if (kfp->pin) + r = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1; + else + r = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page); + + if (r) { *pfn = kvm_resolve_pfn(kfp, page, NULL, true); return true; } @@ -2872,10 +2883,21 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) struct page *page, *wpage; int npages; - npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); + if (kfp->pin) + npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags); + else + npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags); if (npages != 1) return npages; + /* + * Pinning is mutually exclusive with opportunistically mapping a read + * fault as writable, as KVM should never pin pages when mapping memory + * into the guest (pinning is only for direct accesses from KVM). 
+ */ + if (WARN_ON_ONCE(kfp->map_writable && kfp->pin)) + goto out; + /* map read fault as writable if possible */ if (!(flags & FOLL_WRITE) && kfp->map_writable && get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) { @@ -2884,6 +2906,7 @@ static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn) flags |= FOLL_WRITE; } +out: *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE); return npages; } @@ -3093,10 +3116,11 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) .slot = gfn_to_memslot(vcpu->kvm, gfn), .gfn = gfn, .flags = FOLL_WRITE, - .refcounted_page = &map->refcounted_page, + .refcounted_page = &map->pinned_page, + .pin = true, }; - map->refcounted_page = NULL; + map->pinned_page = NULL; map->page = NULL; map->hva = NULL; map->gfn = gfn; @@ -3133,16 +3157,16 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) if (dirty) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); - if (map->refcounted_page) { + if (map->pinned_page) { if (dirty) - kvm_release_page_dirty(map->refcounted_page); - else - kvm_release_page_clean(map->refcounted_page); + kvm_set_page_dirty(map->pinned_page); + kvm_set_page_accessed(map->pinned_page); + unpin_user_page(map->pinned_page); } map->hva = NULL; map->page = NULL; - map->refcounted_page = NULL; + map->pinned_page = NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h index d3ac1ba8ba66..acef3f5c582a 100644 --- a/virt/kvm/kvm_mm.h +++ b/virt/kvm/kvm_mm.h @@ -30,6 +30,13 @@ struct kvm_follow_pfn { /* FOLL_* flags modifying lookup behavior, e.g. FOLL_WRITE. */ unsigned int flags; + /* + * Pin the page (effectively FOLL_PIN, which is an mm/ internal flag). + * The page *must* be pinned if KVM will write to the page via a kernel + * mapping, e.g. via kmap(), mremap(), etc. + */ + bool pin; + /* * If non-NULL, try to get a writable mapping even for a read fault. * Set to true if a writable mapping was obtained. -- 2.51.0 From 7afe79f5734aebb55895424074bb944ac4717b7d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:34 -0700 Subject: [PATCH 10/16] KVM: nVMX: Mark vmcs12's APIC access page dirty when unmapping MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Mark the APIC access page as dirty when unmapping it from KVM. The fact that the page _shouldn't_ be written doesn't guarantee the page _won't_ be written. And while the contents are likely irrelevant, the values _are_ visible to the guest, i.e. dropping writes would be visible to the guest (though obviously highly unlikely to be problematic in practice). Marking the map dirty will allow specifying the write vs. read-only when *mapping* the memory, which in turn will allow creating read-only maps. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-33-seanjc@google.com> --- arch/x86/kvm/vmx/nested.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 81865db18e12..ff83b56fe2fa 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -318,12 +318,7 @@ static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * Unpin physical memory we referred to in the vmcs02. 
The APIC access - * page's backing page (yeah, confusing) shouldn't actually be accessed, - * and if it is written, the contents are irrelevant. - */ - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, true); kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); vmx->nested.pi_desc = NULL; -- 2.51.0 From 365e319208442a0807a96e9ea4d0b1fa338f1929 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:35 -0700 Subject: [PATCH 11/16] KVM: Pass in write/dirty to kvm_vcpu_map(), not kvm_vcpu_unmap() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Now that all kvm_vcpu_{,un}map() users pass "true" for @dirty, have them pass "true" as a @writable param to kvm_vcpu_map(), and thus create a read-only mapping when possible. Note, creating read-only mappings can be theoretically slower, as they don't play nice with fast GUP due to the need to break CoW before mapping the underlying PFN. But practically speaking, creating a mapping isn't a super hot path, and getting a writable mapping for reading is weird and confusing. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-34-seanjc@google.com> --- arch/x86/kvm/svm/nested.c | 4 ++-- arch/x86/kvm/svm/sev.c | 2 +- arch/x86/kvm/svm/svm.c | 8 ++++---- arch/x86/kvm/vmx/nested.c | 16 ++++++++-------- include/linux/kvm_host.h | 20 ++++++++++++++++++-- virt/kvm/kvm_main.c | 12 +++++++----- 6 files changed, 40 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index cf84103ce38b..b708bdf7eaff 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -926,7 +926,7 @@ out_exit_err: nested_svm_vmexit(svm); out: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -1130,7 +1130,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) vmcb12->control.exit_int_info_err, KVM_ISA_SVM); - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); nested_svm_transition_tlb_flush(vcpu); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 0b851ef937f2..4557ff3804ae 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3468,7 +3468,7 @@ void sev_es_unmap_ghcb(struct vcpu_svm *svm) sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true); + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map); svm->sev_es.ghcb = NULL; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9df3e1e5ae81..c1e29307826b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2299,7 +2299,7 @@ static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) svm_copy_vmloadsave_state(vmcb12, svm->vmcb); } - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return ret; } @@ -4714,7 +4714,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) svm_copy_vmrun_state(map_save.hva + 0x400, &svm->vmcb01.ptr->save); - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); return 0; } @@ -4774,9 +4774,9 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) svm->nested.nested_run_pending = 1; unmap_save: - kvm_vcpu_unmap(vcpu, &map_save, true); + kvm_vcpu_unmap(vcpu, &map_save); unmap_map: - kvm_vcpu_unmap(vcpu, &map, true); + kvm_vcpu_unmap(vcpu, &map); return 
ret; } diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ff83b56fe2fa..259fe445e695 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -231,7 +231,7 @@ static inline void nested_release_evmcs(struct kvm_vcpu *vcpu) struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); - kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map); vmx->nested.hv_evmcs = NULL; vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID; @@ -318,9 +318,9 @@ static void nested_put_vmcs12_pages(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); - kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); + kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map); + kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map); vmx->nested.pi_desc = NULL; } @@ -624,7 +624,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, int msr; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; - struct kvm_host_map msr_bitmap_map; + struct kvm_host_map map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || @@ -647,10 +647,10 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, return true; } - if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &msr_bitmap_map)) + if (kvm_vcpu_map_readonly(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), &map)) return false; - msr_bitmap_l1 = (unsigned long *)msr_bitmap_map.hva; + msr_bitmap_l1 = (unsigned long *)map.hva; /* * To keep the control flow simple, pay eight 8-byte writes (sixteen @@ -714,7 +714,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, MSR_IA32_FLUSH_CMD, MSR_TYPE_W); - kvm_vcpu_unmap(vcpu, &msr_bitmap_map, false); + kvm_vcpu_unmap(vcpu, &map); vmx->nested.force_msr_bitmap_recalc = false; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b4c541fa5a1f..101dbf2be1ce 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -285,6 +285,7 @@ struct kvm_host_map { void *hva; kvm_pfn_t pfn; kvm_pfn_t gfn; + bool writable; }; /* @@ -1311,8 +1312,23 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); -int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map); -void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty); + +int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map, + bool writable); +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map); + +static inline int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, + struct kvm_host_map *map) +{ + return __kvm_vcpu_map(vcpu, gpa, map, true); +} + +static inline int kvm_vcpu_map_readonly(struct kvm_vcpu *vcpu, gpa_t gpa, + struct kvm_host_map *map) +{ + return __kvm_vcpu_map(vcpu, gpa, map, false); +} + unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset, diff --git a/virt/kvm/kvm_main.c 
b/virt/kvm/kvm_main.c index 0d59f47f099e..baa741c2b81c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3110,7 +3110,8 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(gfn_to_page); -int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) +int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, + bool writable) { struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(vcpu->kvm, gfn), @@ -3124,6 +3125,7 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) map->page = NULL; map->hva = NULL; map->gfn = gfn; + map->writable = writable; map->pfn = kvm_follow_pfn(&kfp); if (is_error_noslot_pfn(map->pfn)) @@ -3140,9 +3142,9 @@ int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) return map->hva ? 0 : -EFAULT; } -EXPORT_SYMBOL_GPL(kvm_vcpu_map); +EXPORT_SYMBOL_GPL(__kvm_vcpu_map); -void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) +void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map) { if (!map->hva) return; @@ -3154,11 +3156,11 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) memunmap(map->hva); #endif - if (dirty) + if (map->writable) kvm_vcpu_mark_page_dirty(vcpu, map->gfn); if (map->pinned_page) { - if (dirty) + if (map->writable) kvm_set_page_dirty(map->pinned_page); kvm_set_page_accessed(map->pinned_page); unpin_user_page(map->pinned_page); -- 2.51.0 From 2e5fdf60a9a69971b5e642d32e09bd6b0223f47c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:36 -0700 Subject: [PATCH 12/16] KVM: Get writable mapping for __kvm_vcpu_map() only when necessary MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When creating a memory map for read, don't request a writable pfn from the primary MMU. While creating read-only mappings can be theoretically slower, as they don't play nice with fast GUP due to the need to break CoW before mapping the underlying PFN, practically speaking, creating a mapping isn't a super hot path, and getting a writable mapping for reading is weird and confusing. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-35-seanjc@google.com> --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index baa741c2b81c..49ac13872e09 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3116,7 +3116,7 @@ int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, struct kvm_follow_pfn kfp = { .slot = gfn_to_memslot(vcpu->kvm, gfn), .gfn = gfn, - .flags = FOLL_WRITE, + .flags = writable ? FOLL_WRITE : 0, .refcounted_page = &map->pinned_page, .pin = true, }; -- 2.51.0 From 68e51d0a437b09dcef621b4cc8e4fe02605bf5c4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:37 -0700 Subject: [PATCH 13/16] KVM: Disallow direct access (w/o mmu_notifier) to unpinned pfn by default MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add an off-by-default module param to control whether or not KVM is allowed to map memory that isn't pinned, i.e. that KVM can't guarantee won't be freed while it is mapped into KVM and/or the guest. Don't remove the functionality entirely, as there are use cases where mapping unpinned memory is safe (as defined by the platform owner), e.g. 
when memory is hidden from the kernel and managed by userspace, in which case userspace is already fully trusted to not muck with guest memory mappings. But for more typical setups, mapping unpinned memory is wildly unsafe, and unnecessary. The APIs are used exclusively by x86's nested virtualization support, and there is no known (or sane) use case for mapping PFN-mapped memory into a KVM guest _and_ letting the guest use it for virtualization structures. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-36-seanjc@google.com> --- virt/kvm/kvm_main.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 49ac13872e09..becf640e369c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -94,6 +94,13 @@ unsigned int halt_poll_ns_shrink = 2; module_param(halt_poll_ns_shrink, uint, 0644); EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); +/* + * Allow direct access (from KVM or the CPU) without MMU notifier protection + * to unpinned pages. + */ +static bool allow_unsafe_mappings; +module_param(allow_unsafe_mappings, bool, 0444); + /* * Ordering of locks: * @@ -2811,6 +2818,9 @@ static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page, * reference to such pages would cause KVM to prematurely free a page * it doesn't own (KVM gets and puts the one and only reference). * Don't allow those pages until the FIXME is resolved. + * + * Don't grab a reference for pins, callers that pin pages are required + * to check refcounted_page, i.e. must not blindly release the pfn. */ if (map) { pfn = map->pfn; @@ -2929,6 +2939,14 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, bool write_fault = kfp->flags & FOLL_WRITE; int r; + /* + * Remapped memory cannot be pinned in any meaningful sense. Bail if + * the caller wants to pin the page, i.e. access the page outside of + * MMU notifier protection, and unsafe mappings are disallowed. + */ + if (kfp->pin && !allow_unsafe_mappings) + return -EINVAL; + r = follow_pfnmap_start(&args); if (r) { /* -- 2.51.0 From fcd366b95e6e356ae5069753938988b6c6286fa8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:38 -0700 Subject: [PATCH 14/16] KVM: x86: Don't fault-in APIC access page during initial allocation Drop the gfn_to_page() lookup when installing KVM's internal memslot for the APIC access page, as KVM doesn't need to immediately fault-in the page now that the page isn't pinned. In the extremely unlikely event the kernel can't allocate a 4KiB page, KVM can just as easily return -EFAULT on the future page fault. 
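With the lookup gone, the helper reduces to roughly the following (a sketch pieced together from the context lines below; the __x86_set_memory_region() call and the inhibit check are assumed to match the existing code, which the diff elides):

    int kvm_alloc_apic_access_page(struct kvm *kvm)
    {
        void __user *hva;
        int ret = 0;

        mutex_lock(&kvm->slots_lock);
        if (kvm->arch.apic_access_memslot_enabled ||
            kvm->arch.apic_access_memslot_inhibited)
            goto out;

        hva = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
                                      APIC_DEFAULT_PHYS_BASE, PAGE_SIZE);
        if (IS_ERR(hva)) {
            ret = PTR_ERR(hva);
            goto out;
        }

        kvm->arch.apic_access_memslot_enabled = true;
    out:
        mutex_unlock(&kvm->slots_lock);
        return ret;
    }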
Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-37-seanjc@google.com> --- arch/x86/kvm/lapic.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 20526e4d6c62..65412640cfc7 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2647,7 +2647,6 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) int kvm_alloc_apic_access_page(struct kvm *kvm) { - struct page *page; void __user *hva; int ret = 0; @@ -2663,17 +2662,6 @@ int kvm_alloc_apic_access_page(struct kvm *kvm) goto out; } - page = gfn_to_page(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (!page) { - ret = -EFAULT; - goto out; - } - - /* - * Do not pin the page in memory, so that memory hot-unplug - * is able to migrate it. - */ - put_page(page); kvm->arch.apic_access_memslot_enabled = true; out: mutex_unlock(&kvm->slots_lock); -- 2.51.0 From 447c375c9104b5c2881a5806f26f967492cab867 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:39 -0700 Subject: [PATCH 15/16] KVM: x86/mmu: Add "mmu" prefix fault-in helpers to free up generic names MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Prefix x86's faultin_pfn helpers with "mmu" so that the mmu-less names can be used by common KVM for similar APIs. No functional change intended. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-38-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 19 ++++++++++--------- arch/x86/kvm/mmu/mmu_internal.h | 2 +- arch/x86/kvm/mmu/paging_tmpl.h | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 688202ac50c2..7dcd34628f49 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4354,8 +4354,8 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, return max_level; } -static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, - struct kvm_page_fault *fault) +static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { int max_order, r; @@ -4378,10 +4378,11 @@ static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu, return RET_PF_CONTINUE; } -static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) +static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault) { if (fault->is_private) - return kvm_faultin_pfn_private(vcpu, fault); + return kvm_mmu_faultin_pfn_private(vcpu, fault); fault->pfn = __gfn_to_pfn_memslot(fault->slot, fault->gfn, false, true, fault->write, &fault->map_writable); @@ -4416,8 +4417,8 @@ static int __kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault return RET_PF_CONTINUE; } -static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, - unsigned int access) +static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu, + struct kvm_page_fault *fault, unsigned int access) { struct kvm_memory_slot *slot = fault->slot; int ret; @@ -4500,7 +4501,7 @@ static int kvm_faultin_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, if (mmu_invalidate_retry_gfn_unsafe(vcpu->kvm, fault->mmu_seq, fault->gfn)) return RET_PF_RETRY; - ret = __kvm_faultin_pfn(vcpu, fault); + ret = __kvm_mmu_faultin_pfn(vcpu, fault); if (ret != RET_PF_CONTINUE) return ret; @@ -4577,7 +4578,7 @@ static int 
direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); + r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; @@ -4668,7 +4669,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu, if (r) return r; - r = kvm_faultin_pfn(vcpu, fault, ACC_ALL); + r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL); if (r != RET_PF_CONTINUE) return r; diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 633aedec3c2e..59e600f6ff9d 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -235,7 +235,7 @@ struct kvm_page_fault { /* The memslot containing gfn. May be NULL. */ struct kvm_memory_slot *slot; - /* Outputs of kvm_faultin_pfn. */ + /* Outputs of kvm_mmu_faultin_pfn(). */ unsigned long mmu_seq; kvm_pfn_t pfn; bool map_writable; diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 143b7e9f26dc..9bd3d6f5db91 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -812,7 +812,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r) return r; - r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); + r = kvm_mmu_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; -- 2.51.0 From 64d5cd99f78ec39edbc691bb332f34e6c22c32c9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:40 -0700 Subject: [PATCH 16/16] KVM: x86/mmu: Put direct prefetched pages via kvm_release_page_clean() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Use kvm_release_page_clean() to put prefetched pages instead of calling put_page() directly. This will allow de-duplicating the prefetch code between indirect and direct MMUs. Note, there's a small functional change as kvm_release_page_clean() marks the page/folio as accessed. While it's not strictly guaranteed that the guest will access the page, KVM won't intercept guest accesses, i.e. won't mark the page accessed if it _is_ accessed by the guest (unless A/D bits are disabled, but running without A/D bits is effectively limited to pre-HSW Intel CPUs). Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-39-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7dcd34628f49..5e0c91f017f7 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2972,7 +2972,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, for (i = 0; i < ret; i++, gfn++, start++) { mmu_set_spte(vcpu, slot, start, access, gfn, page_to_pfn(pages[i]), NULL); - put_page(pages[i]); + kvm_release_page_clean(pages[i]); } return 0; -- 2.51.0
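Postscript on the end state of the series: after patches 11 and 12, a typical writable mapping sequence looks like the sketch below (error handling trimmed), with dirtying keyed off how the map was created rather than off a flag passed at unmap time; read-only users call kvm_vcpu_map_readonly() instead and skip the dirty tracking.

    struct kvm_host_map map;

    if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
        return -EFAULT;

    /* Read/write via map.hva; the backing page is pinned (FOLL_PIN). */

    kvm_vcpu_unmap(vcpu, &map);    /* marks the gfn dirty, as the map is writable */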