From 4af18dc6a9204464db76d9771d1f40e2b46bf6ae Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:47 -0700 Subject: [PATCH 01/16] KVM: guest_memfd: Pass index, not gfn, to __kvm_gmem_get_pfn() Refactor guest_memfd usage of __kvm_gmem_get_pfn() to pass the index into the guest_memfd file instead of the gfn, i.e. resolve the index based on the slot+gfn in the caller instead of in __kvm_gmem_get_pfn(). This will allow kvm_gmem_get_pfn() to retrieve and return the specific "struct page", which requires the index into the folio, without redoing the index calculation multiple times (which isn't costly, just hard to follow). Opportunistically add a kvm_gmem_get_index() helper to make the copy+pasted code easier to understand. Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-46-seanjc@google.com> --- virt/kvm/guest_memfd.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 8f079a61a56d..8a878e57c5d4 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -302,6 +302,11 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot) return get_file_active(&slot->gmem.file); } +static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn) +{ + return gfn - slot->base_gfn + slot->gmem.pgoff; +} + static struct file_operations kvm_gmem_fops = { .open = generic_file_open, .release = kvm_gmem_release, @@ -551,12 +556,11 @@ void kvm_gmem_unbind(struct kvm_memory_slot *slot) } /* Returns a locked folio on success. */ -static struct folio * -__kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, bool *is_prepared, - int *max_order) +static struct folio *__kvm_gmem_get_pfn(struct file *file, + struct kvm_memory_slot *slot, + pgoff_t index, kvm_pfn_t *pfn, + bool *is_prepared, int *max_order) { - pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff; struct kvm_gmem *gmem = file->private_data; struct folio *folio; @@ -592,6 +596,7 @@ __kvm_gmem_get_pfn(struct file *file, struct kvm_memory_slot *slot, int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, kvm_pfn_t *pfn, int *max_order) { + pgoff_t index = kvm_gmem_get_index(slot, gfn); struct file *file = kvm_gmem_get_file(slot); struct folio *folio; bool is_prepared = false; @@ -600,7 +605,7 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, if (!file) return -EFAULT; - folio = __kvm_gmem_get_pfn(file, slot, gfn, pfn, &is_prepared, max_order); + folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order); if (IS_ERR(folio)) { r = PTR_ERR(folio); goto out; @@ -648,6 +653,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long for (i = 0; i < npages; i += (1 << max_order)) { struct folio *folio; gfn_t gfn = start_gfn + i; + pgoff_t index = kvm_gmem_get_index(slot, gfn); bool is_prepared = false; kvm_pfn_t pfn; @@ -656,7 +662,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long break; } - folio = __kvm_gmem_get_pfn(file, slot, gfn, &pfn, &is_prepared, &max_order); + folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &is_prepared, &max_order); if (IS_ERR(folio)) { ret = PTR_ERR(folio); break; -- 2.51.0 From 1fbee5b01a0fd27db571eed757682a7c20045107 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:48 -0700 Subject: [PATCH 02/16] KVM:
guest_memfd: Provide "struct page" as output from kvm_gmem_get_pfn() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Provide the "struct page" associated with a guest_memfd pfn as an output from __kvm_gmem_get_pfn() so that KVM guest page fault handlers can directly put the page instead of having to rely on kvm_pfn_to_refcounted_page(). Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-47-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/svm/sev.c | 10 ++++++---- include/linux/kvm_host.h | 6 ++++-- virt/kvm/guest_memfd.c | 8 ++++++-- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2bea2d20c571..c657c3c449c8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4407,7 +4407,7 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, } r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn, - &max_order); + &fault->refcounted_page, &max_order); if (r) { kvm_mmu_prepare_memory_fault_exit(vcpu, fault); return r; diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 4557ff3804ae..c6c852485900 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -3849,6 +3849,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) if (VALID_PAGE(svm->sev_es.snp_vmsa_gpa)) { gfn_t gfn = gpa_to_gfn(svm->sev_es.snp_vmsa_gpa); struct kvm_memory_slot *slot; + struct page *page; kvm_pfn_t pfn; slot = gfn_to_memslot(vcpu->kvm, gfn); @@ -3859,7 +3860,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) * The new VMSA will be private memory guest memory, so * retrieve the PFN from the gmem backend. */ - if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, NULL)) + if (kvm_gmem_get_pfn(vcpu->kvm, slot, gfn, &pfn, &page, NULL)) return -EINVAL; /* @@ -3888,7 +3889,7 @@ static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu) * changes then care should be taken to ensure * svm->sev_es.vmsa is pinned through some other means. 
*/ - kvm_release_pfn_clean(pfn); + kvm_release_page_clean(page); } /* @@ -4688,6 +4689,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) struct kvm_memory_slot *slot; struct kvm *kvm = vcpu->kvm; int order, rmp_level, ret; + struct page *page; bool assigned; kvm_pfn_t pfn; gfn_t gfn; @@ -4714,7 +4716,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) return; } - ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &order); + ret = kvm_gmem_get_pfn(kvm, slot, gfn, &pfn, &page, &order); if (ret) { pr_warn_ratelimited("SEV: Unexpected RMP fault, no backing page for private GPA 0x%llx\n", gpa); @@ -4772,7 +4774,7 @@ void sev_handle_rmp_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code) out: trace_kvm_rmp_fault(vcpu, gpa, pfn, error_code, rmp_level, ret); out_no_trace: - put_page(pfn_to_page(pfn)); + kvm_release_page_unused(page); } static bool is_pfn_range_shared(kvm_pfn_t start, kvm_pfn_t end) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index a63b0325d3e2..6efdc00b4254 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2487,11 +2487,13 @@ static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn) #ifdef CONFIG_KVM_PRIVATE_MEM int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order); + gfn_t gfn, kvm_pfn_t *pfn, struct page **page, + int *max_order); #else static inline int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, - kvm_pfn_t *pfn, int *max_order) + kvm_pfn_t *pfn, struct page **page, + int *max_order) { KVM_BUG_ON(1, kvm); return -EIO; diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c index 8a878e57c5d4..47a9f68f7b24 100644 --- a/virt/kvm/guest_memfd.c +++ b/virt/kvm/guest_memfd.c @@ -594,7 +594,8 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file, } int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, - gfn_t gfn, kvm_pfn_t *pfn, int *max_order) + gfn_t gfn, kvm_pfn_t *pfn, struct page **page, + int *max_order) { pgoff_t index = kvm_gmem_get_index(slot, gfn); struct file *file = kvm_gmem_get_file(slot); @@ -615,7 +616,10 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot, r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio); folio_unlock(folio); - if (r < 0) + + if (!r) + *page = folio_file_page(folio, index); + else folio_put(folio); out: -- 2.51.0 From 8dd861cc07e238c1474bb0903caf8cd3b5b5e2b4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:49 -0700 Subject: [PATCH 03/16] KVM: x86/mmu: Put refcounted pages instead of blindly releasing pfns MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Now that all x86 page fault paths precisely track refcounted pages, use kvm_page_fault.refcounted_page to put references to struct page memory when finishing page faults. This is a baby step towards eliminating kvm_pfn_to_refcounted_page().
Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-48-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c657c3c449c8..fe3f3bc0bb79 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4380,6 +4380,9 @@ static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, lockdep_assert_once(lockdep_is_held(&vcpu->kvm->mmu_lock) || r == RET_PF_RETRY); + if (!fault->refcounted_page) + return; + /* * If the page that KVM got from the *primary MMU* is writable, and KVM * installed or reused a SPTE, mark the page/folio dirty. Note, this * may mark a folio dirty even if KVM created a read-only SPTE, e.g. if * the GFN is write-protected. Folios can't be safely marked dirty * outside of mmu_lock as doing so could race with writeback on the * folio. As a result, KVM can't mark folios dirty in the fast page * fault handler, and so KVM must (somewhat) speculatively mark the * folio dirty if KVM could locklessly make the SPTE writable. */ if (!fault->map_writable || r == RET_PF_RETRY) - kvm_release_pfn_clean(fault->pfn); + kvm_release_page_clean(fault->refcounted_page); else - kvm_release_pfn_dirty(fault->pfn); + kvm_release_page_dirty(fault->refcounted_page); } static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, -- 2.51.0 From 8eaa98004b23e02adb3e11120132e0e2fecc6e0e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:50 -0700 Subject: [PATCH 04/16] KVM: x86/mmu: Don't mark unused faultin pages as accessed MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When finishing guest page faults, don't mark pages as accessed if KVM is resuming the guest _without_ installing a mapping, i.e. if the page isn't being used. While it's possible that marking the page accessed could avoid minor thrashing due to reclaiming a page that the guest is about to access, it's far more likely that the gfn=>pfn mapping was invalidated, e.g. due to a memslot change, or because the corresponding VMA is being modified. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-49-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index fe3f3bc0bb79..f3a4ed6afec2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4393,7 +4393,9 @@ static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, * fault handler, and so KVM must (somewhat) speculatively mark the * folio dirty if KVM could locklessly make the SPTE writable. */ - if (!fault->map_writable || r == RET_PF_RETRY) + if (r == RET_PF_RETRY) + kvm_release_page_unused(fault->refcounted_page); + else if (!fault->map_writable) kvm_release_page_clean(fault->refcounted_page); else kvm_release_page_dirty(fault->refcounted_page); -- 2.51.0 From dc06193532af4ba88ed20daeef88f22b053ebb91 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:51 -0700 Subject: [PATCH 05/16] KVM: Move x86's API to release a faultin page to common KVM MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Move KVM x86's helper that "finishes" the faultin process to common KVM so that the logic can be shared across all architectures. Note, not all architectures implement a fast page fault path, but the gist of the comment applies to all architectures.
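To make the intended sharing concrete, here is a minimal sketch (not part of this patch; the function name arch_map_guest_page() and its locals are hypothetical) of how an architecture's fault handler is expected to pair __kvm_faultin_pfn() with the now-common kvm_release_faultin_page():

	static int arch_map_guest_page(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
				       gfn_t gfn, bool write_fault)
	{
		struct page *page;
		bool writable;
		kvm_pfn_t pfn;
		int ret;

		/* Grab a reference to the backing page (if it is refcounted). */
		pfn = __kvm_faultin_pfn(slot, gfn, write_fault ? FOLL_WRITE : 0,
					&writable, &page);
		if (is_error_noslot_pfn(pfn))
			return -EFAULT;

		/* ... take mmu_lock and install the mapping, setting ret ... */

		/*
		 * Still under mmu_lock: drop the reference, marking the page
		 * dirty only if a writable mapping was actually installed.
		 */
		kvm_release_faultin_page(vcpu->kvm, page, !!ret, writable);

		/* ... drop mmu_lock ... */
		return ret;
	}
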
Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-50-seanjc@google.com> --- arch/x86/kvm/mmu/mmu.c | 24 ++---------------------- include/linux/kvm_host.h | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f3a4ed6afec2..0aae8e63566c 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4377,28 +4377,8 @@ static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, static void kvm_mmu_finish_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, int r) { - lockdep_assert_once(lockdep_is_held(&vcpu->kvm->mmu_lock) || - r == RET_PF_RETRY); - - if (!fault->refcounted_page) - return; - - /* - * If the page that KVM got from the *primary MMU* is writable, and KVM - * installed or reused a SPTE, mark the page/folio dirty. Note, this - * may mark a folio dirty even if KVM created a read-only SPTE, e.g. if - * the GFN is write-protected. Folios can't be safely marked dirty - * outside of mmu_lock as doing so could race with writeback on the - * folio. As a result, KVM can't mark folios dirty in the fast page - * fault handler, and so KVM must (somewhat) speculatively mark the - * folio dirty if KVM could locklessly make the SPTE writable. - */ - if (r == RET_PF_RETRY) - kvm_release_page_unused(fault->refcounted_page); - else if (!fault->map_writable) - kvm_release_page_clean(fault->refcounted_page); - else - kvm_release_page_dirty(fault->refcounted_page); + kvm_release_faultin_page(vcpu->kvm, fault->refcounted_page, + r == RET_PF_RETRY, fault->map_writable); } static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6efdc00b4254..3e06393e5f1e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1231,6 +1231,32 @@ static inline void kvm_release_page_unused(struct page *page) void kvm_release_page_clean(struct page *page); void kvm_release_page_dirty(struct page *page); +static inline void kvm_release_faultin_page(struct kvm *kvm, struct page *page, + bool unused, bool dirty) +{ + lockdep_assert_once(lockdep_is_held(&kvm->mmu_lock) || unused); + + if (!page) + return; + + /* + * If the page that KVM got from the *primary MMU* is writable, and KVM + * installed or reused a SPTE, mark the page/folio dirty. Note, this + * may mark a folio dirty even if KVM created a read-only SPTE, e.g. if + * the GFN is write-protected. Folios can't be safely marked dirty + * outside of mmu_lock as doing so could race with writeback on the + * folio. As a result, KVM can't mark folios dirty in the fast page + * fault handler, and so KVM must (somewhat) speculatively mark the + * folio dirty if KVM could locklessly make the SPTE writable. 
+ */ + if (unused) + kvm_release_page_unused(page); + else if (dirty) + kvm_release_page_dirty(page); + else + kvm_release_page_clean(page); +} + kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn, unsigned int foll, bool *writable, struct page **refcounted_page); -- 2.51.0 From cb444acb697943c0aaeab085c43e07727cb0b85c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:52 -0700 Subject: [PATCH 06/16] KVM: VMX: Hold mmu_lock until page is released when updating APIC access page MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Hold mmu_lock across kvm_release_pfn_clean() when refreshing the APIC access page address to ensure that KVM doesn't mark a page/folio as accessed after it has been unmapped. Practically speaking marking a folio accessed is benign in this scenario, as KVM does hold a reference (it's really just marking folios dirty that is problematic), but there's no reason not to be paranoid (moving the APIC access page isn't a hot path), and no reason to be different from other mmu_notifier-protected flows in KVM. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-51-seanjc@google.com> --- arch/x86/kvm/vmx/vmx.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 81ed596e4454..bee90089c8a7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6832,25 +6832,22 @@ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) return; read_lock(&vcpu->kvm->mmu_lock); - if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) { + if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - read_unlock(&vcpu->kvm->mmu_lock); - goto out; - } - - vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); - read_unlock(&vcpu->kvm->mmu_lock); + else + vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); - /* - * No need for a manual TLB flush at this point, KVM has already done a - * flush if there were SPTEs pointing at the previous page. - */ -out: /* * Do not pin apic access page in memory, the MMU notifier * will call us again if it is migrated or swapped out. */ kvm_release_pfn_clean(pfn); + + /* + * No need for a manual TLB flush at this point, KVM has already done a + * flush if there were SPTEs pointing at the previous page. + */ + read_unlock(&vcpu->kvm->mmu_lock); } void vmx_hwapic_isr_update(int max_isr) -- 2.51.0 From 93091f0fc7b7c7c76129cba9105269b4e70a7856 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:53 -0700 Subject: [PATCH 07/16] KVM: VMX: Use __kvm_faultin_page() to get APIC access page/pfn MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Use __kvm_faultin_page() to get the APIC access page so that KVM can precisely release the refcounted page, i.e. to remove yet another user of kvm_pfn_to_refcounted_page(). While the path isn't handling a guest page fault, the semantics are effectively the same; KVM just happens to be mapping the pfn into a VMCS field instead of a secondary MMU.
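As a rough illustration (a sketch only, not code from this patch; the helper name consume_gfn_once() is hypothetical), the pattern for such a transient consumer of __kvm_faultin_pfn() looks like:

	static void consume_gfn_once(struct kvm_memory_slot *slot, gfn_t gfn)
	{
		struct page *refcounted_page;
		bool writable;
		kvm_pfn_t pfn;

		pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page);
		if (is_error_noslot_pfn(pfn))
			return;

		/* ... hand the pfn to hardware, e.g. program it into a VMCS field ... */

		/*
		 * Don't keep the page pinned; the mmu_notifier will trigger another
		 * update if the backing page is migrated or swapped out.
		 */
		if (refcounted_page)
			kvm_release_page_clean(refcounted_page);
	}
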
Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-52-seanjc@google.com> --- arch/x86/kvm/vmx/vmx.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index bee90089c8a7..f6900bec4874 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6790,8 +6790,10 @@ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *slot; + struct page *refcounted_page; unsigned long mmu_seq; kvm_pfn_t pfn; + bool writable; /* Defer reload until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { @@ -6827,7 +6829,7 @@ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) * controls the APIC-access page memslot, and only deletes the memslot * if APICv is permanently inhibited, i.e. the memslot won't reappear. */ - pfn = gfn_to_pfn_memslot(slot, gfn); + pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, &writable, &refcounted_page); if (is_error_noslot_pfn(pfn)) return; @@ -6838,10 +6840,13 @@ void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); /* - * Do not pin apic access page in memory, the MMU notifier - * will call us again if it is migrated or swapped out. + * Do not pin the APIC access page in memory so that it can be freely + * migrated, the MMU notifier will call us again if it is migrated or + * swapped out. KVM backs the memslot with anonymous memory, the pfn + * should always point at a refcounted page (if the pfn is valid). */ - kvm_release_pfn_clean(pfn); + if (!WARN_ON_ONCE(!refcounted_page)) + kvm_release_page_clean(refcounted_page); /* * No need for a manual TLB flush at this point, KVM has already done a -- 2.51.0 From c9be85dabb376299504e0d391d15662c0edf8273 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:54 -0700 Subject: [PATCH 08/16] KVM: PPC: e500: Mark "struct page" dirty in kvmppc_e500_shadow_map() Mark the underlying page as dirty in kvmppc_e500_ref_setup()'s sole caller, kvmppc_e500_shadow_map(), which will allow converting e500 to __kvm_faultin_pfn() + kvm_release_faultin_page() without having to do a weird dance between ref_setup() and shadow_map(). Opportunistically drop the redundant kvm_set_pfn_accessed(), as shadow_map() puts the page via kvm_release_pfn_clean(). 
Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-53-seanjc@google.com> --- arch/powerpc/kvm/e500_mmu_host.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index c664fdec75b1..5c2adfd19e12 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -242,7 +242,7 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); } -static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, +static inline bool kvmppc_e500_ref_setup(struct tlbe_ref *ref, struct kvm_book3e_206_tlb_entry *gtlbe, kvm_pfn_t pfn, unsigned int wimg) { @@ -252,11 +252,7 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, /* Use guest supplied MAS2_G and MAS2_E */ ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg; - /* Mark the page accessed */ - kvm_set_pfn_accessed(pfn); - - if (tlbe_is_writable(gtlbe)) - kvm_set_pfn_dirty(pfn); + return tlbe_is_writable(gtlbe); } static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) @@ -337,6 +333,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, unsigned int wimg = 0; pgd_t *pgdir; unsigned long flags; + bool writable = false; /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_invalidate_seq; @@ -490,7 +487,9 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, goto out; } } - kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + if (writable) + kvm_set_pfn_dirty(pfn); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); -- 2.51.0 From 84cf78dcd9d65c45ab73998d4ad50f433d53fb93 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:55 -0700 Subject: [PATCH 09/16] KVM: PPC: e500: Mark "struct page" pfn accessed before dropping mmu_lock Mark pages accessed before dropping mmu_lock when faulting in guest memory so that shadow_map() can convert to kvm_release_faultin_page() without tripping its lockdep assertion on mmu_lock being held. Marking pages accessed outside of mmu_lock is ok (not great, but safe), but marking pages _dirty_ outside of mmu_lock can make filesystems unhappy. Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-54-seanjc@google.com> --- arch/powerpc/kvm/e500_mmu_host.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 5c2adfd19e12..334dd96f8081 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -498,11 +498,9 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, kvmppc_mmu_flush_icache(pfn); out: - spin_unlock(&kvm->mmu_lock); - /* Drop refcount on page, so that mmu notifiers can clear it */ kvm_release_pfn_clean(pfn); - + spin_unlock(&kvm->mmu_lock); return ret; } -- 2.51.0 From 419cfb983ca93e75e905794521afefcfa07988bb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:56 -0700 Subject: [PATCH 10/16] KVM: PPC: e500: Use __kvm_faultin_pfn() to handle page faults Convert PPC e500 to use __kvm_faultin_pfn()+kvm_release_faultin_page(), and continue the inexorable march towards the demise of kvm_pfn_to_refcounted_page(). 
Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-55-seanjc@google.com> --- arch/powerpc/kvm/e500_mmu_host.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 334dd96f8081..e5a145b578a4 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -322,6 +322,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, { struct kvm_memory_slot *slot; unsigned long pfn = 0; /* silence GCC warning */ + struct page *page = NULL; unsigned long hva; int pfnmap = 0; int tsize = BOOK3E_PAGESZ_4K; @@ -443,7 +444,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (likely(!pfnmap)) { tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT); - pfn = gfn_to_pfn_memslot(slot, gfn); + pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, NULL, &page); if (is_error_noslot_pfn(pfn)) { if (printk_ratelimit()) pr_err("%s: real page not found for gfn %lx\n", @@ -488,8 +489,6 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } } writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); - if (writable) - kvm_set_pfn_dirty(pfn); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); @@ -498,8 +497,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, kvmppc_mmu_flush_icache(pfn); out: - /* Drop refcount on page, so that mmu notifiers can clear it */ - kvm_release_pfn_clean(pfn); + kvm_release_faultin_page(kvm, page, !!ret, writable); spin_unlock(&kvm->mmu_lock); return ret; } -- 2.51.0 From 28991c91d577c39bbd9f1b5424554890c3aa351b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:57 -0700 Subject: [PATCH 11/16] KVM: arm64: Mark "struct page" pfns accessed/dirty before dropping mmu_lock Mark pages/folios accessed+dirty prior to dropping mmu_lock, as marking a page/folio dirty after it has been written back can make some filesystems unhappy (backing KVM guests with such filesystem files is uncommon, and the race is minuscule, hence the lack of complaints). While scary sounding, practically speaking the worst case scenario is that KVM would trigger this WARN in filemap_unaccount_folio(): /* * At this point folio must be either written or cleaned by * truncate. Dirty folio here signals a bug and loss of * unwritten data - on ordinary filesystems. * * But it's harmless on in-memory filesystems like tmpfs; and can * occur when a driver which did get_user_pages() sets page dirty * before putting it, while the inode is being finally evicted. * * Below fixes dirty accounting after removing the folio entirely * but leaves the dirty flag set: it has no effect for truncated * folio and anyway will be cleared before returning folio to * buddy allocator. */ if (WARN_ON_ONCE(folio_test_dirty(folio) && mapping_can_writeback(mapping))) folio_account_cleaned(folio, inode_to_wb(mapping->host)); KVM won't actually write memory because the stage-2 mappings are protected by the mmu_notifier, i.e. there is no risk of loss of data, even if the VM were backed by memory that needs writeback. See the link below for additional details. This will also allow converting arm64 to kvm_release_faultin_page(), which requires that mmu_lock be held (for the aforementioned reason).
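Stated as a sketch (for illustration only, not a hunk from this patch), the ordering this change establishes, and which kvm_release_faultin_page() later asserts via lockdep, is:

	read_lock(&kvm->mmu_lock);
	/* ... install or update the stage-2 mapping, setting ret ... */

	/* Mark accessed/dirty and drop the reference while mmu_lock is still held... */
	if (writable && !ret)
		kvm_release_pfn_dirty(pfn);
	else
		kvm_release_pfn_clean(pfn);

	/* ... and only then drop the lock. */
	read_unlock(&kvm->mmu_lock);
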
Link: https://lore.kernel.org/all/cover.1683044162.git.lstoakes@gmail.com Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-56-seanjc@google.com> --- arch/arm64/kvm/mmu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 246c820379ec..ac37d482a0d1 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1693,15 +1693,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } out_unlock: + if (writable && !ret) + kvm_release_pfn_dirty(pfn); + else + kvm_release_pfn_clean(pfn); + read_unlock(&kvm->mmu_lock); /* Mark the page dirty only if the fault is handled successfully */ - if (writable && !ret) { - kvm_set_pfn_dirty(pfn); + if (writable && !ret) mark_page_dirty_in_slot(kvm, memslot, gfn); - } - kvm_release_pfn_clean(pfn); return ret != -EAGAIN ? ret : 0; } -- 2.51.0 From 85c7869e30b770082b78134e61c1c7db5a903ea4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:58 -0700 Subject: [PATCH 12/16] KVM: arm64: Use __kvm_faultin_pfn() to handle memory aborts MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Convert arm64 to use __kvm_faultin_pfn()+kvm_release_faultin_page(). Three down, six to go. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-57-seanjc@google.com> --- arch/arm64/kvm/mmu.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index ac37d482a0d1..75795fb67d1d 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1440,6 +1440,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; + struct page *page; if (fault_is_perm) fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); @@ -1561,7 +1562,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* * Read mmu_invalidate_seq so that KVM can detect if the results of - * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to + * vma_lookup() or __kvm_faultin_pfn() become stale prior to * acquiring kvm->mmu_lock. * * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs @@ -1570,8 +1571,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmu_seq = vcpu->kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - write_fault, &writable); + pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, + &writable, &page); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; @@ -1584,7 +1585,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * If the page was identified as device early by looking at * the VMA flags, vma_pagesize is already representing the * largest quantity we can map. If instead it was mapped - * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE + * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE * and must not be upgraded. 
* * In both cases, we don't let transparent_hugepage_adjust() @@ -1693,11 +1694,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } out_unlock: - if (writable && !ret) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); - + kvm_release_faultin_page(kvm, page, !!ret, writable); read_unlock(&kvm->mmu_lock); /* Mark the page dirty only if the fault is handled successfully */ -- 2.51.0 From 9b3639bb02fb98a664110836147a03b45c83ed94 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:59 -0700 Subject: [PATCH 13/16] KVM: RISC-V: Mark "struct page" pfns dirty iff a stage-2 PTE is installed Don't mark pages dirty if KVM bails from the page fault handler without installing a stage-2 mapping, i.e. if the page is guaranteed to not be written by the guest. In addition to being a (very) minor fix, this paves the way for converting RISC-V to use kvm_release_faultin_page(). Reviewed-by: Andrew Jones Acked-by: Anup Patel Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-58-seanjc@google.com> --- arch/riscv/kvm/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index b63650f9b966..06aa5a0d056d 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -669,7 +669,6 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, goto out_unlock; if (writable) { - kvm_set_pfn_dirty(hfn); mark_page_dirty(kvm, gfn); ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, vma_pagesize, false, true); @@ -682,6 +681,9 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, kvm_err("Failed to map in G-stage\n"); out_unlock: + if ((!ret || ret == -EEXIST) && writable) + kvm_set_pfn_dirty(hfn); + spin_unlock(&kvm->mmu_lock); kvm_set_pfn_accessed(hfn); kvm_release_pfn_clean(hfn); -- 2.51.0 From 9c902aee686979d9460d8bd0cabcf2fa0195d7d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:00 -0700 Subject: [PATCH 14/16] KVM: RISC-V: Mark "struct page" pfns accessed before dropping mmu_lock Mark pages accessed before dropping mmu_lock when faulting in guest memory so that RISC-V can convert to kvm_release_faultin_page() without tripping its lockdep assertion on mmu_lock being held. Marking pages accessed outside of mmu_lock is ok (not great, but safe), but marking pages _dirty_ outside of mmu_lock can make filesystems unhappy (see the link below). Do both under mmu_lock to minimize the chances of doing the wrong thing in the future. 
Link: https://lore.kernel.org/all/cover.1683044162.git.lstoakes@gmail.com Reviewed-by: Andrew Jones Acked-by: Anup Patel Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-59-seanjc@google.com> --- arch/riscv/kvm/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 06aa5a0d056d..2e9aee518142 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -682,11 +682,11 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, out_unlock: if ((!ret || ret == -EEXIST) && writable) - kvm_set_pfn_dirty(hfn); + kvm_release_pfn_dirty(hfn); + else + kvm_release_pfn_clean(hfn); spin_unlock(&kvm->mmu_lock); - kvm_set_pfn_accessed(hfn); - kvm_release_pfn_clean(hfn); return ret; } -- 2.51.0 From 334511d468e5b61e9b4b1432145e51dbce365888 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:01 -0700 Subject: [PATCH 15/16] KVM: RISC-V: Use kvm_faultin_pfn() when mapping pfns into the guest Convert RISC-V to __kvm_faultin_pfn()+kvm_release_faultin_page(), which are new APIs to consolidate arch code and provide consistent behavior across all KVM architectures. Opportunistically fix a s/priort/prior typo in the related comment. Reviewed-by: Andrew Jones Acked-by: Anup Patel Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-60-seanjc@google.com> --- arch/riscv/kvm/mmu.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 2e9aee518142..e11ad1b616f3 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -601,6 +601,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, bool logging = (memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY)) ? true : false; unsigned long vma_pagesize, mmu_seq; + struct page *page; /* We need minimum second+third level pages */ ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels); @@ -631,7 +632,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, /* * Read mmu_invalidate_seq so that KVM can detect if the results of - * vma_lookup() or gfn_to_pfn_prot() become stale priort to acquiring + * vma_lookup() or __kvm_faultin_pfn() become stale prior to acquiring * kvm->mmu_lock. * * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs @@ -647,7 +648,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, return -EFAULT; } - hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writable); + hfn = kvm_faultin_pfn(vcpu, gfn, is_write, &writable, &page); if (hfn == KVM_PFN_ERR_HWPOISON) { send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, vma_pageshift, current); @@ -681,11 +682,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, kvm_err("Failed to map in G-stage\n"); out_unlock: - if ((!ret || ret == -EEXIST) && writable) - kvm_release_pfn_dirty(hfn); - else - kvm_release_pfn_clean(hfn); - + kvm_release_faultin_page(kvm, page, ret && ret != -EEXIST, writable); spin_unlock(&kvm->mmu_lock); return ret; } -- 2.51.0 From 0865ba14b4ee94edea60897bc4a38ead054c7ab4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:02 -0700 Subject: [PATCH 16/16] KVM: PPC: Use __kvm_faultin_pfn() to handle page faults on Book3s HV Replace Book3s HV's homebrewed fault-in logic with __kvm_faultin_pfn(), which functionally does pretty much the exact same thing.
Note, when the code was written, KVM indeed didn't do fast GUP without "!atomic && !async", but that has long since changed (KVM tries fast GUP for all writable mappings). Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-61-seanjc@google.com> --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 2f1d58984b41..f305395cf26e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -603,27 +603,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, write_ok = writing; hva = gfn_to_hva_memslot(memslot, gfn); - /* - * Do a fast check first, since __gfn_to_pfn_memslot doesn't - * do it with !atomic && !async, which is how we call it. - * We always ask for write permission since the common case - * is that the page is writable. - */ - if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) { - write_ok = true; - } else { - /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, &write_ok); - if (is_error_noslot_pfn(pfn)) - return -EFAULT; - page = NULL; - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageReserved(page)) - page = NULL; - } - } + pfn = __kvm_faultin_pfn(memslot, gfn, writing ? FOLL_WRITE : 0, + &write_ok, &page); + if (is_error_noslot_pfn(pfn)) + return -EFAULT; /* * Read the PTE from the process' radix tree and use that -- 2.51.0