From c9be85dabb376299504e0d391d15662c0edf8273 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:54 -0700 Subject: [PATCH 01/16] KVM: PPC: e500: Mark "struct page" dirty in kvmppc_e500_shadow_map() Mark the underlying page as dirty in kvmppc_e500_ref_setup()'s sole caller, kvmppc_e500_shadow_map(), which will allow converting e500 to __kvm_faultin_pfn() + kvm_release_faultin_page() without having to do a weird dance between ref_setup() and shadow_map(). Opportunistically drop the redundant kvm_set_pfn_accessed(), as shadow_map() puts the page via kvm_release_pfn_clean(). Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-53-seanjc@google.com> --- arch/powerpc/kvm/e500_mmu_host.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index c664fdec75b1..5c2adfd19e12 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -242,7 +242,7 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); } -static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, +static inline bool kvmppc_e500_ref_setup(struct tlbe_ref *ref, struct kvm_book3e_206_tlb_entry *gtlbe, kvm_pfn_t pfn, unsigned int wimg) { @@ -252,11 +252,7 @@ static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, /* Use guest supplied MAS2_G and MAS2_E */ ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg; - /* Mark the page accessed */ - kvm_set_pfn_accessed(pfn); - - if (tlbe_is_writable(gtlbe)) - kvm_set_pfn_dirty(pfn); + return tlbe_is_writable(gtlbe); } static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) @@ -337,6 +333,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, unsigned int wimg = 0; pgd_t *pgdir; unsigned long flags; + bool writable = false; /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_invalidate_seq; @@ -490,7 +487,9 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, goto out; } } - kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + if (writable) + kvm_set_pfn_dirty(pfn); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); -- 2.51.0 From 84cf78dcd9d65c45ab73998d4ad50f433d53fb93 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:55 -0700 Subject: [PATCH 02/16] KVM: PPC: e500: Mark "struct page" pfn accessed before dropping mmu_lock Mark pages accessed before dropping mmu_lock when faulting in guest memory so that shadow_map() can convert to kvm_release_faultin_page() without tripping its lockdep assertion on mmu_lock being held. Marking pages accessed outside of mmu_lock is ok (not great, but safe), but marking pages _dirty_ outside of mmu_lock can make filesystems unhappy. Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-54-seanjc@google.com> --- arch/powerpc/kvm/e500_mmu_host.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 5c2adfd19e12..334dd96f8081 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -498,11 +498,9 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, kvmppc_mmu_flush_icache(pfn); out: - spin_unlock(&kvm->mmu_lock); - /* Drop refcount on page, so that mmu notifiers can clear it */ kvm_release_pfn_clean(pfn); - + spin_unlock(&kvm->mmu_lock); return ret; } -- 2.51.0 From 419cfb983ca93e75e905794521afefcfa07988bb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:56 -0700 Subject: [PATCH 03/16] KVM: PPC: e500: Use __kvm_faultin_pfn() to handle page faults Convert PPC e500 to use __kvm_faultin_pfn()+kvm_release_faultin_page(), and continue the inexorable march towards the demise of kvm_pfn_to_refcounted_page(). Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-55-seanjc@google.com> --- arch/powerpc/kvm/e500_mmu_host.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 334dd96f8081..e5a145b578a4 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -322,6 +322,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, { struct kvm_memory_slot *slot; unsigned long pfn = 0; /* silence GCC warning */ + struct page *page = NULL; unsigned long hva; int pfnmap = 0; int tsize = BOOK3E_PAGESZ_4K; @@ -443,7 +444,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (likely(!pfnmap)) { tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT); - pfn = gfn_to_pfn_memslot(slot, gfn); + pfn = __kvm_faultin_pfn(slot, gfn, FOLL_WRITE, NULL, &page); if (is_error_noslot_pfn(pfn)) { if (printk_ratelimit()) pr_err("%s: real page not found for gfn %lx\n", @@ -488,8 +489,6 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } } writable = kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); - if (writable) - kvm_set_pfn_dirty(pfn); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, ref, gvaddr, stlbe); @@ -498,8 +497,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, kvmppc_mmu_flush_icache(pfn); out: - /* Drop refcount on page, so that mmu notifiers can clear it */ - kvm_release_pfn_clean(pfn); + kvm_release_faultin_page(kvm, page, !!ret, writable); spin_unlock(&kvm->mmu_lock); return ret; } -- 2.51.0 From 28991c91d577c39bbd9f1b5424554890c3aa351b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:57 -0700 Subject: [PATCH 04/16] KVM: arm64: Mark "struct page" pfns accessed/dirty before dropping mmu_lock Mark pages/folios accessed+dirty prior to dropping mmu_lock, as marking a page/folio dirty after it has been written back can make some filesystems unhappy (backing KVM guests will such filesystem files is uncommon, and the race is minuscule, hence the lack of complaints). While scary sounding, practically speaking the worst case scenario is that KVM would trigger this WARN in filemap_unaccount_folio(): /* * At this point folio must be either written or cleaned by * truncate. Dirty folio here signals a bug and loss of * unwritten data - on ordinary filesystems. * * But it's harmless on in-memory filesystems like tmpfs; and can * occur when a driver which did get_user_pages() sets page dirty * before putting it, while the inode is being finally evicted. * * Below fixes dirty accounting after removing the folio entirely * but leaves the dirty flag set: it has no effect for truncated * folio and anyway will be cleared before returning folio to * buddy allocator. */ if (WARN_ON_ONCE(folio_test_dirty(folio) && mapping_can_writeback(mapping))) folio_account_cleaned(folio, inode_to_wb(mapping->host)); KVM won't actually write memory because the stage-2 mappings are protected by the mmu_notifier, i.e. there is no risk of loss of data, even if the VM were backed by memory that needs writeback. See the link below for additional details. This will also allow converting arm64 to kvm_release_faultin_page(), which requires that mmu_lock be held (for the aforementioned reason). Link: https://lore.kernel.org/all/cover.1683044162.git.lstoakes@gmail.com Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-56-seanjc@google.com> --- arch/arm64/kvm/mmu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 246c820379ec..ac37d482a0d1 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1693,15 +1693,17 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } out_unlock: + if (writable && !ret) + kvm_release_pfn_dirty(pfn); + else + kvm_release_pfn_clean(pfn); + read_unlock(&kvm->mmu_lock); /* Mark the page dirty only if the fault is handled successfully */ - if (writable && !ret) { - kvm_set_pfn_dirty(pfn); + if (writable && !ret) mark_page_dirty_in_slot(kvm, memslot, gfn); - } - kvm_release_pfn_clean(pfn); return ret != -EAGAIN ? ret : 0; } -- 2.51.0 From 85c7869e30b770082b78134e61c1c7db5a903ea4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:58 -0700 Subject: [PATCH 05/16] KVM: arm64: Use __kvm_faultin_pfn() to handle memory aborts MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Convert arm64 to use __kvm_faultin_pfn()+kvm_release_faultin_page(). Three down, six to go. Tested-by: Alex Bennée Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-57-seanjc@google.com> --- arch/arm64/kvm/mmu.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index ac37d482a0d1..75795fb67d1d 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1440,6 +1440,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, long vma_pagesize, fault_granule; enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R; struct kvm_pgtable *pgt; + struct page *page; if (fault_is_perm) fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu); @@ -1561,7 +1562,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, /* * Read mmu_invalidate_seq so that KVM can detect if the results of - * vma_lookup() or __gfn_to_pfn_memslot() become stale prior to + * vma_lookup() or __kvm_faultin_pfn() become stale prior to * acquiring kvm->mmu_lock. * * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs @@ -1570,8 +1571,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmu_seq = vcpu->kvm->mmu_invalidate_seq; mmap_read_unlock(current->mm); - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - write_fault, &writable); + pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0, + &writable, &page); if (pfn == KVM_PFN_ERR_HWPOISON) { kvm_send_hwpoison_signal(hva, vma_shift); return 0; @@ -1584,7 +1585,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * If the page was identified as device early by looking at * the VMA flags, vma_pagesize is already representing the * largest quantity we can map. If instead it was mapped - * via gfn_to_pfn_prot(), vma_pagesize is set to PAGE_SIZE + * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE * and must not be upgraded. * * In both cases, we don't let transparent_hugepage_adjust() @@ -1693,11 +1694,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, } out_unlock: - if (writable && !ret) - kvm_release_pfn_dirty(pfn); - else - kvm_release_pfn_clean(pfn); - + kvm_release_faultin_page(kvm, page, !!ret, writable); read_unlock(&kvm->mmu_lock); /* Mark the page dirty only if the fault is handled successfully */ -- 2.51.0 From 9b3639bb02fb98a664110836147a03b45c83ed94 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:23:59 -0700 Subject: [PATCH 06/16] KVM: RISC-V: Mark "struct page" pfns dirty iff a stage-2 PTE is installed Don't mark pages dirty if KVM bails from the page fault handler without installing a stage-2 mapping, i.e. if the page is guaranteed to not be written by the guest. In addition to being a (very) minor fix, this paves the way for converting RISC-V to use kvm_release_faultin_page(). Reviewed-by: Andrew Jones Acked-by: Anup Patel Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-58-seanjc@google.com> --- arch/riscv/kvm/mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index b63650f9b966..06aa5a0d056d 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -669,7 +669,6 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, goto out_unlock; if (writable) { - kvm_set_pfn_dirty(hfn); mark_page_dirty(kvm, gfn); ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, vma_pagesize, false, true); @@ -682,6 +681,9 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, kvm_err("Failed to map in G-stage\n"); out_unlock: + if ((!ret || ret == -EEXIST) && writable) + kvm_set_pfn_dirty(hfn); + spin_unlock(&kvm->mmu_lock); kvm_set_pfn_accessed(hfn); kvm_release_pfn_clean(hfn); -- 2.51.0 From 9c902aee686979d9460d8bd0cabcf2fa0195d7d9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:00 -0700 Subject: [PATCH 07/16] KVM: RISC-V: Mark "struct page" pfns accessed before dropping mmu_lock Mark pages accessed before dropping mmu_lock when faulting in guest memory so that RISC-V can convert to kvm_release_faultin_page() without tripping its lockdep assertion on mmu_lock being held. Marking pages accessed outside of mmu_lock is ok (not great, but safe), but marking pages _dirty_ outside of mmu_lock can make filesystems unhappy (see the link below). Do both under mmu_lock to minimize the chances of doing the wrong thing in the future. Link: https://lore.kernel.org/all/cover.1683044162.git.lstoakes@gmail.com Reviewed-by: Andrew Jones Acked-by: Anup Patel Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-59-seanjc@google.com> --- arch/riscv/kvm/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 06aa5a0d056d..2e9aee518142 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -682,11 +682,11 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, out_unlock: if ((!ret || ret == -EEXIST) && writable) - kvm_set_pfn_dirty(hfn); + kvm_release_pfn_dirty(hfn); + else + kvm_release_pfn_clean(hfn); spin_unlock(&kvm->mmu_lock); - kvm_set_pfn_accessed(hfn); - kvm_release_pfn_clean(hfn); return ret; } -- 2.51.0 From 334511d468e5b61e9b4b1432145e51dbce365888 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:01 -0700 Subject: [PATCH 08/16] KVM: RISC-V: Use kvm_faultin_pfn() when mapping pfns into the guest Convert RISC-V to __kvm_faultin_pfn()+kvm_release_faultin_page(), which are new APIs to consolidate arch code and provide consistent behavior across all KVM architectures. Opportunisticaly fix a s/priort/prior typo in the related comment. Reviewed-by: Andrew Jones Acked-by: Anup Patel Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-60-seanjc@google.com> --- arch/riscv/kvm/mmu.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index 2e9aee518142..e11ad1b616f3 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -601,6 +601,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, bool logging = (memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY)) ? true : false; unsigned long vma_pagesize, mmu_seq; + struct page *page; /* We need minimum second+third level pages */ ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels); @@ -631,7 +632,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, /* * Read mmu_invalidate_seq so that KVM can detect if the results of - * vma_lookup() or gfn_to_pfn_prot() become stale priort to acquiring + * vma_lookup() or __kvm_faultin_pfn() become stale prior to acquiring * kvm->mmu_lock. * * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs @@ -647,7 +648,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, return -EFAULT; } - hfn = gfn_to_pfn_prot(kvm, gfn, is_write, &writable); + hfn = kvm_faultin_pfn(vcpu, gfn, is_write, &writable, &page); if (hfn == KVM_PFN_ERR_HWPOISON) { send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, vma_pageshift, current); @@ -681,11 +682,7 @@ int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, kvm_err("Failed to map in G-stage\n"); out_unlock: - if ((!ret || ret == -EEXIST) && writable) - kvm_release_pfn_dirty(hfn); - else - kvm_release_pfn_clean(hfn); - + kvm_release_faultin_page(kvm, page, ret && ret != -EEXIST, writable); spin_unlock(&kvm->mmu_lock); return ret; } -- 2.51.0 From 0865ba14b4ee94edea60897bc4a38ead054c7ab4 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:02 -0700 Subject: [PATCH 09/16] KVM: PPC: Use __kvm_faultin_pfn() to handle page faults on Book3s HV Replace Book3s HV's homebrewed fault-in logic with __kvm_faultin_pfn(), which functionally does pretty much the exact same thing. Note, when the code was written, KVM indeed didn't do fast GUP without "!atomic && !async", but that has long since changed (KVM tries fast GUP for all writable mappings). Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-61-seanjc@google.com> --- arch/powerpc/kvm/book3s_64_mmu_hv.c | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 2f1d58984b41..f305395cf26e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -603,27 +603,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu, write_ok = writing; hva = gfn_to_hva_memslot(memslot, gfn); - /* - * Do a fast check first, since __gfn_to_pfn_memslot doesn't - * do it with !atomic && !async, which is how we call it. - * We always ask for write permission since the common case - * is that the page is writable. - */ - if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) { - write_ok = true; - } else { - /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, &write_ok); - if (is_error_noslot_pfn(pfn)) - return -EFAULT; - page = NULL; - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageReserved(page)) - page = NULL; - } - } + pfn = __kvm_faultin_pfn(memslot, gfn, writing ? FOLL_WRITE : 0, + &write_ok, &page); + if (is_error_noslot_pfn(pfn)) + return -EFAULT; /* * Read the PTE from the process' radix tree and use that -- 2.51.0 From 431d2f7dcbde3646faa0f14b7e046520498f85eb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:03 -0700 Subject: [PATCH 10/16] KVM: PPC: Use __kvm_faultin_pfn() to handle page faults on Book3s Radix Replace Book3s Radix's homebrewed (read: copy+pasted) fault-in logic with __kvm_faultin_pfn(), which functionally does pretty much the exact same thing. Note, when the code was written, KVM indeed didn't do fast GUP without "!atomic && !async", but that has long since changed (KVM tries fast GUP for all writable mappings). Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-62-seanjc@google.com> --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 29 +++++--------------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 8304b6f8fe45..14891d0a3b73 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -829,40 +829,21 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, unsigned long mmu_seq; unsigned long hva, gfn = gpa >> PAGE_SHIFT; bool upgrade_write = false; - bool *upgrade_p = &upgrade_write; pte_t pte, *ptep; unsigned int shift, level; int ret; bool large_enable; + kvm_pfn_t pfn; /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); - /* - * Do a fast check first, since __gfn_to_pfn_memslot doesn't - * do it with !atomic && !async, which is how we call it. - * We always ask for write permission since the common case - * is that the page is writable. - */ hva = gfn_to_hva_memslot(memslot, gfn); - if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) { - upgrade_write = true; - } else { - unsigned long pfn; - - /* Call KVM generic code to do the slow-path check */ - pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, - writing, upgrade_p); - if (is_error_noslot_pfn(pfn)) - return -EFAULT; - page = NULL; - if (pfn_valid(pfn)) { - page = pfn_to_page(pfn); - if (PageReserved(page)) - page = NULL; - } - } + pfn = __kvm_faultin_pfn(memslot, gfn, writing ? FOLL_WRITE : 0, + &upgrade_write, &page); + if (is_error_noslot_pfn(pfn)) + return -EFAULT; /* * Read the PTE from the process' radix tree and use that -- 2.51.0 From dac09f61e7325158a797fe56981764a8dc297e89 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:04 -0700 Subject: [PATCH 11/16] KVM: PPC: Drop unused @kvm_ro param from kvmppc_book3s_instantiate_page() Drop @kvm_ro from kvmppc_book3s_instantiate_page() as it is now only written, and never read. No functional change intended. Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-63-seanjc@google.com> --- arch/powerpc/include/asm/kvm_book3s.h | 2 +- arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 ++---- arch/powerpc/kvm/book3s_hv_nested.c | 4 +--- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 10618622d7ef..3d289dbe3982 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -203,7 +203,7 @@ extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, unsigned long gpa, struct kvm_memory_slot *memslot, - bool writing, bool kvm_ro, + bool writing, pte_t *inserted_pte, unsigned int *levelp); extern int kvmppc_init_vm_radix(struct kvm *kvm); extern void kvmppc_free_radix(struct kvm *kvm); diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 14891d0a3b73..b3e6e73d6a08 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -821,7 +821,7 @@ bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing, int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, unsigned long gpa, struct kvm_memory_slot *memslot, - bool writing, bool kvm_ro, + bool writing, pte_t *inserted_pte, unsigned int *levelp) { struct kvm *kvm = vcpu->kvm; @@ -931,7 +931,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot; long ret; bool writing = !!(dsisr & DSISR_ISSTORE); - bool kvm_ro = false; /* Check for unusual errors */ if (dsisr & DSISR_UNSUPP_MMU) { @@ -984,7 +983,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu, ea, DSISR_ISSTORE | DSISR_PROTFAULT); return RESUME_GUEST; } - kvm_ro = true; } /* Failed to set the reference/change bits */ @@ -1002,7 +1000,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu, /* Try to insert a pte */ ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing, - kvm_ro, NULL, NULL); + NULL, NULL); if (ret == 0 || ret == -EAGAIN) ret = RESUME_GUEST; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 05f5220960c6..771173509617 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -1527,7 +1527,6 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, unsigned long n_gpa, gpa, gfn, perm = 0UL; unsigned int shift, l1_shift, level; bool writing = !!(dsisr & DSISR_ISSTORE); - bool kvm_ro = false; long int ret; if (!gp->l1_gr_to_hr) { @@ -1607,7 +1606,6 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, ea, DSISR_ISSTORE | DSISR_PROTFAULT); return RESUME_GUEST; } - kvm_ro = true; } /* 2. Find the host pte for this L1 guest real address */ @@ -1629,7 +1627,7 @@ static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu, if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) { /* No suitable pte found -> try to insert a mapping */ ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, - writing, kvm_ro, &pte, &level); + writing, &pte, &level); if (ret == -EAGAIN) return RESUME_GUEST; else if (ret) -- 2.51.0 From 2b26d6b7a8ba047bec3f6f2b46ca7b33373c50cb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:05 -0700 Subject: [PATCH 12/16] KVM: PPC: Book3S: Mark "struct page" pfns dirty/accessed after installing PTE Mark pages/folios dirty/accessed after installing a PTE, and more specifically after acquiring mmu_lock and checking for an mmu_notifier invalidation. Marking a page/folio dirty after it has been written back can make some filesystems unhappy (backing KVM guests will such filesystem files is uncommon, and the race is minuscule, hence the lack of complaints). See the link below for details. This will also allow converting Book3S to kvm_release_faultin_page(), which requires that mmu_lock be held (for the aforementioned reason). Link: https://lore.kernel.org/all/cover.1683044162.git.lstoakes@gmail.com Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-64-seanjc@google.com> --- arch/powerpc/kvm/book3s_64_mmu_host.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index bc6a381b5346..d0e4f7bbdc3d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -121,13 +121,10 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, vpn = hpt_vpn(orig_pte->eaddr, map->host_vsid, MMU_SEGSIZE_256M); - kvm_set_pfn_accessed(pfn); if (!orig_pte->may_write || !writable) rflags |= PP_RXRX; - else { + else mark_page_dirty(vcpu->kvm, gfn); - kvm_set_pfn_dirty(pfn); - } if (!orig_pte->may_execute) rflags |= HPTE_R_N; @@ -202,8 +199,11 @@ map_again: } out_unlock: + if (!orig_pte->may_write || !writable) + kvm_release_pfn_clean(pfn); + else + kvm_release_pfn_dirty(pfn); spin_unlock(&kvm->mmu_lock); - kvm_release_pfn_clean(pfn); if (cpte) kvmppc_mmu_hpte_cache_free(cpte); -- 2.51.0 From 8b135c77994d06d430c6c29eebdab42411993d9c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:06 -0700 Subject: [PATCH 13/16] KVM: PPC: Use kvm_faultin_pfn() to handle page faults on Book3s PR Convert Book3S PR to __kvm_faultin_pfn()+kvm_release_faultin_page(), which are new APIs to consolidate arch code and provide consistent behavior across all KVM architectures. Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-65-seanjc@google.com> --- arch/powerpc/include/asm/kvm_book3s.h | 2 +- arch/powerpc/kvm/book3s.c | 7 ++++--- arch/powerpc/kvm/book3s_32_mmu_host.c | 7 ++++--- arch/powerpc/kvm/book3s_64_mmu_host.c | 10 +++++----- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 3d289dbe3982..e1ff291ba891 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -235,7 +235,7 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); extern int kvmppc_emulate_paired_single(struct kvm_vcpu *vcpu); extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, - bool writing, bool *writable); + bool writing, bool *writable, struct page **page); extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, unsigned long *rmap, long pte_index, int realmode); extern void kvmppc_update_dirty_map(const struct kvm_memory_slot *memslot, diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index ff6c38373957..d79c5d1098c0 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -422,7 +422,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter); kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, - bool *writable) + bool *writable, struct page **page) { ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM; gfn_t gfn = gpa >> PAGE_SHIFT; @@ -437,13 +437,14 @@ kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, kvm_pfn_t pfn; pfn = (kvm_pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT; - get_page(pfn_to_page(pfn)); + *page = pfn_to_page(pfn); + get_page(*page); if (writable) *writable = true; return pfn; } - return gfn_to_pfn_prot(vcpu->kvm, gfn, writing, writable); + return kvm_faultin_pfn(vcpu, gfn, writing, writable, page); } EXPORT_SYMBOL_GPL(kvmppc_gpa_to_pfn); diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index 4b3a8d80cfa3..5b7212edbb13 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -130,6 +130,7 @@ extern char etext[]; int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, bool iswrite) { + struct page *page; kvm_pfn_t hpaddr; u64 vpn; u64 vsid; @@ -145,7 +146,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, bool writable; /* Get host physical address for gpa */ - hpaddr = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable); + hpaddr = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable, &page); if (is_error_noslot_pfn(hpaddr)) { printk(KERN_INFO "Couldn't get guest page for gpa %lx!\n", orig_pte->raddr); @@ -232,7 +233,7 @@ next_pteg: pte = kvmppc_mmu_hpte_cache_next(vcpu); if (!pte) { - kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); + kvm_release_page_unused(page); r = -EAGAIN; goto out; } @@ -250,7 +251,7 @@ next_pteg: kvmppc_mmu_hpte_cache_map(vcpu, pte); - kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); + kvm_release_page_clean(page); out: return r; } diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index d0e4f7bbdc3d..be20aee6fd7d 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -88,13 +88,14 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, struct hpte_cache *cpte; unsigned long gfn = orig_pte->raddr >> PAGE_SHIFT; unsigned long pfn; + struct page *page; /* used to check for invalidations in progress */ mmu_seq = kvm->mmu_invalidate_seq; smp_rmb(); /* Get host physical address for gpa */ - pfn = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable); + pfn = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable, &page); if (is_error_noslot_pfn(pfn)) { printk(KERN_INFO "Couldn't get guest page for gpa %lx!\n", orig_pte->raddr); @@ -199,10 +200,9 @@ map_again: } out_unlock: - if (!orig_pte->may_write || !writable) - kvm_release_pfn_clean(pfn); - else - kvm_release_pfn_dirty(pfn); + /* FIXME: Don't unconditionally pass unused=false. */ + kvm_release_faultin_page(kvm, page, false, + orig_pte->may_write && writable); spin_unlock(&kvm->mmu_lock); if (cpte) kvmppc_mmu_hpte_cache_free(cpte); -- 2.51.0 From 0fe133a33e4ce333e6f8795e3ec50d94a17c9c16 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:07 -0700 Subject: [PATCH 14/16] KVM: LoongArch: Mark "struct page" pfns dirty only in "slow" page fault path Mark pages/folios dirty only the slow page fault path, i.e. only when mmu_lock is held and the operation is mmu_notifier-protected, as marking a page/folio dirty after it has been written back can make some filesystems unhappy (backing KVM guests will such filesystem files is uncommon, and the race is minuscule, hence the lack of complaints). See the link below for details. Link: https://lore.kernel.org/all/cover.1683044162.git.lstoakes@gmail.com Reviewed-by: Bibo Mao Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-66-seanjc@google.com> --- arch/loongarch/kvm/mmu.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index 28681dfb4b85..cc2a5f289b14 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -608,13 +608,13 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ if (kvm_pte_young(changed)) kvm_set_pfn_accessed(pfn); - if (kvm_pte_dirty(changed)) { - mark_page_dirty(kvm, gfn); - kvm_set_pfn_dirty(pfn); - } if (page) put_page(page); } + + if (kvm_pte_dirty(changed)) + mark_page_dirty(kvm, gfn); + return ret; out: spin_unlock(&kvm->mmu_lock); @@ -915,12 +915,14 @@ retry: else ++kvm->stat.pages; kvm_set_pte(ptep, new_pte); + + if (writeable) + kvm_set_pfn_dirty(pfn); + spin_unlock(&kvm->mmu_lock); - if (prot_bits & _PAGE_DIRTY) { + if (prot_bits & _PAGE_DIRTY) mark_page_dirty_in_slot(kvm, memslot, gfn); - kvm_set_pfn_dirty(pfn); - } kvm_release_pfn_clean(pfn); out: -- 2.51.0 From 4a2bc01b7a963658137a579671cbd75ceb4863aa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:08 -0700 Subject: [PATCH 15/16] KVM: LoongArch: Mark "struct page" pfns accessed only in "slow" page fault path Mark pages accessed only in the slow path, before dropping mmu_lock when faulting in guest memory so that LoongArch can convert to kvm_release_faultin_page() without tripping its lockdep assertion on mmu_lock being held. Reviewed-by: Bibo Mao Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-67-seanjc@google.com> --- arch/loongarch/kvm/mmu.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index cc2a5f289b14..ed43504c5c7e 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -552,12 +552,10 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool write) { int ret = 0; - kvm_pfn_t pfn = 0; kvm_pte_t *ptep, changed, new; gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; struct kvm_memory_slot *slot; - struct page *page; spin_lock(&kvm->mmu_lock); @@ -570,8 +568,6 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ /* Track access to pages marked old */ new = kvm_pte_mkyoung(*ptep); - /* call kvm_set_pfn_accessed() after unlock */ - if (write && !kvm_pte_dirty(new)) { if (!kvm_pte_write(new)) { ret = -EFAULT; @@ -595,22 +591,10 @@ static int kvm_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa, bool writ } changed = new ^ (*ptep); - if (changed) { + if (changed) kvm_set_pte(ptep, new); - pfn = kvm_pte_pfn(new); - page = kvm_pfn_to_refcounted_page(pfn); - if (page) - get_page(page); - } - spin_unlock(&kvm->mmu_lock); - if (changed) { - if (kvm_pte_young(changed)) - kvm_set_pfn_accessed(pfn); - - if (page) - put_page(page); - } + spin_unlock(&kvm->mmu_lock); if (kvm_pte_dirty(changed)) mark_page_dirty(kvm, gfn); -- 2.51.0 From 35b80f7b494d813ebe64d5c261a88fa8c9ed6f31 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 10 Oct 2024 11:24:09 -0700 Subject: [PATCH 16/16] KVM: LoongArch: Mark "struct page" pfn accessed before dropping mmu_lock Mark pages accessed before dropping mmu_lock when faulting in guest memory so that LoongArch can convert to kvm_release_faultin_page() without tripping its lockdep assertion on mmu_lock being held. Reviewed-by: Bibo Mao Signed-off-by: Sean Christopherson Tested-by: Dmitry Osipenko Signed-off-by: Paolo Bonzini Message-ID: <20241010182427.1434605-68-seanjc@google.com> --- arch/loongarch/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c index ed43504c5c7e..7066cafcce64 100644 --- a/arch/loongarch/kvm/mmu.c +++ b/arch/loongarch/kvm/mmu.c @@ -902,13 +902,13 @@ retry: if (writeable) kvm_set_pfn_dirty(pfn); + kvm_release_pfn_clean(pfn); spin_unlock(&kvm->mmu_lock); if (prot_bits & _PAGE_DIRTY) mark_page_dirty_in_slot(kvm, memslot, gfn); - kvm_release_pfn_clean(pfn); out: srcu_read_unlock(&kvm->srcu, srcu_idx); return err; -- 2.51.0