mm: abstract THP allocation
author Dev Jain <dev.jain@arm.com>
Tue, 8 Oct 2024 06:17:45 +0000 (11:47 +0530)
committer Andrew Morton <akpm@linux-foundation.org>
Fri, 1 Nov 2024 04:29:04 +0000 (21:29 -0700)
Patch series "Do not shatter hugezeropage on wp-fault", v7.

It was observed at [1] and [2] that the current kernel behaviour of
shattering a hugezeropage is inconsistent and suboptimal.  For a VMA with
a THP-allowable order, a write fault installs a PMD-mapped THP.  If,
however, the first fault is a read fault, we get a PMD pointing to the
hugezeropage; a subsequent write then triggers a write-protection fault
that shatters the hugezeropage PMD into PTEs, leaving one writable page
and all the other PTEs write-protected.  Compared to the single
write-fault case, an application using the VMA this way suffers 512 extra
page faults (one per base page of the PMD-sized region), plus the
overhead of khugepaged later trying to collapse that area back into a THP
anyway.

Instead, replace the hugezeropage with a THP on wp-fault.

[1]: https://lore.kernel.org/all/3743d7e1-0b79-4eaf-82d5-d1ca29fe347d@arm.com/
[2]: https://lore.kernel.org/all/1cfae0c0-96a2-4308-9c62-f7a640520242@arm.com/
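
As an illustration only (not part of this series), a minimal userspace
sketch of the read-then-write pattern described above might look as
follows; the names (raw, buf, PMD_SIZE) are hypothetical, and it assumes
4 KiB base pages, a 2 MiB PMD size and THP enabled for the mapping:

  /* Illustrative sketch; 2 MiB PMD and 4 KiB base pages assumed. */
  #include <string.h>
  #include <sys/mman.h>

  #define PMD_SIZE (2UL << 20)

  int main(void)
  {
          /* Over-allocate so a PMD-aligned 2 MiB window can be carved out. */
          size_t len = 2 * PMD_SIZE;
          char *raw = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          if (raw == MAP_FAILED)
                  return 1;
          char *buf = (char *)(((unsigned long)raw + PMD_SIZE - 1) &
                               ~(PMD_SIZE - 1));
          madvise(buf, PMD_SIZE, MADV_HUGEPAGE);

          /*
           * Write-first: a single write fault installs a PMD-mapped THP.
           * Read-first (shown here): the read fault maps the hugezeropage;
           * the memset below write-faults and, before this series, the
           * kernel shattered the zero-page PMD into PTEs, so the remaining
           * base pages each took a further fault.
           */
          volatile char c = buf[0];       /* read fault -> hugezeropage PMD */
          (void)c;
          memset(buf, 1, PMD_SIZE);       /* wp-fault, then per-page faults */

          munmap(raw, len);
          return 0;
  }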

This patch (of 2):

In preparation for the second patch, abstract away the THP allocation
logic present in the create_huge_pmd() path, which corresponds to the
faulting case when no page is present.

There should be no functional change as a result of applying this patch,
except that, as David notes at [1], update_mmu_cache_pmd() is now passed
a PMD-aligned address.

[1]: https://lore.kernel.org/all/ddd3fcd2-48b3-4170-bcaa-2fe66e093f43@redhat.com/

Link: https://lkml.kernel.org/r/20241008061746.285961-1-dev.jain@arm.com
Link: https://lkml.kernel.org/r/20241008061746.285961-2-dev.jain@arm.com
Signed-off-by: Dev Jain <dev.jain@arm.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Lameter <cl@gentwo.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Lance Yang <ioworker0@gmail.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Cc: Yang Shi <yang@os.amperecomputing.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
mm/huge_memory.c

index 48cb888d2c5babd170938b450308ecfb417555b4..a6f53ed005921d1e2e0cdc05f3e2e74bb7527cce 100644
@@ -1139,47 +1139,81 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 
-static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
-                       struct page *page, gfp_t gfp)
+static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
+               unsigned long addr)
 {
-       struct vm_area_struct *vma = vmf->vma;
-       struct folio *folio = page_folio(page);
-       pgtable_t pgtable;
-       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-       vm_fault_t ret = 0;
+       gfp_t gfp = vma_thp_gfp_mask(vma);
+       const int order = HPAGE_PMD_ORDER;
+       struct folio *folio;
 
-       VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+       folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true);
 
+       if (unlikely(!folio)) {
+               count_vm_event(THP_FAULT_FALLBACK);
+               count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+               return NULL;
+       }
+
+       VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
        if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
                folio_put(folio);
                count_vm_event(THP_FAULT_FALLBACK);
                count_vm_event(THP_FAULT_FALLBACK_CHARGE);
-               count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
-               count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
-               return VM_FAULT_FALLBACK;
+               count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+               count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+               return NULL;
        }
        folio_throttle_swaprate(folio, gfp);
 
-       pgtable = pte_alloc_one(vma->vm_mm);
-       if (unlikely(!pgtable)) {
-               ret = VM_FAULT_OOM;
-               goto release;
-       }
-
-       folio_zero_user(folio, vmf->address);
+       folio_zero_user(folio, addr);
        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * folio_zero_user writes become visible before the set_pmd_at()
         * write.
         */
        __folio_mark_uptodate(folio);
+       return folio;
+}
+
+static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
+               struct vm_area_struct *vma, unsigned long haddr)
+{
+       pmd_t entry;
+
+       entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+       folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
+       folio_add_lru_vma(folio, vma);
+       set_pmd_at(vma->vm_mm, haddr, pmd, entry);
+       update_mmu_cache_pmd(vma, haddr, pmd);
+       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+       count_vm_event(THP_FAULT_ALLOC);
+       count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
+       count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+}
+
+static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
+{
+       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+       struct vm_area_struct *vma = vmf->vma;
+       struct folio *folio;
+       pgtable_t pgtable;
+       vm_fault_t ret = 0;
+
+       folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
+       if (unlikely(!folio))
+               return VM_FAULT_FALLBACK;
+
+       pgtable = pte_alloc_one(vma->vm_mm);
+       if (unlikely(!pgtable)) {
+               ret = VM_FAULT_OOM;
+               goto release;
+       }
 
        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_none(*vmf->pmd))) {
                goto unlock_release;
        } else {
-               pmd_t entry;
-
                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        goto unlock_release;
@@ -1193,21 +1227,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                        VM_BUG_ON(ret & VM_FAULT_FALLBACK);
                        return ret;
                }
-
-               entry = mk_huge_pmd(page, vma->vm_page_prot);
-               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-               folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
-               folio_add_lru_vma(folio, vma);
                pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
-               set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
-               update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-               add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+               map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
                mm_inc_nr_ptes(vma->vm_mm);
                deferred_split_folio(folio, false);
                spin_unlock(vmf->ptl);
-               count_vm_event(THP_FAULT_ALLOC);
-               count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
-               count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
        }
 
        return 0;
@@ -1274,8 +1298,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
-       gfp_t gfp;
-       struct folio *folio;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        vm_fault_t ret;
 
@@ -1326,14 +1348,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                }
                return ret;
        }
-       gfp = vma_thp_gfp_mask(vma);
-       folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
-       if (unlikely(!folio)) {
-               count_vm_event(THP_FAULT_FALLBACK);
-               count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
-               return VM_FAULT_FALLBACK;
-       }
-       return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
+
+       return __do_huge_pmd_anonymous_page(vmf);
 }
 
 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,