From d438f3a9df9fd01acb9e780449b88f94de99cbeb Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Wed, 6 Jul 2022 16:59:24 -0700 Subject: [PATCH] mm/khugepaged: add flag to predicate khugepaged-only behavior Add .is_khugepaged flag to struct collapse_control so khugepaged-specific behavior can be elided by MADV_COLLAPSE context. Start by protecting khugepaged-specific heuristics by this flag. In MADV_COLLAPSE, the user presumably has reason to believe the collapse will be beneficial and khugepaged heuristics shouldn't prevent the user from doing so: 1) sysfs-controlled knobs khugepaged_max_ptes_[none|swap|shared] 2) requirement that some pages in region being collapsed be young or referenced Link: https://lkml.kernel.org/r/20220706235936.2197195-7-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Alex Shi Cc: Andrea Arcangeli Cc: Arnd Bergmann Cc: Axel Rasmussen Cc: Chris Kennelly Cc: Chris Zankel Cc: David Hildenbrand Cc: David Rientjes Cc: Helge Deller Cc: Hugh Dickins Cc: Ivan Kokshaysky Cc: James Bottomley Cc: Jens Axboe Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Miaohe Lin Cc: Michal Hocko Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavel Begunkov Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Cc: Zi Yan Cc: Dan Carpenter Cc: "Souptick Joarder (HPE)" Signed-off-by: Andrew Morton --- mm/khugepaged.c | 62 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 43 insertions(+), 19 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c022d8810d8f..dad551a0ced7 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -73,6 +73,8 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. + * + * Note that these are only respected if collapse was initiated by khugepaged. */ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; @@ -86,6 +88,8 @@ static struct kmem_cache *mm_slot_cache __read_mostly; #define MAX_PTE_MAPPED_THP 8 struct collapse_control { + bool is_khugepaged; + /* Num pages scanned per node */ #if defined(CONFIG_PPC64) u32 node_load[MAX_NUMNODES]; @@ -559,6 +563,7 @@ static bool is_refcount_suitable(struct page *page) static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, + struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; @@ -572,7 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (++none_or_zero <= khugepaged_max_ptes_none || + !cc->is_khugepaged)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -592,8 +598,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { + if (cc->is_khugepaged && page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; @@ -659,10 +665,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (PageCompound(page)) list_add_tail(&page->lru, compound_pagelist); next: - /* There should be enough young pte to collapse the page */ - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; if (pte_write(pteval)) @@ -671,7 +681,7 @@ next: if (unlikely(!writable)) { result = SCAN_PAGE_RO; - } else if (unlikely(!referenced)) { + } else if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; @@ -750,6 +760,7 @@ static void khugepaged_alloc_sleep(void) struct collapse_control khugepaged_collapse_control = { + .is_khugepaged = true, .last_target_node = NUMA_NO_NODE, }; @@ -1030,7 +1041,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); - result = __collapse_huge_page_isolate(vma, address, pte, + result = __collapse_huge_page_isolate(vma, address, pte, cc, &compound_pagelist); spin_unlock(pte_ptl); @@ -1121,7 +1132,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { - if (++unmapped <= khugepaged_max_ptes_swap) { + if (++unmapped <= khugepaged_max_ptes_swap || + !cc->is_khugepaged) { /* * Always be strict with uffd-wp * enabled swap entries. Please see @@ -1140,7 +1152,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (++none_or_zero <= khugepaged_max_ptes_none || + !cc->is_khugepaged)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -1170,8 +1183,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unmap; } - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { + if (cc->is_khugepaged && + page_mapcount(page) > 1 && + ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; @@ -1225,14 +1239,22 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct vm_area_struct *vma, result = SCAN_PAGE_COUNT; goto out_unmap; } - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; } if (!writable) { result = SCAN_PAGE_RO; - } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) { + } else if (cc->is_khugepaged && + (!referenced || + (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; @@ -1901,7 +1923,8 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, continue; if (xa_is_value(page)) { - if (++swap > khugepaged_max_ptes_swap) { + if (cc->is_khugepaged && + ++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; @@ -1952,7 +1975,8 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, rcu_read_unlock(); if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none && + cc->is_khugepaged) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { -- 2.50.1