From 2686d514c345243e06dba40298a13c7803400a61 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:48 +0800 Subject: [PATCH 01/16] mm: make zap_pte_range() handle full within-PMD range In preparation for reclaiming empty PTE pages, this commit first makes zap_pte_range() to handle the full within-PMD range, so that we can more easily detect and free PTE pages in this function in subsequent commits. Link: https://lkml.kernel.org/r/76c95ee641da7808cd66d642ab95841df4048295.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Jann Horn Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Peter Zijlstra Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- mm/memory.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index 694156ecaf95..d4d5bd7046e7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1718,6 +1718,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *pte; int nr; +retry: tlb_change_page_size(tlb, PAGE_SIZE); init_rss_vec(rss); start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1757,6 +1758,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (force_flush) tlb_flush_mmu(tlb); + if (addr != end) { + cond_resched(); + force_flush = false; + force_break = false; + goto retry; + } + return addr; } -- 2.51.0 From 6375e95f381e3dc85065b6f74263a61522736203 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:49 +0800 Subject: [PATCH 02/16] mm: pgtable: reclaim empty PTE page in madvise(MADV_DONTNEED) Now in order to pursue high performance, applications mostly use some high-performance user-mode memory allocators, such as jemalloc or tcmalloc. These memory allocators use madvise(MADV_DONTNEED or MADV_FREE) to release physical memory, but neither MADV_DONTNEED nor MADV_FREE will release page table memory, which may cause huge page table memory usage. The following are a memory usage snapshot of one process which actually happened on our server: VIRT: 55t RES: 590g VmPTE: 110g In this case, most of the page table entries are empty. For such a PTE page where all entries are empty, we can actually free it back to the system for others to use. As a first step, this commit aims to synchronously free the empty PTE pages in madvise(MADV_DONTNEED) case. We will detect and free empty PTE pages in zap_pte_range(), and will add zap_details.reclaim_pt to exclude cases other than madvise(MADV_DONTNEED). Once an empty PTE is detected, we first try to hold the pmd lock within the pte lock. If successful, we clear the pmd entry directly (fast path). Otherwise, we wait until the pte lock is released, then re-hold the pmd and pte locks and loop PTRS_PER_PTE times to check pte_none() to re-detect whether the PTE page is empty and free it (slow path). For other cases such as madvise(MADV_FREE), consider scanning and freeing empty PTE pages asynchronously in the future. The following code snippet can show the effect of optimization: mmap 50G while (1) { for (; i < 1024 * 25; i++) { touch 2M memory madvise MADV_DONTNEED 2M } } As we can see, the memory usage of VmPTE is reduced: before after VIRT 50.0 GB 50.0 GB RES 3.1 MB 3.1 MB VmPTE 102640 KB 240 KB [zhengqi.arch@bytedance.com: fix uninitialized symbol 'ptl'] Link: https://lkml.kernel.org/r/20241206112348.51570-1-zhengqi.arch@bytedance.com Link: https://lore.kernel.org/linux-mm/224e6a4e-43b5-4080-bdd8-b0a6fb2f0853@stanley.mountain/ Link: https://lkml.kernel.org/r/92aba2b319a734913f18ba41e7d86a265f0b84e2.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Andy Lutomirski Cc: Catalin Marinas Cc: Dave Hansen Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Peter Zijlstra Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 + mm/Kconfig | 15 ++++++++++ mm/Makefile | 1 + mm/internal.h | 19 +++++++++++++ mm/madvise.c | 7 ++++- mm/memory.c | 21 ++++++++++++-- mm/pt_reclaim.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 132 insertions(+), 3 deletions(-) create mode 100644 mm/pt_reclaim.c diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e5ef71b8629..9372bc058b43 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2320,6 +2320,7 @@ extern void pagefault_out_of_memory(void); struct zap_details { struct folio *single_folio; /* Locked folio to be unmapped */ bool even_cows; /* Zap COWed private pages too? */ + bool reclaim_pt; /* Need reclaim page tables? */ zap_flags_t zap_flags; /* Extra flags for zapping */ }; diff --git a/mm/Kconfig b/mm/Kconfig index 84000b016808..7949ab121070 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1301,6 +1301,21 @@ config ARCH_HAS_USER_SHADOW_STACK The architecture has hardware support for userspace shadow call stacks (eg, x86 CET, arm64 GCS or RISC-V Zicfiss). +config ARCH_SUPPORTS_PT_RECLAIM + def_bool n + +config PT_RECLAIM + bool "reclaim empty user page table pages" + default y + depends on ARCH_SUPPORTS_PT_RECLAIM && MMU && SMP + select MMU_GATHER_RCU_TABLE_FREE + help + Try to reclaim empty user page table pages in paths other than munmap + and exit_mmap path. + + Note: now only empty user PTE page table pages will be reclaimed. + + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index dba52bb0da8a..850386a67b3e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -146,3 +146,4 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o obj-$(CONFIG_EXECMEM) += execmem.o obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o +obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o diff --git a/mm/internal.h b/mm/internal.h index 02890b29da5f..b438d35045de 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1541,4 +1541,23 @@ int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); +/* pt_reclaim.c */ +bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval); +void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, + pmd_t pmdval); +void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, + struct mmu_gather *tlb); + +#ifdef CONFIG_PT_RECLAIM +bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details); +#else +static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details) +{ + return false; +} +#endif /* CONFIG_PT_RECLAIM */ + + #endif /* __MM_INTERNAL_H */ diff --git a/mm/madvise.c b/mm/madvise.c index 0ceae57da7da..49f3a75046f6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -851,7 +851,12 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, static long madvise_dontneed_single_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - zap_page_range_single(vma, start, end - start, NULL); + struct zap_details details = { + .reclaim_pt = true, + .even_cows = true, + }; + + zap_page_range_single(vma, start, end - start, &details); return 0; } diff --git a/mm/memory.c b/mm/memory.c index d4d5bd7046e7..560520e20ead 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1436,7 +1436,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) static inline bool should_zap_cows(struct zap_details *details) { /* By default, zap all pages */ - if (!details) + if (!details || details->reclaim_pt) return true; /* Or, we zap COWed pages only if the caller wants to */ @@ -1710,12 +1710,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct zap_details *details) { bool force_flush = false, force_break = false; - bool any_skipped = false; struct mm_struct *mm = tlb->mm; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; pte_t *pte; + pmd_t pmdval; + unsigned long start = addr; + bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details); + bool direct_reclaim = false; int nr; retry: @@ -1728,17 +1731,24 @@ retry: flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { + bool any_skipped = false; + if (need_resched()) break; nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss, &force_flush, &force_break, &any_skipped); + if (any_skipped) + can_reclaim_pt = false; if (unlikely(force_break)) { addr += nr * PAGE_SIZE; break; } } while (pte += nr, addr += PAGE_SIZE * nr, addr != end); + if (can_reclaim_pt && addr == end) + direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); @@ -1765,6 +1775,13 @@ retry: goto retry; } + if (can_reclaim_pt) { + if (direct_reclaim) + free_pte(mm, start, tlb, pmdval); + else + try_to_free_pte(mm, pmd, start, tlb); + } + return addr; } diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c new file mode 100644 index 000000000000..7e9455a18aae --- /dev/null +++ b/mm/pt_reclaim.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +#include "internal.h" + +bool reclaim_pt_is_enabled(unsigned long start, unsigned long end, + struct zap_details *details) +{ + return details && details->reclaim_pt && (end - start >= PMD_SIZE); +} + +bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval) +{ + spinlock_t *pml = pmd_lockptr(mm, pmd); + + if (!spin_trylock(pml)) + return false; + + *pmdval = pmdp_get_lockless(pmd); + pmd_clear(pmd); + spin_unlock(pml); + + return true; +} + +void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb, + pmd_t pmdval) +{ + pte_free_tlb(tlb, pmd_pgtable(pmdval), addr); + mm_dec_nr_ptes(mm); +} + +void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, + struct mmu_gather *tlb) +{ + pmd_t pmdval; + spinlock_t *pml, *ptl = NULL; + pte_t *start_pte, *pte; + int i; + + pml = pmd_lock(mm, pmd); + start_pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl); + if (!start_pte) + goto out_ptl; + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + + /* Check if it is empty PTE page */ + for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) { + if (!pte_none(ptep_get(pte))) + goto out_ptl; + } + pte_unmap(start_pte); + + pmd_clear(pmd); + + if (ptl != pml) + spin_unlock(ptl); + spin_unlock(pml); + + free_pte(mm, addr, tlb, pmdval); + + return; +out_ptl: + if (start_pte) + pte_unmap_unlock(start_pte, ptl); + if (ptl != pml) + spin_unlock(pml); +} -- 2.51.0 From 718b13861d2256ac95d65b892953282a63faf240 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:50 +0800 Subject: [PATCH 03/16] x86: mm: free page table pages by RCU instead of semi RCU Now, if CONFIG_MMU_GATHER_RCU_TABLE_FREE is selected, the page table pages will be freed by semi RCU, that is: - batch table freeing: asynchronous free by RCU - single table freeing: IPI + synchronous free In this way, the page table can be lockless traversed by disabling IRQ in paths such as fast GUP. But this is not enough to free the empty PTE page table pages in paths other that munmap and exit_mmap path, because IPI cannot be synchronized with rcu_read_lock() in pte_offset_map{_lock}(). In preparation for supporting empty PTE page table pages reclaimation, let single table also be freed by RCU like batch table freeing. Then we can also use pte_offset_map() etc to prevent PTE page from being freed. Like pte_free_defer(), we can also safely use ptdesc->pt_rcu_head to free the page table pages: - The pt_rcu_head is unioned with pt_list and pmd_huge_pte. - For pt_list, it is used to manage the PGD page in x86. Fortunately tlb_remove_table() will not be used for free PGD pages, so it is safe to use pt_rcu_head. - For pmd_huge_pte, it is used for THPs, so it is safe. After applying this patch, if CONFIG_PT_RECLAIM is enabled, the function call of free_pte() is as follows: free_pte pte_free_tlb __pte_free_tlb ___pte_free_tlb paravirt_tlb_remove_table tlb_remove_table [!CONFIG_PARAVIRT, Xen PV, Hyper-V, KVM] [no-free-memory slowpath:] tlb_table_invalidate tlb_remove_table_one __tlb_remove_table_one [frees via RCU] [fastpath:] tlb_table_flush tlb_remove_table_free [frees via RCU] native_tlb_remove_table [CONFIG_PARAVIRT on native] tlb_remove_table [see above] Link: https://lkml.kernel.org/r/0287d442a973150b0e1019cc406e6322d148277a.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- arch/x86/include/asm/tlb.h | 20 ++++++++++++++++++++ arch/x86/kernel/paravirt.c | 7 +++++++ arch/x86/mm/pgtable.c | 10 +++++++++- include/linux/mm_types.h | 4 +++- mm/mmu_gather.c | 9 ++++++++- 5 files changed, 47 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 4d3c9d00d6b6..73f0786181cc 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -34,8 +34,28 @@ static inline void __tlb_remove_table(void *table) free_page_and_swap_cache(table); } +#ifdef CONFIG_PT_RECLAIM +static inline void __tlb_remove_table_one_rcu(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + put_page(page); +} + +static inline void __tlb_remove_table_one(void *table) +{ + struct page *page; + + page = table; + call_rcu(&page->rcu_head, __tlb_remove_table_one_rcu); +} +#define __tlb_remove_table_one __tlb_remove_table_one +#endif /* CONFIG_PT_RECLAIM */ + static inline void invlpg(unsigned long addr) { asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); } + #endif /* _ASM_X86_TLB_H */ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index fec381533555..89688921ea62 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -59,10 +59,17 @@ void __init native_pv_lock_init(void) static_branch_enable(&virt_spin_lock_key); } +#ifndef CONFIG_PT_RECLAIM static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) { tlb_remove_page(tlb, table); } +#else +static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_table(tlb, table); +} +#endif struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 5745a354a241..69a357b15974 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -19,12 +19,20 @@ EXPORT_SYMBOL(physical_mask); #endif #ifndef CONFIG_PARAVIRT +#ifndef CONFIG_PT_RECLAIM static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) { tlb_remove_page(tlb, table); } -#endif +#else +static inline +void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_table(tlb, table); +} +#endif /* !CONFIG_PT_RECLAIM */ +#endif /* !CONFIG_PARAVIRT */ gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 332cee285662..7490d84af310 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -438,7 +438,9 @@ FOLIO_MATCH(compound_head, _head_2a); * struct ptdesc - Memory descriptor for page tables. * @__page_flags: Same as page flags. Powerpc only. * @pt_rcu_head: For freeing page table pages. - * @pt_list: List of used page tables. Used for s390 and x86. + * @pt_list: List of used page tables. Used for s390 gmap shadow pages + * (which are not linked into the user page tables) and x86 + * pgds. * @_pt_pad_1: Padding that aliases with page's compound head. * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. * @__page_mapping: Aliases with page->mapping. Unused for page tables. diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 99b3e9408aa0..1e21022bcf33 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -311,11 +311,18 @@ static inline void tlb_table_invalidate(struct mmu_gather *tlb) } } -static void tlb_remove_table_one(void *table) +#ifndef __tlb_remove_table_one +static inline void __tlb_remove_table_one(void *table) { tlb_remove_table_sync_one(); __tlb_remove_table(table); } +#endif + +static void tlb_remove_table_one(void *table) +{ + __tlb_remove_table_one(table); +} static void tlb_table_flush(struct mmu_gather *tlb) { -- 2.51.0 From 4817f70c25b63ee5e6fd42d376700c058ae16a96 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 4 Dec 2024 19:09:51 +0800 Subject: [PATCH 04/16] x86: select ARCH_SUPPORTS_PT_RECLAIM if X86_64 Now, x86 has fully supported the CONFIG_PT_RECLAIM feature, and reclaiming PTE pages is profitable only on 64-bit systems, so select ARCH_SUPPORTS_PT_RECLAIM if X86_64. Link: https://lkml.kernel.org/r/841c1f35478d5354872d307888979c9e20de9c09.1733305182.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: Jann Horn Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mel Gorman Cc: Muchun Song Cc: Peter Xu Cc: Will Deacon Cc: Zach O'Keefe Cc: Dan Carpenter Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9d7bd0ae48c4..2e1a3e4386de 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -322,6 +322,7 @@ config X86 select FUNCTION_ALIGNMENT_4B imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE + select ARCH_SUPPORTS_PT_RECLAIM if X86_64 config INSTRUCTION_DECODER def_bool y -- 2.51.0 From 67c8b11bd58aee4644c9a6e495d0c234771e9175 Mon Sep 17 00:00:00 2001 From: Wenchao Hao Date: Mon, 2 Dec 2024 20:47:30 +0800 Subject: [PATCH 05/16] mm: add per-order mTHP swap-in fallback/fallback_charge counters Currently, large folio swap-in is supported, but we lack a method to analyze their success ratio. Similar to anon_fault_fallback, we introduce per-order mTHP swpin_fallback and swpin_fallback_charge counters for calculating their success ratio. The new counters are located at: /sys/kernel/mm/transparent_hugepage/hugepages-/stats/ swpin_fallback swpin_fallback_charge Link: https://lkml.kernel.org/r/20241202124730.2407037-1-haowenchao22@gmail.com Signed-off-by: Wenchao Hao Reviewed-by: Barry Song Reviewed-by: Lance Yang Cc: Baolin Wang Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Matthew Wilcox Cc: Peter Xu Cc: Ryan Roberts Cc: Usama Arif Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 10 ++++++++++ include/linux/huge_mm.h | 2 ++ mm/huge_memory.c | 6 ++++++ mm/memory.c | 2 ++ 4 files changed, 20 insertions(+) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index d870f83775bc..dff8d5985f0f 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -591,6 +591,16 @@ swpin is incremented every time a huge page is swapped in from a non-zswap swap device in one piece. +swpin_fallback + is incremented if swapin fails to allocate or charge a huge page + and instead falls back to using huge pages with lower orders or + small pages. + +swpin_fallback_charge + is incremented if swapin fails to charge a huge page and instead + falls back to using huge pages with lower orders or small pages + even though the allocation was successful. + swpout is incremented every time a huge page is swapped out to a non-zswap swap device in one piece without splitting. diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b94c2e8ee918..93e509b6c00e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -121,6 +121,8 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_ZSWPOUT, MTHP_STAT_SWPIN, + MTHP_STAT_SWPIN_FALLBACK, + MTHP_STAT_SWPIN_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 45901dc6710c..6d87db53db33 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -617,6 +617,8 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN); +DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK); +DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); #ifdef CONFIG_SHMEM @@ -637,6 +639,8 @@ static struct attribute *anon_stats_attrs[] = { #ifndef CONFIG_SHMEM &zswpout_attr.attr, &swpin_attr.attr, + &swpin_fallback_attr.attr, + &swpin_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -669,6 +673,8 @@ static struct attribute *any_stats_attrs[] = { #ifdef CONFIG_SHMEM &zswpout_attr.attr, &swpin_attr.attr, + &swpin_fallback_attr.attr, + &swpin_fallback_charge_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif diff --git a/mm/memory.c b/mm/memory.c index 560520e20ead..2d97a17dd3ba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4258,8 +4258,10 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) return folio; + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE); folio_put(folio); } + count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK); order = next_order(&orders, order); } -- 2.51.0 From 19b65ffae985c16da237edccae324bcbf1a5408e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 5 Dec 2024 19:07:48 +0000 Subject: [PATCH 06/16] selftests/mm: add fork CoW guard page test When we fork anonymous pages, apply a guard page then remove it, the previous CoW mapping is cleared. This might not be obvious to an outside observer without taking some time to think about how the overall process functions, so document that this is the case through a test, which also usefully asserts that the behaviour is as we expect. This is grouped with other, more important, fork tests that ensure that guard pages are correctly propagated on fork. Fix a typo in a nearby comment at the same time. [ryan.roberts@arm.com: static process_madvise() wrapper for guard-pages] Link: https://lkml.kernel.org/r/20250107142937.1870478-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20241205190748.115656-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Signed-off-by: Ryan Roberts Reviewed-by: Liam R. Howlett Cc: Jann Horn Cc: Shuah Khan Cc: Vlastimil Babka Cc: Ryan Roberts Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/guard-pages.c | 83 ++++++++++++++++++++++- tools/testing/selftests/mm/run_vmtests.sh | 5 ++ 2 files changed, 85 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/guard-pages.c b/tools/testing/selftests/mm/guard-pages.c index 7cdf815d0d63..ece37212a8a2 100644 --- a/tools/testing/selftests/mm/guard-pages.c +++ b/tools/testing/selftests/mm/guard-pages.c @@ -55,6 +55,12 @@ static int pidfd_open(pid_t pid, unsigned int flags) return syscall(SYS_pidfd_open, pid, flags); } +static ssize_t sys_process_madvise(int pidfd, const struct iovec *iovec, + size_t n, int advice, unsigned int flags) +{ + return syscall(__NR_process_madvise, pidfd, iovec, n, advice, flags); +} + /* * Enable our signal catcher and try to read/write the specified buffer. The * return value indicates whether the read/write succeeds without a fatal @@ -419,7 +425,7 @@ TEST_F(guard_pages, process_madvise) ASSERT_EQ(munmap(&ptr_region[99 * page_size], page_size), 0); /* Now guard in one step. */ - count = process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0); + count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0); /* OK we don't have permission to do this, skip. */ if (count == -1 && errno == EPERM) @@ -440,7 +446,7 @@ TEST_F(guard_pages, process_madvise) ASSERT_FALSE(try_read_write_buf(&ptr3[19 * page_size])); /* Now do the same with unguard... */ - count = process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0); + count = sys_process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0); /* ...and everything should now succeed. */ @@ -990,7 +996,7 @@ TEST_F(guard_pages, fork) MAP_ANON | MAP_PRIVATE, -1, 0); ASSERT_NE(ptr, MAP_FAILED); - /* Establish guard apges in the first 5 pages. */ + /* Establish guard pages in the first 5 pages. */ ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0); pid = fork(); @@ -1029,6 +1035,77 @@ TEST_F(guard_pages, fork) ASSERT_EQ(munmap(ptr, 10 * page_size), 0); } +/* + * Assert expected behaviour after we fork populated ranges of anonymous memory + * and then guard and unguard the range. + */ +TEST_F(guard_pages, fork_cow) +{ + const unsigned long page_size = self->page_size; + char *ptr; + pid_t pid; + int i; + + /* Map 10 pages. */ + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Populate range. */ + for (i = 0; i < 10 * page_size; i++) { + char chr = 'a' + (i % 26); + + ptr[i] = chr; + } + + pid = fork(); + ASSERT_NE(pid, -1); + if (!pid) { + /* This is the child process now. */ + + /* Ensure the range is as expected. */ + for (i = 0; i < 10 * page_size; i++) { + char expected = 'a' + (i % 26); + char actual = ptr[i]; + + ASSERT_EQ(actual, expected); + } + + /* Establish guard pages across the whole range. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0); + /* Remove it. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0); + + /* + * By removing the guard pages, the page tables will be + * cleared. Assert that we are looking at the zero page now. + */ + for (i = 0; i < 10 * page_size; i++) { + char actual = ptr[i]; + + ASSERT_EQ(actual, '\0'); + } + + exit(0); + } + + /* Parent process. */ + + /* Parent simply waits on child. */ + waitpid(pid, NULL, 0); + + /* Ensure the range is unchanged in parent anon range. */ + for (i = 0; i < 10 * page_size; i++) { + char expected = 'a' + (i % 26); + char actual = ptr[i]; + + ASSERT_EQ(actual, expected); + } + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + /* * Assert that forking a process with VMAs that do have VM_WIPEONFORK set * behave as expected. diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 2fc290d9430c..00c3f07ea100 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -45,6 +45,8 @@ separated by spaces: vmalloc smoke tests - hmm hmm smoke tests +- madv_guard + test madvise(2) MADV_GUARD_INSTALL and MADV_GUARD_REMOVE options - madv_populate test memadvise(2) MADV_POPULATE_{READ,WRITE} options - memfd_secret @@ -375,6 +377,9 @@ CATEGORY="mremap" run_test ./mremap_dontunmap CATEGORY="hmm" run_test bash ./test_hmm.sh smoke +# MADV_GUARD_INSTALL and MADV_GUARD_REMOVE tests +CATEGORY="madv_guard" run_test ./guard-pages + # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests CATEGORY="madv_populate" run_test ./madv_populate -- 2.51.0 From a474d84359d1a25feae2aafacf35dcaa196fe5f3 Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Thu, 5 Dec 2024 17:45:21 +0800 Subject: [PATCH 07/16] mm/shmem: refactor to reuse vfs_parse_monolithic_sep for option parsing shmem_parse_options() is refactored to use vfs_parse_monolithic_sep() with a custom separator function, shmem_next_opt(). This eliminates redundant logic for parsing comma-separated options and ensures consistency with other kernel code that uses the same interface. The vfs_parse_monolithic_sep() helper was introduced in commit e001d1447cd4 ("fs: factor out vfs_parse_monolithic_sep() helper"). Link: https://lkml.kernel.org/r/20241205094521.1244678-1-guoweikang.kernel@gmail.com Signed-off-by: Guo Weikang Cc: Amir Goldstein Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 65 +++++++++++++++++++++++------------------------------- 1 file changed, 27 insertions(+), 38 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 80af54b0b527..0892889744a2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4647,48 +4647,37 @@ bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } -static int shmem_parse_options(struct fs_context *fc, void *data) +static char *shmem_next_opt(char **s) { - char *options = data; + char *sbegin = *s; + char *p; - if (options) { - int err = security_sb_eat_lsm_opts(options, &fc->security); - if (err) - return err; - } + if (sbegin == NULL) + return NULL; - while (options != NULL) { - char *this_char = options; - for (;;) { - /* - * NUL-terminate this option: unfortunately, - * mount options form a comma-separated list, - * but mpol's nodelist may also contain commas. - */ - options = strchr(options, ','); - if (options == NULL) - break; - options++; - if (!isdigit(*options)) { - options[-1] = '\0'; - break; - } - } - if (*this_char) { - char *value = strchr(this_char, '='); - size_t len = 0; - int err; - - if (value) { - *value++ = '\0'; - len = strlen(value); - } - err = vfs_parse_fs_string(fc, this_char, value, len); - if (err < 0) - return err; + /* + * NUL-terminate this option: unfortunately, + * mount options form a comma-separated list, + * but mpol's nodelist may also contain commas. + */ + for (;;) { + p = strchr(*s, ','); + if (p == NULL) + break; + *s = p + 1; + if (!isdigit(*(p+1))) { + *p = '\0'; + return sbegin; } } - return 0; + + *s = NULL; + return sbegin; +} + +static int shmem_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, shmem_next_opt); } /* @@ -5038,7 +5027,7 @@ static const struct fs_context_operations shmem_fs_context_ops = { .free = shmem_free_fc, .get_tree = shmem_get_tree, #ifdef CONFIG_TMPFS - .parse_monolithic = shmem_parse_options, + .parse_monolithic = shmem_parse_monolithic, .parse_param = shmem_parse_one, .reconfigure = shmem_reconfigure, #endif -- 2.51.0 From dba4761a3e40433a8d9e434d515ecbae19b3dcb1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:14 -0800 Subject: [PATCH 08/16] seqlock: add raw_seqcount_try_begin Add raw_seqcount_try_begin() to opens a read critical section of the given seqcount_t if the counter is even. This enables eliding the critical section entirely if the counter is odd, instead of doing the speculation knowing it will fail. Link: https://lkml.kernel.org/r/20241122174416.1367052-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: David Hildenbrand Reviewed-by: Liam R. Howlett Suggested-by: Peter Zijlstra Cc: Christian Brauner Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Paul E. McKenney Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/seqlock.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5298765d6ca4..22c2c48b4265 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -318,6 +318,28 @@ SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) __seq; \ }) +/** + * raw_seqcount_try_begin() - begin a seqcount_t read critical section + * w/o lockdep and w/o counter stabilization + * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants + * + * Similar to raw_seqcount_begin(), except it enables eliding the critical + * section entirely if odd, instead of doing the speculation knowing it will + * fail. + * + * Useful when counter stabilization is more or less equivalent to taking + * the lock and there is a slowpath that does that. + * + * If true, start will be set to the (even) sequence count read. + * + * Return: true when a read critical section is started. + */ +#define raw_seqcount_try_begin(s, start) \ +({ \ + start = raw_read_seqcount(s); \ + !(start & 1); \ +}) + /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization -- 2.51.0 From e5e7fb278e5924f29ceab42bbbb891cde528f7cc Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:15 -0800 Subject: [PATCH 09/16] mm: convert mm_lock_seq to a proper seqcount Convert mm_lock_seq to be seqcount_t and change all mmap_write_lock variants to increment it, in-line with the usual seqcount usage pattern. This lets us check whether the mmap_lock is write-locked by checking mm_lock_seq.sequence counter (odd=locked, even=unlocked). This will be used when implementing mmap_lock speculation functions. As a result vm_lock_seq is also change to be unsigned to match the type of mm_lock_seq.sequence. Link: https://lkml.kernel.org/r/20241122174416.1367052-2-surenb@google.com Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Paul E. McKenney Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 +++---- include/linux/mm_types.h | 7 ++-- include/linux/mmap_lock.h | 55 +++++++++++++++++++++----------- kernel/fork.c | 5 +-- mm/init-mm.c | 2 +- tools/testing/vma/vma.c | 4 +-- tools/testing/vma/vma_internal.h | 4 +-- 7 files changed, 53 insertions(+), 36 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 9372bc058b43..a3a50c37603e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -711,7 +711,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) @@ -728,7 +728,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == smp_load_acquire(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { up_read(&vma->vm_lock->lock); return false; } @@ -743,7 +743,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ -static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) +static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -751,7 +751,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) * current task is holding mmap_write_lock, both vma->vm_lock_seq and * mm->mm_lock_seq can't be concurrently modified. */ - *mm_lock_seq = vma->vm_mm->mm_lock_seq; + *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence; return (vma->vm_lock_seq == *mm_lock_seq); } @@ -762,7 +762,7 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) */ static inline void vma_start_write(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; if (__is_vma_write_locked(vma, &mm_lock_seq)) return; @@ -780,7 +780,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline void vma_assert_write_locked(struct vm_area_struct *vma) { - int mm_lock_seq; + unsigned int mm_lock_seq; VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7490d84af310..5f1b2dc788e2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -729,7 +729,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; + unsigned int vm_lock_seq; /* Unstable RCU readers are allowed to read this. */ struct vma_lock *vm_lock; #endif @@ -923,6 +923,9 @@ struct mm_struct { * Roughly speaking, incrementing the sequence number is * equivalent to releasing locks on VMAs; reading the sequence * number can be part of taking a read lock on a VMA. + * Incremented every time mmap_lock is write-locked/unlocked. + * Initialized to 0, therefore odd values indicate mmap_lock + * is write-locked and even values that it's released. * * Can be modified under write mmap_lock using RELEASE * semantics. @@ -931,7 +934,7 @@ struct mm_struct { * Can be read with ACQUIRE semantics if not holding write * mmap_lock. */ - int mm_lock_seq; + seqcount_t mm_lock_seq; #endif diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index de9dc20b01ba..9715326f5a85 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,39 +71,39 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK -/* - * Drop all currently-held per-VMA locks. - * This is called from the mmap_lock implementation directly before releasing - * a write-locked mmap_lock (or downgrading it to read-locked). - * This should normally NOT be called manually from other places. - * If you want to call this manually anyway, keep in mind that this will release - * *all* VMA write locks, including ones from further up the stack. - */ -static inline void vma_end_write_all(struct mm_struct *mm) +static inline void mm_lock_seqcount_init(struct mm_struct *mm) { - mmap_assert_write_locked(mm); - /* - * Nobody can concurrently modify mm->mm_lock_seq due to exclusive - * mmap_lock being held. - * We need RELEASE semantics here to ensure that preceding stores into - * the VMA take effect before we unlock it with this store. - * Pairs with ACQUIRE semantics in vma_start_read(). - */ - smp_store_release(&mm->mm_lock_seq, mm->mm_lock_seq + 1); + seqcount_init(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) +{ + do_raw_write_seqcount_begin(&mm->mm_lock_seq); +} + +static inline void mm_lock_seqcount_end(struct mm_struct *mm) +{ + ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq); + do_raw_write_seqcount_end(&mm->mm_lock_seq); } + #else -static inline void vma_end_write_all(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} +static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} #endif static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); } static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -111,6 +111,7 @@ static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, true); } @@ -120,10 +121,26 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm) __mmap_lock_trace_start_locking(mm, true); ret = down_write_killable(&mm->mmap_lock); + if (!ret) + mm_lock_seqcount_begin(mm); __mmap_lock_trace_acquire_returned(mm, true, ret == 0); return ret; } +/* + * Drop all currently-held per-VMA locks. + * This is called from the mmap_lock implementation directly before releasing + * a write-locked mmap_lock (or downgrading it to read-locked). + * This should normally NOT be called manually from other places. + * If you want to call this manually anyway, keep in mind that this will release + * *all* VMA write locks, including ones from further up the stack. + */ +static inline void vma_end_write_all(struct mm_struct *mm) +{ + mmap_assert_write_locked(mm); + mm_lock_seqcount_end(mm); +} + static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); diff --git a/kernel/fork.c b/kernel/fork.c index 9b301180fd41..ded49f18cd95 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -448,7 +448,7 @@ static bool vma_lock_alloc(struct vm_area_struct *vma) return false; init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return true; } @@ -1262,9 +1262,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); -#ifdef CONFIG_PER_VMA_LOCK - mm->mm_lock_seq = 0; -#endif mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; diff --git a/mm/init-mm.c b/mm/init-mm.c index 24c809379274..6af3ad675930 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -40,7 +40,7 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), #ifdef CONFIG_PER_VMA_LOCK - .mm_lock_seq = 0, + .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), #endif .user_ns = &init_user_ns, .cpu_bitmap = CPU_BITS_NONE, diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 891d87a9ad6b..920fba58884e 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -100,7 +100,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm, * begun. Linking to the tree will have caused this to be incremented, * which means we will get a false positive otherwise. */ - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return vma; } @@ -225,7 +225,7 @@ static bool vma_write_started(struct vm_area_struct *vma) int seq = vma->vm_lock_seq; /* We reset after each check. */ - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; /* The vma_start_write() stub simply increments this value. */ return seq > -1; diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index a7de59a0d694..b973b3e41c83 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -281,7 +281,7 @@ struct vm_area_struct { * counter reuse can only lead to occasional unnecessary use of the * slowpath. */ - int vm_lock_seq; + unsigned int vm_lock_seq; struct vma_lock *vm_lock; #endif @@ -467,7 +467,7 @@ static inline bool vma_lock_alloc(struct vm_area_struct *vma) return false; init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = -1; + vma->vm_lock_seq = UINT_MAX; return true; } -- 2.51.0 From 6f030e32e4499942a223677169d006085d8c57ce Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 22 Nov 2024 09:44:16 -0800 Subject: [PATCH 10/16] mm: introduce mmap_lock_speculate_{try_begin|retry} Add helper functions to speculatively perform operations without read-locking mmap_lock, expecting that mmap_lock will not be write-locked and mm is not modified from under us. [akpm@linux-foundation.org: use read_seqcount_retry() in mmap_lock_speculate_retry(), per Wei Yang] Link: https://lkml.kernel.org/r/20241122174416.1367052-3-surenb@google.com Suggested-by: Peter Zijlstra Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Paul E. McKenney Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 9715326f5a85..45a21faa3ff6 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -71,6 +71,7 @@ static inline void mmap_assert_write_locked(const struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK + static inline void mm_lock_seqcount_init(struct mm_struct *mm) { seqcount_init(&mm->mm_lock_seq); @@ -87,11 +88,39 @@ static inline void mm_lock_seqcount_end(struct mm_struct *mm) do_raw_write_seqcount_end(&mm->mm_lock_seq); } -#else +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + /* + * Since mmap_lock is a sleeping lock, and waiting for it to become + * unlocked is more or less equivalent with taking it ourselves, don't + * bother with the speculative path if mmap_lock is already write-locked + * and take the slow path, which takes the lock. + */ + return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq); +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return read_seqcount_retry(&mm->mm_lock_seq, seq); +} + +#else /* CONFIG_PER_VMA_LOCK */ + static inline void mm_lock_seqcount_init(struct mm_struct *mm) {} static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {} static inline void mm_lock_seqcount_end(struct mm_struct *mm) {} -#endif + +static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq) +{ + return false; +} + +static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq) +{ + return true; +} + +#endif /* CONFIG_PER_VMA_LOCK */ static inline void mmap_init_lock(struct mm_struct *mm) { -- 2.51.0 From 79dc2bff4d8d36f04d7683916eb6d4eb9ea04da4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 9 Dec 2024 18:20:01 -0800 Subject: [PATCH 11/16] mm/damon/tests/vaddr-kunit.h: reduce stack consumption After "mm: move per-vma lock into vm_area_struct" we're hitting mm/damon/tests/vaddr-kunit.h: In function 'damon_test_three_regions_in_vmas': mm/damon/tests/vaddr-kunit.h:92:1: error: the frame size of 3280 bytes is larger than 2048 bytes [-Werror=frame-larger-than=] Fix by moving all those vmas off the stack. [akpm@linux-foundation.org: fix build] Closes: https://lkml.kernel.org/r/20241209170829.11311e70@canb.auug.org.au Reported-by: Stephen Rothwell Reviewed-by: SeongJae Park Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/damon/tests/vaddr-kunit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index b9fe3bc8472b..7cd944266a92 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -68,7 +68,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test) static struct mm_struct mm; struct damon_addr_range regions[3] = {0}; /* 10-20-25, 200-210-220, 300-305, 307-330 */ - struct vm_area_struct vmas[] = { + static struct vm_area_struct vmas[] = { (struct vm_area_struct) {.vm_start = 10, .vm_end = 20}, (struct vm_area_struct) {.vm_start = 20, .vm_end = 25}, (struct vm_area_struct) {.vm_start = 200, .vm_end = 210}, -- 2.51.0 From 24bd843b1d1017ffc479246c7fc17537eb180763 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 6 Dec 2024 22:50:36 +0000 Subject: [PATCH 12/16] mm: enforce __must_check on VMA merge and split It is of critical importance to check the return results on VMA merge (and split), failure to do so can result in use-after-free's. This bug has recurred, so have the compiler enforce this check to prevent any future repetition. Link: https://lkml.kernel.org/r/20241206225036.273103-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Jann Horn Signed-off-by: Andrew Morton --- mm/vma.c | 8 +++++--- mm/vma.h | 26 +++++++++++++++----------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/mm/vma.c b/mm/vma.c index 06554a732bce..6fa240e5b0c5 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -447,8 +447,9 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the original VMA. */ -static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static __must_check int +__split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; @@ -710,7 +711,8 @@ static bool can_merge_remove_vma(struct vm_area_struct *vma) * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end). */ -static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg) +static __must_check struct vm_area_struct *vma_merge_existing_range( + struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->vma; struct vm_area_struct *prev = vmg->prev; diff --git a/mm/vma.h b/mm/vma.h index 295d44ea54db..61ed044b6145 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -139,9 +139,10 @@ void validate_mm(struct mm_struct *mm); #define validate_mm(mm) do { } while (0) #endif -int vma_expand(struct vma_merge_struct *vmg); -int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, pgoff_t pgoff); +__must_check int vma_expand(struct vma_merge_struct *vmg); +__must_check int vma_shrink(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff); static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) @@ -174,13 +175,14 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); /* We are about to modify the VMA's flags. */ -struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, +__must_check struct vm_area_struct +*vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags); /* We are about to modify the VMA's flags and/or anon_name. */ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -190,7 +192,7 @@ struct vm_area_struct struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. */ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -198,7 +200,7 @@ struct vm_area_struct struct mempolicy *new_pol); /* We are about to modify the VMA's flags and/or uffd context. */ -struct vm_area_struct +__must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, @@ -206,11 +208,13 @@ struct vm_area_struct unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx); -struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); +__must_check struct vm_area_struct +*vma_merge_new_range(struct vma_merge_struct *vmg); -struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, - struct vm_area_struct *vma, - unsigned long delta); +__must_check struct vm_area_struct +*vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); -- 2.51.0 From fa00b8ef1803fe133b4897c25227aa0d298dd093 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 6 Dec 2024 21:28:46 +0000 Subject: [PATCH 13/16] mm: perform all memfd seal checks in a single place We no longer actually need to perform these checks in the f_op->mmap() hook any longer. We already moved the operation which clears VM_MAYWRITE on a read-only mapping of a write-sealed memfd in order to work around the restrictions imposed by commit 5de195060b2e ("mm: resolve faulty mmap_region() error path behaviour"). There is no reason for us not to simply go ahead and additionally check to see if any pre-existing seals are in place here rather than defer this to the f_op->mmap() hook. By doing this we remove more logic from shmem_mmap() which doesn't belong there, as well as doing the same for hugetlbfs_file_mmap(). We also remove dubious shared logic in mm.h which simply does not belong there either. It makes sense to do these checks at the earliest opportunity, we know these are shmem (or hugetlbfs) mappings whose relevant VMA flags will not change from the invoking do_mmap() so there is simply no need to wait. This also means the implementation of further memfd seal flags can be done within mm/memfd.c and also have the opportunity to modify VMA flags as necessary early in the mapping logic. [lorenzo.stoakes@oracle.com: fix typos in !memfd inline stub] Link: https://lkml.kernel.org/r/7dee6c5d-480b-4c24-b98e-6fa47dbd8a23@lucifer.local Link: https://lkml.kernel.org/r/20241206212846.210835-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: Isaac J. Manjarres Cc: Hugh Dickins Cc: Jann Horn Cc: Kalesh Singh Cc: Liam R. Howlett Cc: Muchun Song Cc: Vlastimil Babka Cc: Jeff Xu Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 5 ---- include/linux/memfd.h | 23 +++++++++--------- include/linux/mm.h | 55 ------------------------------------------- mm/memfd.c | 44 +++++++++++++++++++++++++++++++++- mm/mmap.c | 12 +++++++--- mm/shmem.c | 6 ----- 6 files changed, 63 insertions(+), 82 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index fc1ae5132127..62fb0cbc93ab 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -99,7 +99,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); @@ -116,10 +115,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; - /* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset. This can diff --git a/include/linux/memfd.h b/include/linux/memfd.h index d437e3070850..246daadbfde8 100644 --- a/include/linux/memfd.h +++ b/include/linux/memfd.h @@ -7,7 +7,14 @@ #ifdef CONFIG_MEMFD_CREATE extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg); struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx); -unsigned int *memfd_file_seals_ptr(struct file *file); +/* + * Check for any existing seals on mmap, return an error if access is denied due + * to sealing, or 0 otherwise. + * + * We also update VMA flags if appropriate by manipulating the VMA flags pointed + * to by vm_flags_ptr. + */ +int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr); #else static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned int a) { @@ -17,19 +24,11 @@ static inline struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx) { return ERR_PTR(-EINVAL); } - -static inline unsigned int *memfd_file_seals_ptr(struct file *file) +static inline int memfd_check_seals_mmap(struct file *file, + unsigned long *vm_flags_ptr) { - return NULL; + return 0; } #endif -/* Retrieve memfd seals associated with the file, if any. */ -static inline unsigned int memfd_file_seals(struct file *file) -{ - unsigned int *sealsp = memfd_file_seals_ptr(file); - - return sealsp ? *sealsp : 0; -} - #endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index a3a50c37603e..e7c54b9aac6d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4102,61 +4102,6 @@ void mem_dump_obj(void *object); static inline void mem_dump_obj(void *object) {} #endif -static inline bool is_write_sealed(int seals) -{ - return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); -} - -/** - * is_readonly_sealed - Checks whether write-sealed but mapped read-only, - * in which case writes should be disallowing moving - * forwards. - * @seals: the seals to check - * @vm_flags: the VMA flags to check - * - * Returns whether readonly sealed, in which case writess should be disallowed - * going forward. - */ -static inline bool is_readonly_sealed(int seals, vm_flags_t vm_flags) -{ - /* - * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (is_write_sealed(seals) && - ((vm_flags & (VM_SHARED | VM_WRITE)) == VM_SHARED)) - return true; - - return false; -} - -/** - * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and - * handle them. - * @seals: the seals to check - * @vma: the vma to operate on - * - * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper - * check/handling on the vma flags. Return 0 if check pass, or <0 for errors. - */ -static inline int seal_check_write(int seals, struct vm_area_struct *vma) -{ - if (!is_write_sealed(seals)) - return 0; - - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * write seals are active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - return 0; -} - #ifdef CONFIG_ANON_VMA_NAME int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, unsigned long len_in, diff --git a/mm/memfd.c b/mm/memfd.c index 35a370d75c9a..5f5a23c9051d 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -170,7 +170,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) return error; } -unsigned int *memfd_file_seals_ptr(struct file *file) +static unsigned int *memfd_file_seals_ptr(struct file *file) { if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; @@ -327,6 +327,48 @@ static int check_sysctl_memfd_noexec(unsigned int *flags) return 0; } +static inline bool is_write_sealed(unsigned int seals) +{ + return seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE); +} + +static int check_write_seal(unsigned long *vm_flags_ptr) +{ + unsigned long vm_flags = *vm_flags_ptr; + unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE); + + /* If a private matting then writability is irrelevant. */ + if (!(mask & VM_SHARED)) + return 0; + + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * write seals are active. + */ + if (mask & VM_WRITE) + return -EPERM; + + /* + * This is a read-only mapping, disallow mprotect() from making a + * write-sealed mapping writable in future. + */ + *vm_flags_ptr &= ~VM_MAYWRITE; + + return 0; +} + +int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr) +{ + int err = 0; + unsigned int *seals_ptr = memfd_file_seals_ptr(file); + unsigned int seals = seals_ptr ? *seals_ptr : 0; + + if (is_write_sealed(seals)) + err = check_write_seal(vm_flags_ptr); + + return err; +} + SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) diff --git a/mm/mmap.c b/mm/mmap.c index b373486bd1c6..df9154b15ef9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -368,8 +368,8 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (file) { struct inode *inode = file_inode(file); - unsigned int seals = memfd_file_seals(file); unsigned long flags_mask; + int err; if (!file_mmap_ok(file, inode, pgoff, len)) return -EOVERFLOW; @@ -409,8 +409,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); - else if (is_readonly_sealed(seals, vm_flags)) - vm_flags &= ~VM_MAYWRITE; fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) @@ -430,6 +428,14 @@ unsigned long do_mmap(struct file *file, unsigned long addr, default: return -EINVAL; } + + /* + * Check to see if we are violating any seals and update VMA + * flags if necessary to avoid future seal violations. + */ + err = memfd_check_seals_mmap(file, &vm_flags); + if (err) + return (unsigned long)err; } else { switch (flags & MAP_TYPE) { case MAP_SHARED: diff --git a/mm/shmem.c b/mm/shmem.c index 0892889744a2..95b80c24f6f9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2816,12 +2816,6 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); - int ret; - - ret = seal_check_write(info->seals, vma); - if (ret) - return ret; file_accessed(file); /* This is anonymous shared memory if it is unlinked at the time of mmap */ -- 2.51.0 From 052ccfbcc6cd87fe678b0ab27cc0715cb3aefb21 Mon Sep 17 00:00:00 2001 From: Guillaume Morin Date: Fri, 6 Dec 2024 22:28:36 +0100 Subject: [PATCH 14/16] mm/hugetlb: support FOLL_FORCE|FOLL_WRITE Eric reported that PTRACE_POKETEXT fails when applications use hugetlb for mapping text using huge pages. Before commit 1d8d14641fd9 ("mm/hugetlb: support write-faults in shared mappings"), PTRACE_POKETEXT worked by accident, but it was buggy and silently ended up mapping pages writable into the page tables even though VM_WRITE was not set. In general, FOLL_FORCE|FOLL_WRITE does currently not work with hugetlb. Let's implement FOLL_FORCE|FOLL_WRITE properly for hugetlb, such that what used to work in the past by accident now properly works, allowing applications using hugetlb for text etc. to get properly debugged. This change might also be required to implement uprobes support for hugetlb [1]. [1] https://lore.kernel.org/lkml/ZiK50qob9yl5e0Xz@bender.morinfr.org/ Link: https://lkml.kernel.org/r/Z1NshNfWuzUCPebA@bender.morinfr.org Signed-off-by: Guillaume Morin Cc: Muchun Song Cc: Andrew Morton Cc: Peter Xu Cc: David Hildenbrand Cc: Eric Hagberg Signed-off-by: Andrew Morton --- mm/gup.c | 91 +++++++++++++++++++++++++--------------------------- mm/hugetlb.c | 17 +++++----- 2 files changed, 53 insertions(+), 55 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 3b75e631f369..00a1269cbee0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -596,6 +596,33 @@ static struct folio *try_grab_folio_fast(struct page *page, int refs, } #endif /* CONFIG_HAVE_GUP_FAST */ +/* Common code for can_follow_write_* */ +static inline bool can_follow_write_common(struct page *page, + struct vm_area_struct *vma, unsigned int flags) +{ + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + return page && PageAnon(page) && PageAnonExclusive(page); +} + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags, unsigned long address) { @@ -622,6 +649,18 @@ static struct page *no_page_table(struct vm_area_struct *vma, } #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES +/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */ +static inline bool can_follow_write_pud(pud_t pud, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) +{ + /* If the pud is writable, we can write to the page. */ + if (pud_write(pud)) + return true; + + return can_follow_write_common(page, vma, flags); +} + static struct page *follow_huge_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp, int flags, struct follow_page_context *ctx) @@ -634,10 +673,11 @@ static struct page *follow_huge_pud(struct vm_area_struct *vma, assert_spin_locked(pud_lockptr(mm, pudp)); - if ((flags & FOLL_WRITE) && !pud_write(pud)) + if (!pud_present(pud)) return NULL; - if (!pud_present(pud)) + if ((flags & FOLL_WRITE) && + !can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags)) return NULL; pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT; @@ -686,27 +726,7 @@ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, if (pmd_write(pmd)) return true; - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ @@ -807,27 +827,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page, if (pte_write(pte)) return true; - /* Maybe FOLL_FORCE is set to override it? */ - if (!(flags & FOLL_FORCE)) - return false; - - /* But FOLL_FORCE has no effect on shared mappings */ - if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) - return false; - - /* ... or read-only private ones */ - if (!(vma->vm_flags & VM_MAYWRITE)) - return false; - - /* ... or already writable ones that just need to take a write fault */ - if (vma->vm_flags & VM_WRITE) - return false; - - /* - * See can_change_pte_writable(): we broke COW and could map the page - * writable if we have an exclusive anonymous page ... - */ - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + if (!can_follow_write_common(page, vma, flags)) return false; /* ... and a write-fault isn't required for other reasons. */ @@ -1294,9 +1294,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_WRITE) || (vm_flags & VM_SHADOW_STACK)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; - /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ - if (is_vm_hugetlb_page(vma)) - return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c9d8c6a1c03c..21de25546a25 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5183,6 +5183,13 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, update_mmu_cache(vma, address, ptep); } +static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + if (vma->vm_flags & VM_WRITE) + set_huge_ptep_writable(vma, address, ptep); +} + bool is_hugetlb_entry_migration(pte_t pte) { swp_entry_t swp; @@ -5828,13 +5835,6 @@ static vm_fault_t hugetlb_wp(struct folio *pagecache_folio, if (!unshare && huge_pte_uffd_wp(pte)) return 0; - /* - * hugetlb does not support FOLL_FORCE-style write faults that keep the - * PTE mapped R/O such as maybe_mkwrite() would do. - */ - if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) - return VM_FAULT_SIGSEGV; - /* Let's take out MAP_SHARED mappings first. */ if (vma->vm_flags & VM_MAYSHARE) { set_huge_ptep_writable(vma, vmf->address, vmf->pte); @@ -5863,7 +5863,8 @@ retry_avoidcopy: SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) - set_huge_ptep_writable(vma, vmf->address, vmf->pte); + set_huge_ptep_maybe_writable(vma, vmf->address, + vmf->pte); delayacct_wpcopy_end(); return 0; -- 2.51.0 From b487c02430104a2df7ecee8dabaf3b3b66e536bf Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Fri, 6 Dec 2024 19:48:39 +0000 Subject: [PATCH 15/16] mseal: remove can_do_mseal() No code logic change. can_do_mseal() is called exclusively by mseal.c, and mseal.c is compiled only when CONFIG_64BIT flag is set in makefile. Therefore, it is unnecessary to have 32 bit stub function in the header file, remove this function and merge the logic into do_mseal(). Link: https://lkml.kernel.org/r/20241206013934.2782793-1-jeffxu@google.com Link: https://lkml.kernel.org/r/20241206194839.3030596-2-jeffxu@google.com Signed-off-by: Jeff Xu Reviewed-by: Lorenzo Stoakes Cc: Jorge Lucangeli Obes Cc: Kees Cook Cc: Liam R. Howlett Cc: Pedro Falcato Cc: Randy Dunlap Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/internal.h | 16 ---------------- mm/mseal.c | 6 +++--- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index b438d35045de..4d4028d74e5d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1451,22 +1451,6 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); -#ifdef CONFIG_64BIT -static inline int can_do_mseal(unsigned long flags) -{ - if (flags) - return -EINVAL; - - return 0; -} - -#else -static inline int can_do_mseal(unsigned long flags) -{ - return -EPERM; -} -#endif - #ifdef CONFIG_SHRINKER_DEBUG static inline __printf(2, 0) int shrinker_debugfs_name_alloc( struct shrinker *shrinker, const char *fmt, va_list ap) diff --git a/mm/mseal.c b/mm/mseal.c index 81d6e980e8a9..c27197ac04e8 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -217,9 +217,9 @@ int do_mseal(unsigned long start, size_t len_in, unsigned long flags) unsigned long end; struct mm_struct *mm = current->mm; - ret = can_do_mseal(flags); - if (ret) - return ret; + /* Verify flags not set. */ + if (flags) + return -EINVAL; start = untagged_addr(start); if (!PAGE_ALIGNED(start)) -- 2.51.0 From 2f3577d16c8644455f215abb3c2f77aa32b967b1 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 9 Dec 2024 23:56:21 +0500 Subject: [PATCH 16/16] selftests/mm: thp_settings: remove const from return type Patch series "selftest/mm: Remove warnings found by adding compiler flags". Recently, I reviewed a patch on the mm/kselftest mailing list about a test which had obvious type mismatch fix in it. It was strange why that wasn't caught during development and when patch was accepted. This led me to discover that those extra compiler options to catch these warnings aren't being used. When I added them, I found tens of warnings in just mm suite. In this series, I'm fixing those warnings in a few files. More fixes will be sent later. This patch (of 4): Remove cost from the return type as it is ignored anyways and generates the warning: warning: type qualifiers ignored on function return type [-Wignored-qualifiers] Link: https://lkml.kernel.org/r/20241209185624.2245158-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20241209185624.2245158-2-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: David Hildenbrand Cc: Ryan Roberts Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/thp_settings.c | 4 ++-- tools/testing/selftests/mm/thp_settings.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/mm/thp_settings.c b/tools/testing/selftests/mm/thp_settings.c index 577eaab6266f..ad872af1c81a 100644 --- a/tools/testing/selftests/mm/thp_settings.c +++ b/tools/testing/selftests/mm/thp_settings.c @@ -87,7 +87,7 @@ int write_file(const char *path, const char *buf, size_t buflen) return (unsigned int) numwritten; } -const unsigned long read_num(const char *path) +unsigned long read_num(const char *path) { char buf[21]; @@ -172,7 +172,7 @@ void thp_write_string(const char *name, const char *val) } } -const unsigned long thp_read_num(const char *name) +unsigned long thp_read_num(const char *name) { char path[PATH_MAX]; int ret; diff --git a/tools/testing/selftests/mm/thp_settings.h b/tools/testing/selftests/mm/thp_settings.h index 876235a23460..fc131d23d593 100644 --- a/tools/testing/selftests/mm/thp_settings.h +++ b/tools/testing/selftests/mm/thp_settings.h @@ -64,12 +64,12 @@ struct thp_settings { int read_file(const char *path, char *buf, size_t buflen); int write_file(const char *path, const char *buf, size_t buflen); -const unsigned long read_num(const char *path); +unsigned long read_num(const char *path); void write_num(const char *path, unsigned long num); int thp_read_string(const char *name, const char * const strings[]); void thp_write_string(const char *name, const char *val); -const unsigned long thp_read_num(const char *name); +unsigned long thp_read_num(const char *name); void thp_write_num(const char *name, unsigned long num); void thp_write_settings(struct thp_settings *settings); -- 2.51.0