From f33cea94e37ce10e27b192e3c5e80ff685ac7b1f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 26 Sep 2024 17:20:44 +0200 Subject: [PATCH 01/16] selftests/mm: hugetlb_fault_after_madv: improve test output Let's improve the test output. For example, print the proper test result. Install a SIGBUS handler to catch any SIGBUS instead of crashing the test on failure. With unsuitable hugetlb page count: $ ./hugetlb_fault_after_madv TAP version 13 1..1 # [INFO] detected default hugetlb page size: 2048 KiB ok 2 # SKIP This test needs one and only one page to execute. Got 0 # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:1 error:0 On a failure: $ ./hugetlb_fault_after_madv TAP version 13 1..1 not ok 1 SIGBUS behavior Bail out! 1 out of 1 tests failed On success: $ ./hugetlb_fault_after_madv TAP version 13 1..1 # [INFO] detected default hugetlb page size: 2048 KiB ok 1 SIGBUS behavior # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0 Link: https://lkml.kernel.org/r/20240926152044.2205129-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Breno Leitao Tested-by: Mario Casquero Cc: Shuah Khan Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/hugetlb_fault_after_madv.c | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c index ff3ba675278d..e2640529dbb2 100644 --- a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c +++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include "vm_util.h" #include "../kselftest.h" @@ -14,11 +16,25 @@ static char *huge_ptr; static size_t huge_page_size; +static sigjmp_buf sigbuf; +static bool sigbus_triggered; + +static void signal_handler(int signal) +{ + if (signal == SIGBUS) { + sigbus_triggered = true; + siglongjmp(sigbuf, 1); + } +} + /* Touch the memory while it is being madvised() */ void *touch(void *unused) { char *ptr = (char *)huge_ptr; + if (sigsetjmp(sigbuf, 1)) + return NULL; + for (int i = 0; i < INLOOP_ITER; i++) ptr[0] = '.'; @@ -44,13 +60,23 @@ int main(void) * interactions */ int max = 10000; + int err; + + ksft_print_header(); + ksft_set_plan(1); srand(getpid()); + if (signal(SIGBUS, signal_handler) == SIG_ERR) + ksft_exit_skip("Could not register signal handler."); + huge_page_size = default_huge_page_size(); if (!huge_page_size) ksft_exit_skip("Could not detect default hugetlb page size."); + ksft_print_msg("[INFO] detected default hugetlb page size: %zu KiB\n", + huge_page_size / 1024); + free_hugepages = get_free_hugepages(); if (free_hugepages != 1) { ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n", @@ -73,5 +99,11 @@ int main(void) munmap(huge_ptr, huge_page_size); } - return KSFT_PASS; + ksft_test_result(!sigbus_triggered, "SIGBUS behavior\n"); + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + ksft_exit_pass(); } -- 2.51.0 From 021781b01275c07cd5b7d3e4e8afc2bdf2429a84 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 26 Sep 2024 16:10:19 +0100 Subject: [PATCH 02/16] mm/madvise: unrestrict process_madvise() for current process The process_madvise() call was introduced in commit ecb8ac8b1f14 ("mm/madvise: introduce process_madvise() syscall: an external memory hinting API") as a means of performing madvise() operations on another process. 
However, as it provides the means by which to perform multiple madvise() operations in a batch via an iovec, it is useful to utilise the same interface for performing operations on the current process rather than a remote one. Commit 22af8caff7d1 ("mm/madvise: process_madvise() drop capability check if same mm") removed the need for a caller invoking process_madvise() on its own pidfd to possess the CAP_SYS_NICE capability, however this leaves the restrictions on operation in place. Resolve this by only applying the restriction on operations when accessing a remote process. Moving forward we plan to implement a simpler means of specifying this condition other than needing to establish a self pidfd, perhaps in the form of a sentinel pidfd. Also take the opportunity to refactor the system call implementation abstracting the vectorised operation. Link: https://lkml.kernel.org/r/20240926151019.82902-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Arnd Bergmann Cc: Christian Brauner Cc: "Liam R. Howlett" Cc: Minchan Kim Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/madvise.c | 55 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 50d223ab3894..e871a72a6c32 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1208,7 +1208,8 @@ madvise_behavior_valid(int behavior) } } -static bool process_madvise_behavior_valid(int behavior) +/* Can we invoke process_madvise() on a remote mm for the specified behavior? */ +static bool process_madvise_remote_valid(int behavior) { switch (behavior) { case MADV_COLD: @@ -1477,6 +1478,28 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) return do_madvise(current->mm, start, len_in, behavior); } +/* Perform an madvise operation over a vector of addresses and lengths. */ +static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, + int behavior) +{ + ssize_t ret = 0; + size_t total_len; + + total_len = iov_iter_count(iter); + + while (iov_iter_count(iter)) { + ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), + iter_iov_len(iter), behavior); + if (ret < 0) + break; + iov_iter_advance(iter, iter_iov_len(iter)); + } + + ret = (total_len - iov_iter_count(iter)) ? : ret; + + return ret; +} + SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { @@ -1486,7 +1509,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, struct iov_iter iter; struct task_struct *task; struct mm_struct *mm; - size_t total_len; unsigned int f_flags; if (flags != 0) { @@ -1504,11 +1526,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto free_iov; } - if (!process_madvise_behavior_valid(behavior)) { - ret = -EINVAL; - goto release_task; - } - /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) { @@ -1516,26 +1533,26 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto release_task; } + /* + * We need only perform this check if we are attempting to manipulate a + * remote process's address space. + */ + if (mm != current->mm && !process_madvise_remote_valid(behavior)) { + ret = -EINVAL; + goto release_mm; + } + /* * Require CAP_SYS_NICE for influencing process performance. 
Note that - * only non-destructive hints are currently supported. + * only non-destructive hints are currently supported for remote + * processes. */ if (mm != current->mm && !capable(CAP_SYS_NICE)) { ret = -EPERM; goto release_mm; } - total_len = iov_iter_count(&iter); - - while (iov_iter_count(&iter)) { - ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter), - iter_iov_len(&iter), behavior); - if (ret < 0) - break; - iov_iter_advance(&iter, iter_iov_len(&iter)); - } - - ret = (total_len - iov_iter_count(&iter)) ? : ret; + ret = vector_madvise(mm, &iter, behavior); release_mm: mmput(mm); -- 2.51.0 From f2f484085ef1a2bb5aea861a06bc6b4dc50d2ab8 Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Thu, 26 Sep 2024 15:49:22 +0800 Subject: [PATCH 03/16] mm: move mm flags to mm_types.h The mm flags now go far beyond the core dump related features. This patch moves the mm flags from linux/sched/coredump.h to linux/mm_types.h. Since linux/sched/coredump.h already includes mm_types.h, the C files related to coredump do not need any change to their header inclusions. In addition, the inclusion of sched/coredump.h can now be deleted from the C files that are irrelevant to core dump. Link: https://lkml.kernel.org/r/20240926074922.2721274-1-sunnanyong@huawei.com Signed-off-by: Nanyong Sun Cc: Kefeng Wang Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 1 - include/linux/khugepaged.h | 2 - include/linux/ksm.h | 1 - include/linux/mm_types.h | 84 ++++++++++++++++++++++++++++++++++ include/linux/oom.h | 1 - include/linux/sched/coredump.h | 82 --------------------------------- kernel/events/uprobes.c | 1 - kernel/fork.c | 1 - mm/huge_memory.c | 1 - mm/khugepaged.c | 1 - mm/ksm.c | 1 - mm/memory.c | 1 - mm/oom_kill.c | 1 - 13 files changed, 84 insertions(+), 94 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ef5b80e48599..8afe09a2cf03 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -2,7 +2,6 @@ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H -#include #include #include /* only for vma_is_dax() */ diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 30baae91b225..1f46046080f5 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -2,8 +2,6 @@ #ifndef _LINUX_KHUGEPAGED_H #define _LINUX_KHUGEPAGED_H -#include /* MMF_VM_HUGEPAGE */ - extern unsigned int khugepaged_max_ptes_none __read_mostly; #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; diff --git a/include/linux/ksm.h b/include/linux/ksm.h index ec9c05044d4f..29022e71a074 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -13,7 +13,6 @@ #include #include #include -#include #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6e3bdf8e38bc..ff8627acbaa7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1499,4 +1499,88 @@ enum { /* See also internal only FOLL flags in mm/internal.h */ }; +/* mm flags */ + +/* + * The first two bits represent core dump modes for set-user-ID, + * the modes are SUID_DUMP_* defined in linux/sched/coredump.h + */ +#define MMF_DUMPABLE_BITS 2 +#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +/* coredump filter bits */ +#define MMF_DUMP_ANON_PRIVATE 2 +#define MMF_DUMP_ANON_SHARED 3 +#define MMF_DUMP_MAPPED_PRIVATE 4 +#define MMF_DUMP_MAPPED_SHARED 5 +#define 
MMF_DUMP_ELF_HEADERS 6 +#define MMF_DUMP_HUGETLB_PRIVATE 7 +#define MMF_DUMP_HUGETLB_SHARED 8 +#define MMF_DUMP_DAX_PRIVATE 9 +#define MMF_DUMP_DAX_SHARED 10 + +#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS +#define MMF_DUMP_FILTER_BITS 9 +#define MMF_DUMP_FILTER_MASK \ + (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) +#define MMF_DUMP_FILTER_DEFAULT \ + ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ + (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) + +#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS +# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) +#else +# define MMF_DUMP_MASK_DEFAULT_ELF 0 +#endif + /* leave room for more dump flags */ +#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */ + +/* + * This one-shot flag is dropped due to necessity of changing exe once again + * on NFS restore + */ +//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ + +#define MMF_HAS_UPROBES 19 /* has uprobes */ +#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ +#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ +#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ +#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ +#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) +#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ +/* + * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either + * replaced in the future by mm.pinned_vm when it becomes stable, or grow into + * a counter on its own. We're aggresive on this bit for now: even if the + * pinned pages were unpinned later on, we'll still keep this bit set for the + * lifecycle of this mm, just for simplicity. 
+ */ +#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ + +#define MMF_HAS_MDWE 28 +#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) + + +#define MMF_HAS_MDWE_NO_INHERIT 29 + +#define MMF_VM_MERGE_ANY 30 +#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) + +#define MMF_TOPDOWN 31 /* mm searches top down by default */ +#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) + +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ + MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ + MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) + +static inline unsigned long mmf_init_flags(unsigned long flags) +{ + if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) + flags &= ~((1UL << MMF_HAS_MDWE) | + (1UL << MMF_HAS_MDWE_NO_INHERIT)); + return flags & MMF_INIT_MASK; +} + #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/linux/oom.h b/include/linux/oom.h index 7d0c9c48a0c5..1e0fc6931ce9 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -7,7 +7,6 @@ #include #include #include -#include /* MMF_* */ #include /* VM_FAULT* */ struct zonelist; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index e62ff805cfc9..6eb65ceed213 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,12 +8,6 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ -/* mm flags */ - -/* for SUID_DUMP_* above */ -#define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) - extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things @@ -31,80 +25,4 @@ static inline int get_dumpable(struct mm_struct *mm) return __get_dumpable(mm->flags); } -/* coredump filter bits */ -#define MMF_DUMP_ANON_PRIVATE 2 -#define MMF_DUMP_ANON_SHARED 3 -#define MMF_DUMP_MAPPED_PRIVATE 4 -#define MMF_DUMP_MAPPED_SHARED 5 -#define MMF_DUMP_ELF_HEADERS 6 -#define MMF_DUMP_HUGETLB_PRIVATE 7 -#define MMF_DUMP_HUGETLB_SHARED 8 -#define MMF_DUMP_DAX_PRIVATE 9 -#define MMF_DUMP_DAX_SHARED 10 - -#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS -#define MMF_DUMP_FILTER_BITS 9 -#define MMF_DUMP_FILTER_MASK \ - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) -#define MMF_DUMP_FILTER_DEFAULT \ - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) - -#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) -#else -# define MMF_DUMP_MASK_DEFAULT_ELF 0 -#endif - /* leave room for more dump flags */ -#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ -#define MMF_VM_HUGEPAGE 17 /* set when mm is available for - khugepaged */ -/* - * This one-shot flag is dropped due to necessity of changing exe once again - * on NFS restore - */ -//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ - -#define MMF_HAS_UPROBES 19 /* has uprobes */ -#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ -#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ -#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ -#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) -#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ -#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ -/* - * MMF_HAS_PINNED: Whether this mm has pinned any 
pages. This can be either - * replaced in the future by mm.pinned_vm when it becomes stable, or grow into - * a counter on its own. We're aggresive on this bit for now: even if the - * pinned pages were unpinned later on, we'll still keep this bit set for the - * lifecycle of this mm, just for simplicity. - */ -#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ - -#define MMF_HAS_MDWE 28 -#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) - - -#define MMF_HAS_MDWE_NO_INHERIT 29 - -#define MMF_VM_MERGE_ANY 30 -#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) - -#define MMF_TOPDOWN 31 /* mm searches top down by default */ -#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) - -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ - MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) - -static inline unsigned long mmf_init_flags(unsigned long flags) -{ - if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) - flags &= ~((1UL << MMF_HAS_MDWE) | - (1UL << MMF_HAS_MDWE_NO_INHERIT)); - return flags & MMF_INIT_MASK; -} - #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4b52cb2ae6d6..75ac18a3ac0f 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include /* anon_vma_prepare */ #include diff --git a/kernel/fork.c b/kernel/fork.c index b2ab422f6230..61a4abd628f3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 03fd4bc39ea1..e71b58d84cba 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 0bd80e134010..ed1a225dd198 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/ksm.c b/mm/ksm.c index a2e2a521df0a..dec536d6d91a 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/memory.c b/mm/memory.c index bdf77a3ec47b..c8d5d040d6ab 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4d7a0004df2c..1c485beb0b93 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include -- 2.51.0 From 66efef9b1a7d6cc725efa9395fb390483ad5b555 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:14 +0800 Subject: [PATCH 04/16] mm: pgtable: introduce pte_offset_map_{ro|rw}_nolock() Patch series "introduce pte_offset_map_{ro|rw}_nolock()", v5. As proposed by David Hildenbrand [1], this series introduces the following two new helper functions to replace pte_offset_map_nolock(). 1. pte_offset_map_ro_nolock() 2. pte_offset_map_rw_nolock() As the name suggests, pte_offset_map_ro_nolock() is used for read-only case. In this case, only read-only operations will be performed on PTE page after the PTL is held. The RCU lock in pte_offset_map_nolock() will ensure that the PTE page will not be freed, and there is no need to worry about whether the pmd entry is modified. Therefore pte_offset_map_ro_nolock() is just a renamed version of pte_offset_map_nolock(). pte_offset_map_rw_nolock() is used for may-write case. 
In this case, the pte or pmd entry may be modified after the PTL is held, so we need to ensure that the pmd entry has not been modified concurrently. So in addition to the name change, it also outputs the pmdval when successful. Users should make sure the page table is stable, e.g. by checking pte_same(), or by checking pmd_same() against the output pmdval, before performing any write operations. This series will convert all pte_offset_map_nolock() calls into the above two helper functions one by one, and finally delete pte_offset_map_nolock() completely. This is also a preparation for reclaiming empty user PTE page table pages. This patch (of 13): Currently, the usage of pte_offset_map_nolock() can be divided into the following two cases: 1) After acquiring PTL, only read-only operations are performed on the PTE page. In this case, the RCU lock in pte_offset_map_nolock() will ensure that the PTE page will not be freed, and there is no need to worry about whether the pmd entry is modified. 2) After acquiring PTL, the pte or pmd entries may be modified. At this time, we need to ensure that the pmd entry has not been modified concurrently. To distinguish between these two cases more clearly, this commit introduces two new helper functions to replace pte_offset_map_nolock(). For 1), just rename it to pte_offset_map_ro_nolock(). For 2), in addition to changing the name to pte_offset_map_rw_nolock(), it also outputs the pmdval when successful. It is applicable to may-write cases where modifications to the page table may happen after the corresponding spinlock is held. But users should make sure the page table is stable, e.g. by checking pte_same(), or by checking pmd_same() against the output pmdval, before performing any write operations. Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will be read-only/read-write protected. Subsequent commits will convert pte_offset_map_nolock() into the above two functions one by one, and finally delete it completely. 
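To make the intended calling convention concrete, here is a minimal sketch of a may-write caller (illustrative only, not taken from any patch in this series; the surrounding function, the mm/pmd/addr variables and the retry label are assumed context):

	/* Sketch only - not from the series: may-write usage. */
	pmd_t pmdval;
	spinlock_t *ptl;
	pte_t *pte;

again:
	pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
	if (!pte)
		return 0;		/* no PTE page table here */
	spin_lock(ptl);
	/* Recheck: the PTE page may have been freed or replaced meanwhile. */
	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		pte_unmap_unlock(pte, ptl);
		goto again;
	}
	/* The page table is now known stable; modifying ptes is allowed. */
	pte_unmap_unlock(pte, ptl);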
Link: https://lkml.kernel.org/r/cover.1727332572.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/5aeecfa131600a454b1f3a038a1a54282ca3b856.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/split_page_table_lock.rst | 7 ++++ include/linux/mm.h | 5 +++ mm/pgtable-generic.c | 48 ++++++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst index e4f6972eb6c0..08d0e706a32d 100644 --- a/Documentation/mm/split_page_table_lock.rst +++ b/Documentation/mm/split_page_table_lock.rst @@ -19,6 +19,13 @@ There are helpers to lock/unlock a table and other accessor functions: - pte_offset_map_nolock() maps PTE, returns pointer to PTE with pointer to its PTE table lock (not taken), or returns NULL if no PTE table; + - pte_offset_map_ro_nolock() + maps PTE, returns pointer to PTE with pointer to its PTE table + lock (not taken), or returns NULL if no PTE table; + - pte_offset_map_rw_nolock() + maps PTE, returns pointer to PTE with pointer to its PTE table + lock (not taken) and the value of its pmd entry, or returns NULL + if no PTE table; - pte_offset_map() maps PTE, returns pointer to PTE, or returns NULL if no PTE table; - pte_unmap() diff --git a/include/linux/mm.h b/include/linux/mm.h index 61fff5d34ed5..0cf45d4b7286 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3017,6 +3017,11 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); +pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp); +pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, pmd_t *pmdvalp, + spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index a78a4adf711a..daa08b91ab6b 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -317,6 +317,31 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, return pte; } +pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp) +{ + pmd_t pmdval; + pte_t *pte; + + pte = __pte_offset_map(pmd, addr, &pmdval); + if (likely(pte)) + *ptlp = pte_lockptr(mm, &pmdval); + return pte; +} + +pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, pmd_t *pmdvalp, + spinlock_t **ptlp) +{ + pte_t *pte; + + VM_WARN_ON_ONCE(!pmdvalp); + pte = __pte_offset_map(pmd, addr, pmdvalp); + if (likely(pte)) + *ptlp = pte_lockptr(mm, pmdvalp); + return pte; +} + /* * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation * __pte_offset_map_lock() below, is usually called with the pmd pointer for @@ -356,6 +381,29 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, * recheck *pmd once the lock is taken; in practice, no callsite needs that - * either the mmap_lock for write, or pte_same() check on contents, is enough. 
* + * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); + * but when successful, it also outputs a pointer to the spinlock in ptlp - as + * pte_offset_map_lock() does, but in this case without locking it. This helps + * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time + * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock + * pointer for the page table that it returns. Even after grabbing the spinlock, + * we might be looking either at a page table that is still mapped or one that + * was unmapped and is about to get freed. But for R/O access this is sufficient. + * So it is only applicable for read-only cases where any modification operations + * to the page table are not allowed even if the corresponding spinlock is held + * afterwards. + * + * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like + * pte_offset_map_ro_nolock(); but when successful, it also outputs the pmdval. + * It is applicable for may-write cases where any modification operations to the + * page table may happen after the corresponding spinlock is held afterwards. + * But the users should make sure the page table is stable like checking pte_same() + * or checking pmd_same() by using the output pmdval before performing the write + * operations. + * + * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will + * be read-only/read-write protected. + * * Note that free_pgtables(), used after unmapping detached vmas, or when exiting the whole mm, does not take page table lock before freeing a page table, and may not use RCU at all: "outsiders" like khugepaged should avoid -- 2.51.0 From 7aefa59899e576db093ff077fd1ebd0d1b748f33 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:15 +0800 Subject: [PATCH 05/16] powerpc: assert_pte_locked() use pte_offset_map_ro_nolock() In assert_pte_locked(), we just get the ptl and assert that it is already held, so convert it to using pte_offset_map_ro_nolock(). Link: https://lkml.kernel.org/r/42559e042eb6fc3129a40f710d671712030646b4.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- arch/powerpc/mm/pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 7316396e452d..61df5aed7989 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -398,7 +398,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) */ if (pmd_none(*pmd)) return; - pte = pte_offset_map_nolock(mm, pmd, addr, &ptl); + pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); BUG_ON(!pte); assert_spin_locked(ptl); pte_unmap(pte); -- 2.51.0 From bd6ad65ddcbb2d0aceb843d31d4f1bd8d628200a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:16 +0800 Subject: [PATCH 06/16] mm: filemap: filemap_fault_recheck_pte_none() use pte_offset_map_ro_nolock() In filemap_fault_recheck_pte_none(), we just do a pte_none() check, so convert it to using pte_offset_map_ro_nolock(). 
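For comparison, the read-only pattern needs neither a pmdval nor a recheck; a minimal sketch mirroring the conversion below (illustrative only, with the mm/pmd/addr variables assumed from context):

	/* Sketch only - not from the series: read-only usage. */
	spinlock_t *ptl;
	pte_t *pte;
	bool none;

	pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl);
	if (!pte)
		return;			/* no PTE page table here */
	spin_lock(ptl);
	none = pte_none(ptep_get(pte));	/* read-only access under the PTL */
	pte_unmap_unlock(pte, ptl);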
Link: https://lkml.kernel.org/r/9f7cbbaa772385ced1b8931b67a8b9d246c9b82d.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 36d22968be9a..630a1c431ea1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3258,8 +3258,8 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return 0; - ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); + ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); if (unlikely(!ptep)) return VM_FAULT_NOPAGE; -- 2.51.0 From c85507857bb8904f8631b3a89b19aa73b1f77e48 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:17 +0800 Subject: [PATCH 07/16] mm: khugepaged: __collapse_huge_page_swapin() use pte_offset_map_ro_nolock() In __collapse_huge_page_swapin(), we just use the ptl for the pte_same() check in do_swap_page(). In other places, we directly use pte_offset_map_lock(), so convert it to using pte_offset_map_ro_nolock(). Link: https://lkml.kernel.org/r/dc97a6c3cb9ea80cab30c5626eeea79959d93258.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ed1a225dd198..8e0d05bd3d56 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1014,7 +1014,11 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, }; if (!pte++) { - pte = pte_offset_map_nolock(mm, pmd, address, &ptl); + /* + * Here the ptl is only used to check pte_same() in + * do_swap_page(), so the readonly version is enough. + */ + pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl); if (!pte) { mmap_read_unlock(mm); result = SCAN_PMD_NULL; -- 2.51.0 From fc9c45b71f43cafcc0435dd4c7a2d3b99955a0fa Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:18 +0800 Subject: [PATCH 08/16] arm: adjust_pte() use pte_offset_map_rw_nolock() In do_adjust_pte(), we may modify the pte entry. The corresponding pmd entry may have been modified concurrently. Therefore, in order to ensure the stability of the pmd entry, use pte_offset_map_rw_nolock() to replace pte_offset_map_nolock(), and do the pmd_same() check after holding the PTL. All callers of update_mmu_cache_range() hold the vmf->ptl, so we can determine whether split PTE locks are being used by doing the following, just as we do elsewhere in the kernel. ptl != vmf->ptl And then we can delete do_pte_lock() and do_pte_unlock(). 
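Schematically, the resulting locking sequence in adjust_pte() looks like this (condensed from the diff below; pmdval is the value output by pte_offset_map_rw_nolock()):

	if (ptl != vmf->ptl) {		/* split PTE locks are in use */
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
		if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
			pte_unmap_unlock(pte, ptl);
			goto again;
		}
	}
	ret = do_adjust_pte(vma, address, pfn, pte);
	if (ptl != vmf->ptl)
		spin_unlock(ptl);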
Link: https://lkml.kernel.org/r/0eaf6b69aeb2fe35092a633fed12537efe645303.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- arch/arm/mm/fault-armv.c | 53 +++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 831793cd6ff9..2bec87c3327d 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -61,32 +61,8 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address, return ret; } -#if defined(CONFIG_SPLIT_PTE_PTLOCKS) -/* - * If we are using split PTE locks, then we need to take the page - * lock here. Otherwise we are using shared mm->page_table_lock - * which is already locked, thus cannot take it. - */ -static inline void do_pte_lock(spinlock_t *ptl) -{ - /* - * Use nested version here to indicate that we are already - * holding one similar spinlock. - */ - spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); -} - -static inline void do_pte_unlock(spinlock_t *ptl) -{ - spin_unlock(ptl); -} -#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ -static inline void do_pte_lock(spinlock_t *ptl) {} -static inline void do_pte_unlock(spinlock_t *ptl) {} -#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ - static int adjust_pte(struct vm_area_struct *vma, unsigned long address, - unsigned long pfn) + unsigned long pfn, struct vm_fault *vmf) { spinlock_t *ptl; pgd_t *pgd; @@ -94,6 +70,7 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + pmd_t pmdval; int ret; pgd = pgd_offset(vma->vm_mm, address); @@ -112,20 +89,33 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, if (pmd_none_or_clear_bad(pmd)) return 0; +again: /* * This is called while another page table is mapped, so we * must use the nested version. This also means we need to * open-code the spin-locking. */ - pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl); + pte = pte_offset_map_rw_nolock(vma->vm_mm, pmd, address, &pmdval, &ptl); if (!pte) return 0; - do_pte_lock(ptl); + /* + * If we are using split PTE locks, then we need to take the page + * lock here. Otherwise we are using shared mm->page_table_lock + * which is already locked, thus cannot take it. 
+ */ + if (ptl != vmf->ptl) { + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { + pte_unmap_unlock(pte, ptl); + goto again; + } + } ret = do_adjust_pte(vma, address, pfn, pte); - do_pte_unlock(ptl); + if (ptl != vmf->ptl) + spin_unlock(ptl); pte_unmap(pte); return ret; @@ -133,7 +123,8 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, static void make_coherent(struct address_space *mapping, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, unsigned long pfn) + unsigned long addr, pte_t *ptep, unsigned long pfn, + struct vm_fault *vmf) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *mpnt; @@ -160,7 +151,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, if (!(mpnt->vm_flags & VM_MAYSHARE)) continue; offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; - aliases += adjust_pte(mpnt, mpnt->vm_start + offset, pfn); + aliases += adjust_pte(mpnt, mpnt->vm_start + offset, pfn, vmf); } flush_dcache_mmap_unlock(mapping); if (aliases) @@ -203,7 +194,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, __flush_dcache_folio(mapping, folio); if (mapping) { if (cache_is_vivt()) - make_coherent(mapping, vma, addr, ptep, pfn); + make_coherent(mapping, vma, addr, ptep, pfn, vmf); else if (vma->vm_flags & VM_EXEC) __flush_icache_all(); } -- 2.51.0 From d9c1ddf37b4c287597a4578e70d19ed68d536be8 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:19 +0800 Subject: [PATCH 09/16] mm: handle_pte_fault() use pte_offset_map_rw_nolock() In handle_pte_fault(), we may modify the vmf->pte after acquiring the vmf->ptl, so convert it to using pte_offset_map_rw_nolock(). Since we will do the pte_same() check, there is no need to get pmdval to do the pmd_same() check; just pass a dummy variable to it. Link: https://lkml.kernel.org/r/af8d694853b44c5a6018403ae435440e275854c7.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c8d5d040d6ab..ce5cd8d4c401 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5742,14 +5742,24 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) vmf->pte = NULL; vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; } else { + pmd_t dummy_pmdval; + /* * A regular pmd is established and it can't morph into a huge * pmd by anon khugepaged, since that takes mmap_lock in write * mode; but shmem or file collapse to THP could still morph * it into a huge pmd: just retry later if so. + * + * Use the maywrite version to indicate that vmf->pte may be + * modified, but since we will use pte_same() to detect the + * change of the !pte_none() entry, there is no need to recheck + * the pmdval. Here we choose to pass a dummy variable instead + * of NULL, which helps new users think about why this place is + * special. 
*/ - vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); + vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &dummy_pmdval, + &vmf->ptl); if (unlikely(!vmf->pte)) return 0; vmf->orig_pte = ptep_get_lockless(vmf->pte); -- 2.51.0 From 6dfd0d2cb3691040979ddbd6c758956694a3185d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:20 +0800 Subject: [PATCH 10/16] mm: khugepaged: collapse_pte_mapped_thp() use pte_offset_map_rw_nolock() In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after acquiring the ptl, so convert it to using pte_offset_map_rw_nolock(). At this time, the pte_same() check is not performed after the PTL is held. So we should get pgt_pmd and do the pmd_same() check after the PTL is held. Link: https://lkml.kernel.org/r/055e42db68da00ac8ecab94bd2633c7cd965eb1c.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8e0d05bd3d56..6f8d46d107b4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1608,7 +1608,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) pml = pmd_lock(mm, pmd); - start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl); + start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto abort; if (!pml) @@ -1616,6 +1616,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, else if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) + goto abort; + /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { @@ -1648,7 +1651,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, nr_ptes++; } - pte_unmap(start_pte); if (!pml) spin_unlock(ptl); @@ -1661,14 +1663,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, /* step 4: remove empty page table */ if (!pml) { pml = pmd_lock(mm, pmd); - if (ptl != pml) + if (ptl != pml) { spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) { + flush_tlb_mm(mm); + goto unlock; + } + } } pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); pmdp_get_lockless_sync(); + pte_unmap_unlock(start_pte, ptl); if (ptl != pml) - spin_unlock(ptl); - spin_unlock(pml); + spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); @@ -1688,6 +1695,7 @@ abort: folio_ref_sub(folio, nr_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); } +unlock: if (start_pte) pte_unmap_unlock(start_pte, ptl); if (pml && pml != ptl) -- 2.51.0 From 24553a978b6fbd96fcb83c897c23569351ddebe2 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:21 +0800 Subject: [PATCH 11/16] mm: copy_pte_range() use pte_offset_map_rw_nolock() In copy_pte_range(), we may modify the src_pte entry after holding the src_ptl, so convert it to using pte_offset_map_rw_nolock(). 
Since we already hold the exclusive mmap_lock, and copy_pte_range() and retract_page_tables() use vma->anon_vma to exclude each other, the PTE page is stable, so there is no need to get pmdval and do the pmd_same() check. Link: https://lkml.kernel.org/r/9166f6fad806efbca72e318ab6f0f8af458056a9.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index ce5cd8d4c401..6bda739a60e8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1084,6 +1084,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; + pmd_t dummy_pmdval; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; int progress, max_nr, ret = 0; @@ -1109,7 +1110,15 @@ again: ret = -ENOMEM; goto out; } - src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl); + + /* + * We already hold the exclusive mmap_lock, and copy_pte_range() and + * retract_page_tables() use vma->anon_vma to exclude each other, so + * the PTE page is stable, and there is no need to get pmdval and do + * the pmd_same() check. + */ + src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval, + &src_ptl); if (!src_pte) { pte_unmap_unlock(dst_pte, dst_ptl); /* ret == 0 */ -- 2.51.0 From 838d02354464c301fcddf4f524365846608ac296 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:22 +0800 Subject: [PATCH 12/16] mm: mremap: move_ptes() use pte_offset_map_rw_nolock() In move_ptes(), we may modify the new_pte after acquiring the new_ptl, so convert it to using pte_offset_map_rw_nolock(). Now new_pte is none, so the hpage_collapse_scan_file() path cannot find this page table by traversing file->f_mapping, so there is no concurrency with retract_page_tables(). In addition, we already hold the exclusive mmap_lock, so this new_pte page is stable and there is no need to get pmdval and do the pmd_same() check. Link: https://lkml.kernel.org/r/9d582a09dbcf12e562ac5fe0ba05e9248a58f5e0.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/mremap.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index dda09e957a5d..5917feafe8cc 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -140,6 +140,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, { struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; + pmd_t dummy_pmdval; spinlock_t *old_ptl, *new_ptl; bool force_flush = false; unsigned long len = old_end - old_addr; @@ -175,7 +176,15 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, err = -EAGAIN; goto out; } - new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl); + /* + * Now new_pte is none, so the hpage_collapse_scan_file() path cannot + * find this by traversing file->f_mapping, so there is no concurrency + * with retract_page_tables(). In addition, we already hold the exclusive + * mmap_lock, so this new_pte page is stable and there is no need to get + * pmdval and do the pmd_same() check. 
+ */ + new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval, + &new_ptl); if (!new_pte) { pte_unmap_unlock(old_pte, old_ptl); err = -EAGAIN; -- 2.51.0 From 04965da7a4af790d99c360e79b00bd1f93f80eb1 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:23 +0800 Subject: [PATCH 13/16] mm: page_vma_mapped_walk: map_pte() use pte_offset_map_rw_nolock() In the caller of map_pte(), we may modify the pvmw->pte after acquiring the pvmw->ptl, so convert it to using pte_offset_map_rw_nolock(). At this time, the pte_same() check is not performed after the pvmw->ptl is held, so we should get pmdval and do the pmd_same() check to ensure the stability of pvmw->pmd. Link: https://lkml.kernel.org/r/2620a48f34c9f19864ab0169cdbf253d31a8fcaa.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/page_vma_mapped.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index ae5cc42aa208..ab1671e71cb2 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -13,7 +13,8 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) return false; } -static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) +static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, + spinlock_t **ptlp) { pte_t ptent; @@ -25,6 +26,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) return !!pvmw->pte; } +again: /* * It is important to return the ptl corresponding to pte, * in case *pvmw->pmd changes underneath us; so we need to * proceeds to loop over next ptes, and finds a match later. * Though, in most cases, page lock already protects this. */ - pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd, - pvmw->address, ptlp); + pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd, + pvmw->address, pmdvalp, ptlp); if (!pvmw->pte) return false; @@ -67,8 +69,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) } else if (!pte_present(ptent)) { return false; } + spin_lock(*ptlp); + if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pvmw->pmd)))) { + pte_unmap_unlock(pvmw->pte, *ptlp); + goto again; + } pvmw->ptl = *ptlp; - spin_lock(pvmw->ptl); + return true; } @@ -278,7 +285,7 @@ restart: step_forward(pvmw, PMD_SIZE); continue; } - if (!map_pte(pvmw, &ptl)) { + if (!map_pte(pvmw, &pmde, &ptl)) { if (!pvmw->pte) goto restart; goto next_pte; @@ -305,8 +312,13 @@ next_pte: } while (pte_none(ptep_get(pvmw->pte))); if (!pvmw->ptl) { + spin_lock(ptl); + if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) { + pte_unmap_unlock(pvmw->pte, ptl); + pvmw->pte = NULL; + goto restart; + } pvmw->ptl = ptl; - spin_lock(pvmw->ptl); } goto this_pte; } while (pvmw->address < end); -- 2.51.0 From e9c74b5431632d2ca60725ffff6fc1fe2b80f246 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:24 +0800 Subject: [PATCH 14/16] mm: userfaultfd: move_pages_pte() use pte_offset_map_rw_nolock() In move_pages_pte(), we may modify the dst_pte and src_pte after acquiring the ptl, so convert it to using pte_offset_map_rw_nolock(). 
But since we will use pte_same() to detect the change of the pte entry, there is no need to get pmdval, so just pass a dummy variable to it. Link: https://lkml.kernel.org/r/1530e8fdbfc72eacf3b095babe139ce3d715600a.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ce13c4062647..48b87c62fc3d 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1135,7 +1135,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, spinlock_t *src_ptl, *dst_ptl; pte_t *src_pte = NULL; pte_t *dst_pte = NULL; - + pmd_t dummy_pmdval; struct folio *src_folio = NULL; struct anon_vma *src_anon_vma = NULL; struct mmu_notifier_range range; @@ -1146,7 +1146,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, src_addr, src_addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); retry: - dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl); + /* + * Use the maywrite version to indicate that dst_pte will be modified, + * but since we will use pte_same() to detect the change of the pte + * entry, there is no need to get pmdval, so just pass a dummy variable + * to it. + */ + dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dummy_pmdval, + &dst_ptl); /* Retry if a huge pmd materialized from under us */ if (unlikely(!dst_pte)) { @@ -1154,7 +1161,9 @@ retry: goto out; } - src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl); + /* same as dst_pte */ + src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, + &src_ptl); /* * We held the mmap_lock for reading so MADV_DONTNEED -- 2.51.0 From 2441774f2d2890940f2db21bbc264c7e2f56d1ae Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:25 +0800 Subject: [PATCH 15/16] mm: multi-gen LRU: walk_pte_range() use pte_offset_map_rw_nolock() In walk_pte_range(), we may modify the pte entry after holding the ptl, so convert it to using pte_offset_map_rw_nolock(). At this time, the pte_same() check is not performed after the ptl is held, so we should get pmdval and do the pmd_same() check to ensure the stability of the pmd entry. 
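Because walk_pte_range() takes the PTL with spin_trylock(), the recheck slots in right after the lock attempt; condensed shape of the conversion below:

	pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK,
				       &pmdval, &ptl);
	if (!pte)
		return false;
	if (!spin_trylock(ptl)) {
		pte_unmap(pte);
		return false;
	}
	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		pte_unmap_unlock(pte, ptl);
		return false;
	}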
Link: https://lkml.kernel.org/r/7e9c194a5efacc9609cfd31abb9c7df88b53b530.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/vmscan.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 20dd72c98813..8f25dd6cec54 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3386,8 +3386,10 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); int old_gen, new_gen = lru_gen_from_seq(max_seq); + pmd_t pmdval; - pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl); + pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, + &ptl); if (!pte) return false; if (!spin_trylock(ptl)) { @@ -3395,6 +3397,11 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, return false; } + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { + pte_unmap_unlock(pte, ptl); + return false; + } + arch_enter_lazy_mmu_mode(); restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { -- 2.51.0 From 583e66debd1d5aa8c401aebe924c7406e15579a7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:26 +0800 Subject: [PATCH 16/16] mm: pgtable: remove pte_offset_map_nolock() Now no users are using the pte_offset_map_nolock(), remove it. Link: https://lkml.kernel.org/r/d04f9bbbcde048fb6ffa6f2bdbc6f9b22d5286f9.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/split_page_table_lock.rst | 3 --- include/linux/mm.h | 2 -- mm/pgtable-generic.c | 21 --------------------- 3 files changed, 26 deletions(-) diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst index 08d0e706a32d..581446d4a4eb 100644 --- a/Documentation/mm/split_page_table_lock.rst +++ b/Documentation/mm/split_page_table_lock.rst @@ -16,9 +16,6 @@ There are helpers to lock/unlock a table and other accessor functions: - pte_offset_map_lock() maps PTE and takes PTE table lock, returns pointer to PTE with pointer to its PTE table lock, or returns NULL if no PTE table; - - pte_offset_map_nolock() - maps PTE, returns pointer to PTE with pointer to its PTE table - lock (not taken), or returns NULL if no PTE table; - pte_offset_map_ro_nolock() maps PTE, returns pointer to PTE with pointer to its PTE table lock (not taken), or returns NULL if no PTE table; diff --git a/include/linux/mm.h b/include/linux/mm.h index 0cf45d4b7286..8f5394d75ce2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3015,8 +3015,6 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, return pte; } -pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index daa08b91ab6b..5297dcc38c37 100644 --- a/mm/pgtable-generic.c +++ 
b/mm/pgtable-generic.c @@ -305,18 +305,6 @@ nomap: return NULL; } -pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, spinlock_t **ptlp) -{ - pmd_t pmdval; - pte_t *pte; - - pte = __pte_offset_map(pmd, addr, &pmdval); - if (likely(pte)) - *ptlp = pte_lockptr(mm, &pmdval); - return pte; -} - pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { @@ -372,15 +360,6 @@ pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s * afterwards. * - * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); - * but when successful, it also outputs a pointer to the spinlock in ptlp - as - * pte_offset_map_lock() does, but in this case without locking it. This helps - * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time - * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock - * pointer for the page table that it returns. In principle, the caller should - * recheck *pmd once the lock is taken; in practice, no callsite needs that - - * either the mmap_lock for write, or pte_same() check on contents, is enough. - * * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); * but when successful, it also outputs a pointer to the spinlock in ptlp - as * pte_offset_map_lock() does, but in this case without locking it. This helps -- 2.51.0
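For reference, the *_nolock() family that remains after the final patch, as declared in include/linux/mm.h by the patches above:

	pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
					unsigned long addr, spinlock_t **ptlp);
	pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
					unsigned long addr, pmd_t *pmdvalp,
					spinlock_t **ptlp);

Read-only users take the returned PTL and may only read the mapped PTE page; may-write users must first revalidate, with pte_same() on the entry or pmd_same() against the returned pmdval, before performing any modification.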