From 4d71d9062dd7b2ace56f2351b9f3f06e6c0acf81 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:26 +0800 Subject: [PATCH 01/16] mm: swap: free each cluster individually in swap_entries_put_map_nr() 1. Factor out general swap_entries_put_map() helper to drop entries belonging to one cluster. If entries are last map, free entries in batch, otherwise put entries with cluster lock acquired and released only once. 2. Iterate and call swap_entries_put_map() for each cluster in swap_entries_put_nr() to leverage batch-remove for last map belonging to one cluster and reduce lock acquire/release in fallback case. 3. As swap_entries_put_nr() won't handle SWAP_HAS_CACHE drop, rename it to swap_entries_put_map_nr(). 4. As we won't drop each entry individually with swap_entry_put() now, do reclaim in free_swap_and_cache_nr() because swap_entries_put_map_nr() is a general routine to drop reference and the reclaim work should only be done in free_swap_and_cache_nr(). Remove stale comment accordingly. 
Link: https://lkml.kernel.org/r/20250325162528.68385-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 70 +++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 668684dc9efa..4f4fc74239d6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1463,25 +1463,10 @@ put_out: return NULL; } -static unsigned char swap_entry_put(struct swap_info_struct *si, - swp_entry_t entry) -{ - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); - unsigned char usage; - - ci = lock_cluster(si, offset); - usage = swap_entry_put_locked(si, ci, entry, 1); - unlock_cluster(ci); - - return usage; -} - -static bool swap_entries_put_nr(struct swap_info_struct *si, - swp_entry_t entry, int nr) +static bool swap_entries_put_map(struct swap_info_struct *si, + swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); - unsigned int type = swp_type(entry); struct swap_cluster_info *ci; bool has_cache = false; unsigned char count; @@ -1492,14 +1477,10 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, count = swap_count(data_race(si->swap_map[offset])); if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; - /* cross into another cluster */ - if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) - goto fallback; ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { - unlock_cluster(ci); - goto fallback; + goto locked_fallback; } if (!has_cache) swap_entries_free(si, ci, entry, nr); @@ -1511,15 +1492,34 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, return has_cache; fallback: - for (i = 0; i < nr; i++) { - if (data_race(si->swap_map[offset + i])) { - count = swap_entry_put(si, swp_entry(type, offset + i)); - if (count == SWAP_HAS_CACHE) - has_cache = true; - } else { - WARN_ON_ONCE(1); - } + ci 
= lock_cluster(si, offset); +locked_fallback: + for (i = 0; i < nr; i++, entry.val++) { + count = swap_entry_put_locked(si, ci, entry, 1); + if (count == SWAP_HAS_CACHE) + has_cache = true; + } + unlock_cluster(ci); + return has_cache; + +} + +static bool swap_entries_put_map_nr(struct swap_info_struct *si, + swp_entry_t entry, int nr) +{ + int cluster_nr, cluster_rest; + unsigned long offset = swp_offset(entry); + bool has_cache = false; + + cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER; + while (nr) { + cluster_nr = min(nr, cluster_rest); + has_cache |= swap_entries_put_map(si, entry, cluster_nr); + cluster_rest = SWAPFILE_CLUSTER; + nr -= cluster_nr; + entry.val += cluster_nr; } + return has_cache; } @@ -1818,7 +1818,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. */ - any_only_cache = swap_entries_put_nr(si, entry, nr); + any_only_cache = swap_entries_put_map_nr(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their @@ -1828,13 +1828,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) goto out; /* - * Now go back over the range trying to reclaim the swap cache. This is - * more efficient for large folios because we will only try to reclaim - * the swap once per folio in the common case. If we do - * swap_entry_put() and __try_to_reclaim_swap() in the same loop, the - * latter will get a reference and lock the folio for every individual - * page but will only succeed once the swap slot for every subpage is - * zero. + * Now go back over the range trying to reclaim the swap cache. 
*/ for (offset = start_offset; offset < end_offset; offset += nr) { nr = 1; -- 2.51.0 From d4f8000bd6b0cc33c9dddd369e72a13c4c080cb1 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:27 +0800 Subject: [PATCH 02/16] mm: swap: factor out helper to drop cache of entries within a single cluster Factor out helper swap_entries_put_cache() from put_swap_folio() to serve as a general-purpose routine for dropping cache flag of entries within a single cluster. Link: https://lkml.kernel.org/r/20250325162528.68385-8-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f4fc74239d6..953dcd99006e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1463,6 +1463,22 @@ put_out: return NULL; } +static void swap_entries_put_cache(struct swap_info_struct *si, + swp_entry_t entry, int nr) +{ + unsigned long offset = swp_offset(entry); + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + if (swap_only_has_cache(si, offset, nr)) + swap_entries_free(si, ci, entry, nr); + else { + for (int i = 0; i < nr; i++, entry.val++) + swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); + } + unlock_cluster(ci); +} + static bool swap_entries_put_map(struct swap_info_struct *si, swp_entry_t entry, int nr) { @@ -1607,8 +1623,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) */ void put_swap_folio(struct folio *folio, swp_entry_t entry) { - unsigned long offset = swp_offset(entry); - struct swap_cluster_info *ci; struct swap_info_struct *si; int size = 1 << swap_entry_order(folio_order(folio)); @@ -1616,14 +1630,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (!si) return; - ci = lock_cluster(si, offset); - if (swap_only_has_cache(si, offset, size)) - swap_entries_free(si, ci, entry, size); - else { - for 
(int i = 0; i < size; i++, entry.val++) - swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); - } - unlock_cluster(ci); + swap_entries_put_cache(si, entry, size); } int __swap_count(swp_entry_t entry) -- 2.51.0 From ec9827cd28b13b88517812eb08b13d0ed97ae8f1 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:28 +0800 Subject: [PATCH 03/16] mm: swap: replace cluster_swap_free_nr() with swap_entries_put_[map/cache]() Replace cluster_swap_free_nr() with swap_entries_put_[map/cache]() to remove repeat code and leverage batch-remove for entries with last flag. After removing cluster_swap_free_nr, only functions with "_nr" suffix could free entries spanning cross clusters. Add corresponding description in comment of swap_entries_put_map_nr() as is first function with "_nr" suffix and have a non-suffix variant function swap_entries_put_map(). Link: https://lkml.kernel.org/r/20250325162528.68385-9-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 953dcd99006e..b86637cfb17a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1520,6 +1520,11 @@ locked_fallback: } +/* + * Only functions with "_nr" suffix are able to free entries spanning + * cross multi clusters, so ensure the range is within a single cluster + * when freeing entries with functions without "_nr" suffix. 
+ */ static bool swap_entries_put_map_nr(struct swap_info_struct *si, swp_entry_t entry, int nr) { @@ -1581,21 +1586,6 @@ static void swap_entries_free(struct swap_info_struct *si, partial_free_cluster(si, ci); } -static void cluster_swap_free_nr(struct swap_info_struct *si, - unsigned long offset, int nr_pages, - unsigned char usage) -{ - struct swap_cluster_info *ci; - unsigned long end = offset + nr_pages; - - ci = lock_cluster(si, offset); - do { - swap_entry_put_locked(si, ci, swp_entry(si->type, offset), - usage); - } while (++offset < end); - unlock_cluster(ci); -} - /* * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. @@ -1612,7 +1602,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - cluster_swap_free_nr(sis, offset, nr, 1); + swap_entries_put_map(sis, swp_entry(sis->type, offset), nr); offset += nr; nr_pages -= nr; } @@ -3658,11 +3648,13 @@ int swapcache_prepare(swp_entry_t entry, int nr) return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } +/* + * Caller should ensure entries belong to the same folio so + * the entries won't span cross cluster boundary. + */ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { - unsigned long offset = swp_offset(entry); - - cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); + swap_entries_put_cache(si, entry, nr); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) -- 2.51.0 From f4d1c32489117c9d38206a673d880d23d7d3bb8a Mon Sep 17 00:00:00 2001 From: SoumishDas Date: Tue, 25 Mar 2025 23:43:25 +0530 Subject: [PATCH 04/16] mm: add kernel-doc comment for free_pgd_range() Provide kernel-doc for free_pgd_range() so it's easier to understand what the function does and how it is used. 
Link: https://lkml.kernel.org/r/20250325181325.5774-1-soumish.das@gmail.com Signed-off-by: SoumishDas Signed-off-by: Andrew Morton --- mm/memory.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 86e7e66e3c5b..f41fac7118ba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -278,8 +278,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, p4d_free_tlb(tlb, p4d, start); } -/* - * This function frees user-level page tables of a process. +/** + * free_pgd_range - Unmap and free page tables in the range + * @tlb: the mmu_gather containing pending TLB flush info + * @addr: virtual address start + * @end: virtual address end + * @floor: lowest address boundary + * @ceiling: highest address boundary + * + * This function tears down all user-level page tables in the + * specified virtual address range [@addr..@end). It is part of + * the memory unmap flow. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, -- 2.51.0 From 87a929ae4fb4709c577e1d41d6fb13f567a4aa03 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 3 Mar 2025 13:19:53 +0200 Subject: [PATCH 05/16] hexagon: add syscall_set_return_value() Patch series "ptrace: introduce PTRACE_SET_SYSCALL_INFO API", v7. PTRACE_SET_SYSCALL_INFO is a generic ptrace API that complements PTRACE_GET_SYSCALL_INFO by letting the ptracer modify details of system calls the tracee is blocked in. This API allows ptracers to obtain and modify system call details in a straightforward and architecture-agnostic way, providing a consistent way of manipulating the system call number and arguments across architectures. As in case of PTRACE_GET_SYSCALL_INFO, PTRACE_SET_SYSCALL_INFO also does not aim to address numerous architecture-specific system call ABI peculiarities, like differences in the number of system call arguments for such system calls as pread64 and preadv. 
The current implementation supports changing only those bits of system call information that are used by strace system call tampering, namely, syscall number, syscall arguments, and syscall return value. Support of changing additional details returned by PTRACE_GET_SYSCALL_INFO, such as instruction pointer and stack pointer, could be added later if needed, by using struct ptrace_syscall_info.flags to specify the additional details that should be set. Currently, "flags" and "reserved" fields of struct ptrace_syscall_info must be initialized with zeroes; "arch", "instruction_pointer", and "stack_pointer" fields are currently ignored. PTRACE_SET_SYSCALL_INFO currently supports only PTRACE_SYSCALL_INFO_ENTRY, PTRACE_SYSCALL_INFO_EXIT, and PTRACE_SYSCALL_INFO_SECCOMP operations. Other operations could be added later if needed. Ideally, PTRACE_SET_SYSCALL_INFO should have been introduced along with PTRACE_GET_SYSCALL_INFO, but it didn't happen. The last straw that convinced me to implement PTRACE_SET_SYSCALL_INFO was apparent failure to provide an API of changing the first system call argument on riscv architecture [1]. ptrace(2) man page: long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data); ... PTRACE_SET_SYSCALL_INFO Modify information about the system call that caused the stop. The "data" argument is a pointer to struct ptrace_syscall_info that specifies the system call information to be set. The "addr" argument should be set to sizeof(struct ptrace_syscall_info)). This patch (of 6): hexagon is the only architecture that provides HAVE_ARCH_TRACEHOOK but doesn't define syscall_set_return_value(). Since this function is going to be needed on all HAVE_ARCH_TRACEHOOK architectures to implement PTRACE_SET_SYSCALL_INFO API, add it on hexagon, too. Link: https://lore.kernel.org/all/59505464-c84a-403d-972f-d4b2055eeaac@gmail.com/ [1] Link: https://lkml.kernel.org/r/20250303111953.GB24170@strace.io Signed-off-by: Dmitry V. 
Levin Cc: Alexander Gordeev Cc: Alexey Gladkov (Intel) Cc: Andreas Larsson Cc: anton ivanov Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Brian Cain Cc: Charlie Jenkins Cc: Christian Borntraeger Cc: Christian Zankel Cc: Christophe Leroy Cc: Dave Hansen Cc: Davide Berardi Cc: David S. Miller Cc: Dinh Nguyen Cc: Eugene Syromyatnikov Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Maciej W. Rozycki Cc: Madhavan Srinivasan Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Frysinger Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Oleg Nesterov Cc: Renzo Davoi Cc: Richard Weinberger Cc: Rich Felker Cc: Russel King Cc: Shuah Khan Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Cc: Eugene Syromiatnikov Signed-off-by: Andrew Morton --- arch/hexagon/include/asm/syscall.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/hexagon/include/asm/syscall.h b/arch/hexagon/include/asm/syscall.h index f6e454f18038..951ca0ed8376 100644 --- a/arch/hexagon/include/asm/syscall.h +++ b/arch/hexagon/include/asm/syscall.h @@ -45,6 +45,13 @@ static inline long syscall_get_return_value(struct task_struct *task, return regs->r00; } +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + regs->r00 = (long) error ?: val; +} + static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_HEXAGON; -- 2.51.0 From 17fc7b8f9bce5d3d61ef347dd8cfccb6365dcaa1 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 3 Mar 2025 13:20:09 +0200 Subject: [PATCH 06/16] syscall.h: add syscall_set_arguments() This function is going to be needed on all HAVE_ARCH_TRACEHOOK architectures to implement PTRACE_SET_SYSCALL_INFO API. 
This partially reverts commit 7962c2eddbfe ("arch: remove unused function syscall_set_arguments()") by reusing some of old syscall_set_arguments() implementations. [nathan@kernel.org: fix compile time fortify checks] Link: https://lkml.kernel.org/r/20250408213131.GA2872426@ax162 Link: https://lkml.kernel.org/r/20250303112009.GC24170@strace.io Signed-off-by: Dmitry V. Levin Signed-off-by: Nathan Chancellor Tested-by: Charlie Jenkins Reviewed-by: Charlie Jenkins Acked-by: Helge Deller # parisc Reviewed-by: Maciej W. Rozycki [mips] Cc: Alexander Gordeev Cc: Alexey Gladkov (Intel) Cc: Andreas Larsson Cc: anton ivanov Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Brian Cain Cc: Christian Borntraeger Cc: Christian Zankel Cc: Christophe Leroy Cc: Dave Hansen Cc: Davide Berardi Cc: David S. Miller Cc: Dinh Nguyen Cc: Eugene Syromiatnikov Cc: Eugene Syromyatnikov Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Madhavan Srinivasan Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Frysinger Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Oleg Nesterov Cc: Renzo Davoi Cc: Richard Weinberger Cc: Rich Felker Cc: Russel King Cc: Shuah Khan Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arc/include/asm/syscall.h | 14 +++++++++++ arch/arm/include/asm/syscall.h | 13 ++++++++++ arch/arm64/include/asm/syscall.h | 13 ++++++++++ arch/csky/include/asm/syscall.h | 13 ++++++++++ arch/hexagon/include/asm/syscall.h | 7 ++++++ arch/loongarch/include/asm/syscall.h | 8 ++++++ arch/mips/include/asm/syscall.h | 28 +++++++++++++++++++++ arch/nios2/include/asm/syscall.h | 11 ++++++++ arch/openrisc/include/asm/syscall.h | 7 ++++++ arch/parisc/include/asm/syscall.h | 12 +++++++++ arch/powerpc/include/asm/syscall.h | 10 
++++++++ arch/riscv/include/asm/syscall.h | 12 +++++++++ arch/s390/include/asm/syscall.h | 9 +++++++ arch/sh/include/asm/syscall_32.h | 12 +++++++++ arch/sparc/include/asm/syscall.h | 10 ++++++++ arch/um/include/asm/syscall-generic.h | 14 +++++++++++ arch/x86/include/asm/syscall.h | 36 +++++++++++++++++++++++++++ arch/xtensa/include/asm/syscall.h | 11 ++++++++ include/asm-generic/syscall.h | 16 ++++++++++++ 19 files changed, 256 insertions(+) diff --git a/arch/arc/include/asm/syscall.h b/arch/arc/include/asm/syscall.h index 9709256e31c8..89c1e1736356 100644 --- a/arch/arc/include/asm/syscall.h +++ b/arch/arc/include/asm/syscall.h @@ -67,6 +67,20 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, } } +static inline void +syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, + unsigned long *args) +{ + unsigned long *inside_ptregs = &regs->r0; + unsigned int n = 6; + unsigned int i = 0; + + while (n--) { + *inside_ptregs = args[i++]; + inside_ptregs--; + } +} + static inline int syscall_get_arch(struct task_struct *task) { diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h index fe4326d938c1..21927fa0ae2b 100644 --- a/arch/arm/include/asm/syscall.h +++ b/arch/arm/include/asm/syscall.h @@ -80,6 +80,19 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(args, &regs->ARM_r0 + 1, 5 * sizeof(args[0])); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + memcpy(&regs->ARM_r0, args, 6 * sizeof(args[0])); + /* + * Also copy the first argument into ARM_ORIG_r0 + * so that syscall_get_arguments() would return it + * instead of the previous value. + */ + regs->ARM_ORIG_r0 = regs->ARM_r0; +} + static inline int syscall_get_arch(struct task_struct *task) { /* ARM tasks don't change audit architectures on the fly. 
*/ diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h index ab8e14b96f68..76020b66286b 100644 --- a/arch/arm64/include/asm/syscall.h +++ b/arch/arm64/include/asm/syscall.h @@ -73,6 +73,19 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(args, &regs->regs[1], 5 * sizeof(args[0])); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + memcpy(&regs->regs[0], args, 6 * sizeof(args[0])); + /* + * Also copy the first argument into orig_x0 + * so that syscall_get_arguments() would return it + * instead of the previous value. + */ + regs->orig_x0 = regs->regs[0]; +} + /* * We don't care about endianness (__AUDIT_ARCH_LE bit) here because * AArch64 has the same system calls both on little- and big- endian. diff --git a/arch/csky/include/asm/syscall.h b/arch/csky/include/asm/syscall.h index 0de5734950bf..717f44b4d26f 100644 --- a/arch/csky/include/asm/syscall.h +++ b/arch/csky/include/asm/syscall.h @@ -59,6 +59,19 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, memcpy(args, &regs->a1, 5 * sizeof(args[0])); } +static inline void +syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, + const unsigned long *args) +{ + memcpy(&regs->a0, args, 6 * sizeof(regs->a0)); + /* + * Also copy the first argument into orig_a0 + * so that syscall_get_arguments() would return it + * instead of the previous value. 
+ */ + regs->orig_a0 = regs->a0; +} + static inline int syscall_get_arch(struct task_struct *task) { diff --git a/arch/hexagon/include/asm/syscall.h b/arch/hexagon/include/asm/syscall.h index 951ca0ed8376..1024a6548d78 100644 --- a/arch/hexagon/include/asm/syscall.h +++ b/arch/hexagon/include/asm/syscall.h @@ -33,6 +33,13 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(args, &(&regs->r00)[0], 6 * sizeof(args[0])); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ + memcpy(&(&regs->r00)[0], args, 6 * sizeof(args[0])); +} + static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/loongarch/include/asm/syscall.h b/arch/loongarch/include/asm/syscall.h index e286dc58476e..ff415b3c0a8e 100644 --- a/arch/loongarch/include/asm/syscall.h +++ b/arch/loongarch/include/asm/syscall.h @@ -61,6 +61,14 @@ static inline void syscall_get_arguments(struct task_struct *task, memcpy(&args[1], &regs->regs[5], 5 * sizeof(long)); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ + regs->orig_a0 = args[0]; + memcpy(&regs->regs[5], &args[1], 5 * sizeof(long)); +} + static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_LOONGARCH64; diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index 056aa1b713e2..f1926ce30d4b 100644 --- a/arch/mips/include/asm/syscall.h +++ b/arch/mips/include/asm/syscall.h @@ -74,6 +74,23 @@ static inline void mips_get_syscall_arg(unsigned long *arg, #endif } +static inline void mips_set_syscall_arg(unsigned long *arg, + struct task_struct *task, struct pt_regs *regs, unsigned int n) +{ +#ifdef CONFIG_32BIT + switch (n) { + case 0: case 1: case 2: case 3: + regs->regs[4 + n] = *arg; + return; + case 4: case 5: case 6: case 7: + *arg = regs->args[n] = *arg; + return; + } +#else + regs->regs[4 + n] = *arg; +#endif 
+} + static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { @@ -120,6 +137,17 @@ static inline void syscall_get_arguments(struct task_struct *task, mips_get_syscall_arg(args++, task, regs, i++); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + unsigned long *args) +{ + unsigned int i = 0; + unsigned int n = 6; + + while (n--) + mips_set_syscall_arg(args++, task, regs, i++); +} + extern const unsigned long sys_call_table[]; extern const unsigned long sys32_call_table[]; extern const unsigned long sysn32_call_table[]; diff --git a/arch/nios2/include/asm/syscall.h b/arch/nios2/include/asm/syscall.h index fff52205fb65..526449edd768 100644 --- a/arch/nios2/include/asm/syscall.h +++ b/arch/nios2/include/asm/syscall.h @@ -58,6 +58,17 @@ static inline void syscall_get_arguments(struct task_struct *task, *args = regs->r9; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, const unsigned long *args) +{ + regs->r4 = *args++; + regs->r5 = *args++; + regs->r6 = *args++; + regs->r7 = *args++; + regs->r8 = *args++; + regs->r9 = *args; +} + static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_NIOS2; diff --git a/arch/openrisc/include/asm/syscall.h b/arch/openrisc/include/asm/syscall.h index 903ed882bdec..e6383be2a195 100644 --- a/arch/openrisc/include/asm/syscall.h +++ b/arch/openrisc/include/asm/syscall.h @@ -57,6 +57,13 @@ syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, memcpy(args, &regs->gpr[3], 6 * sizeof(args[0])); } +static inline void +syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, + const unsigned long *args) +{ + memcpy(&regs->gpr[3], args, 6 * sizeof(args[0])); +} + static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_OPENRISC; diff --git a/arch/parisc/include/asm/syscall.h b/arch/parisc/include/asm/syscall.h index 00b127a5e09b..b146d0ae4c77 100644 --- 
a/arch/parisc/include/asm/syscall.h +++ b/arch/parisc/include/asm/syscall.h @@ -29,6 +29,18 @@ static inline void syscall_get_arguments(struct task_struct *tsk, args[0] = regs->gr[26]; } +static inline void syscall_set_arguments(struct task_struct *tsk, + struct pt_regs *regs, + unsigned long *args) +{ + regs->gr[21] = args[5]; + regs->gr[22] = args[4]; + regs->gr[23] = args[3]; + regs->gr[24] = args[2]; + regs->gr[25] = args[1]; + regs->gr[26] = args[0]; +} + static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 3dd36c5e334a..b2715448a660 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -110,6 +110,16 @@ static inline void syscall_get_arguments(struct task_struct *task, } } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + memcpy(&regs->gpr[3], args, 6 * sizeof(args[0])); + + /* Also copy the first argument into orig_gpr3 */ + regs->orig_gpr3 = args[0]; +} + static inline int syscall_get_arch(struct task_struct *task) { if (is_tsk_32bit_task(task)) diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index eceabf59ae48..da56417b6705 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -69,6 +69,18 @@ static inline void syscall_get_arguments(struct task_struct *task, args[5] = regs->a5; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + regs->orig_a0 = args[0]; + regs->a1 = args[1]; + regs->a2 = args[2]; + regs->a3 = args[3]; + regs->a4 = args[4]; + regs->a5 = args[5]; +} + static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_64BIT diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index 0213ec800b57..b87d8bb2cbaa 100644 --- 
a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -76,6 +76,15 @@ static inline void syscall_get_arguments(struct task_struct *task, args[0] = regs->orig_gpr2 & mask; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + regs->orig_gpr2 = args[0]; + for (int n = 1; n < 6; n++) + regs->gprs[2 + n] = args[n]; +} + static inline int syscall_get_arch(struct task_struct *task) { #ifdef CONFIG_COMPAT diff --git a/arch/sh/include/asm/syscall_32.h b/arch/sh/include/asm/syscall_32.h index d87738eebe30..cb51a7528384 100644 --- a/arch/sh/include/asm/syscall_32.h +++ b/arch/sh/include/asm/syscall_32.h @@ -57,6 +57,18 @@ static inline void syscall_get_arguments(struct task_struct *task, args[0] = regs->regs[4]; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + regs->regs[1] = args[5]; + regs->regs[0] = args[4]; + regs->regs[7] = args[3]; + regs->regs[6] = args[2]; + regs->regs[5] = args[1]; + regs->regs[4] = args[0]; +} + static inline int syscall_get_arch(struct task_struct *task) { int arch = AUDIT_ARCH_SH; diff --git a/arch/sparc/include/asm/syscall.h b/arch/sparc/include/asm/syscall.h index 20c109ac8cc9..62a5a78804c4 100644 --- a/arch/sparc/include/asm/syscall.h +++ b/arch/sparc/include/asm/syscall.h @@ -117,6 +117,16 @@ static inline void syscall_get_arguments(struct task_struct *task, } } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + unsigned int i; + + for (i = 0; i < 6; i++) + regs->u_regs[UREG_I0 + i] = args[i]; +} + static inline int syscall_get_arch(struct task_struct *task) { #if defined(CONFIG_SPARC64) && defined(CONFIG_COMPAT) diff --git a/arch/um/include/asm/syscall-generic.h b/arch/um/include/asm/syscall-generic.h index 172b74143c4b..2984feb9d576 100644 --- a/arch/um/include/asm/syscall-generic.h +++ 
b/arch/um/include/asm/syscall-generic.h @@ -62,6 +62,20 @@ static inline void syscall_get_arguments(struct task_struct *task, *args = UPT_SYSCALL_ARG6(r); } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + struct uml_pt_regs *r = &regs->regs; + + UPT_SYSCALL_ARG1(r) = *args++; + UPT_SYSCALL_ARG2(r) = *args++; + UPT_SYSCALL_ARG3(r) = *args++; + UPT_SYSCALL_ARG4(r) = *args++; + UPT_SYSCALL_ARG5(r) = *args++; + UPT_SYSCALL_ARG6(r) = *args; +} + /* See arch/x86/um/asm/syscall.h for syscall_get_arch() definition. */ #endif /* __UM_SYSCALL_GENERIC_H */ diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 7c488ff0c764..b9c249dd9e3d 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -90,6 +90,18 @@ static inline void syscall_get_arguments(struct task_struct *task, args[5] = regs->bp; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + regs->bx = args[0]; + regs->cx = args[1]; + regs->dx = args[2]; + regs->si = args[3]; + regs->di = args[4]; + regs->bp = args[5]; +} + static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_I386; @@ -121,6 +133,30 @@ static inline void syscall_get_arguments(struct task_struct *task, } } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ +# ifdef CONFIG_IA32_EMULATION + if (task->thread_info.status & TS_COMPAT) { + regs->bx = *args++; + regs->cx = *args++; + regs->dx = *args++; + regs->si = *args++; + regs->di = *args++; + regs->bp = *args; + } else +# endif + { + regs->di = *args++; + regs->si = *args++; + regs->dx = *args++; + regs->r10 = *args++; + regs->r8 = *args++; + regs->r9 = *args; + } +} + static inline int syscall_get_arch(struct task_struct *task) { /* x32 tasks should be considered AUDIT_ARCH_X86_64. 
*/ diff --git a/arch/xtensa/include/asm/syscall.h b/arch/xtensa/include/asm/syscall.h index 5ee974bf8330..f9a671cbf933 100644 --- a/arch/xtensa/include/asm/syscall.h +++ b/arch/xtensa/include/asm/syscall.h @@ -68,6 +68,17 @@ static inline void syscall_get_arguments(struct task_struct *task, args[i] = regs->areg[reg[i]]; } +static inline void syscall_set_arguments(struct task_struct *task, + struct pt_regs *regs, + const unsigned long *args) +{ + static const unsigned int reg[] = XTENSA_SYSCALL_ARGUMENT_REGS; + unsigned int i; + + for (i = 0; i < 6; ++i) + regs->areg[reg[i]] = args[i]; +} + asmlinkage long xtensa_rt_sigreturn(void); asmlinkage long xtensa_shmat(int, char __user *, int); asmlinkage long xtensa_fadvise64_64(int, int, diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 182b039ce5fa..292b412f4e9a 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -117,6 +117,22 @@ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args); +/** + * syscall_set_arguments - change system call parameter value + * @task: task of interest, must be in system call entry tracing + * @regs: task_pt_regs() of @task + * @args: array of argument values to store + * + * Changes 6 arguments to the system call. + * The first argument gets value @args[0], and so on. + * + * It's only valid to call this when @task is stopped for tracing on + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. + */ +void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, + const unsigned long *args); + /** * syscall_get_arch - return the AUDIT_ARCH for the current system call * @task: task of interest, must be blocked -- 2.51.0 From cc6622730be77fa88acc4fb0942cd39e6fa5ca27 Mon Sep 17 00:00:00 2001 From: "Dmitry V. 
Levin" Date: Mon, 3 Mar 2025 13:20:20 +0200 Subject: [PATCH 07/16] syscall.h: introduce syscall_set_nr() Similar to syscall_set_arguments() that complements syscall_get_arguments(), introduce syscall_set_nr() that complements syscall_get_nr(). syscall_set_nr() is going to be needed along with syscall_set_arguments() on all HAVE_ARCH_TRACEHOOK architectures to implement PTRACE_SET_SYSCALL_INFO API. Link: https://lkml.kernel.org/r/20250303112020.GD24170@strace.io Signed-off-by: Dmitry V. Levin Tested-by: Charlie Jenkins Reviewed-by: Charlie Jenkins Acked-by: Helge Deller # parisc Reviewed-by: Maciej W. Rozycki # mips Cc: Alexander Gordeev Cc: Alexey Gladkov (Intel) Cc: Andreas Larsson Cc: anton ivanov Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Brian Cain Cc: Christian Borntraeger Cc: Christian Zankel Cc: Christophe Leroy Cc: Dave Hansen Cc: Davide Berardi Cc: David S. Miller Cc: Dinh Nguyen Cc: Eugene Syromiatnikov Cc: Eugene Syromyatnikov Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: "H. 
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Madhavan Srinivasan Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Frysinger Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Oleg Nesterov Cc: Renzo Davoi Cc: Richard Weinberger Cc: Rich Felker Cc: Russel King Cc: Shuah Khan Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arc/include/asm/syscall.h | 11 +++++++++++ arch/arm/include/asm/syscall.h | 24 ++++++++++++++++++++++++ arch/arm64/include/asm/syscall.h | 16 ++++++++++++++++ arch/hexagon/include/asm/syscall.h | 7 +++++++ arch/loongarch/include/asm/syscall.h | 7 +++++++ arch/m68k/include/asm/syscall.h | 7 +++++++ arch/microblaze/include/asm/syscall.h | 7 +++++++ arch/mips/include/asm/syscall.h | 15 +++++++++++++++ arch/nios2/include/asm/syscall.h | 5 +++++ arch/openrisc/include/asm/syscall.h | 6 ++++++ arch/parisc/include/asm/syscall.h | 7 +++++++ arch/powerpc/include/asm/syscall.h | 10 ++++++++++ arch/riscv/include/asm/syscall.h | 7 +++++++ arch/s390/include/asm/syscall.h | 12 ++++++++++++ arch/sh/include/asm/syscall_32.h | 12 ++++++++++++ arch/sparc/include/asm/syscall.h | 12 ++++++++++++ arch/um/include/asm/syscall-generic.h | 5 +++++ arch/x86/include/asm/syscall.h | 7 +++++++ arch/xtensa/include/asm/syscall.h | 7 +++++++ include/asm-generic/syscall.h | 14 ++++++++++++++ 20 files changed, 198 insertions(+) diff --git a/arch/arc/include/asm/syscall.h b/arch/arc/include/asm/syscall.h index 89c1e1736356..728d625a10f1 100644 --- a/arch/arc/include/asm/syscall.h +++ b/arch/arc/include/asm/syscall.h @@ -23,6 +23,17 @@ syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return -1; } +static inline void +syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) +{ + /* + * Unlike syscall_get_nr(), syscall_set_nr() can be 
called only when + * the target task is stopped for tracing on entering syscall, so + * there is no need to have the same check syscall_get_nr() has. + */ + regs->r8 = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/arm/include/asm/syscall.h b/arch/arm/include/asm/syscall.h index 21927fa0ae2b..18b102a30741 100644 --- a/arch/arm/include/asm/syscall.h +++ b/arch/arm/include/asm/syscall.h @@ -68,6 +68,30 @@ static inline void syscall_set_return_value(struct task_struct *task, regs->ARM_r0 = (long) error ? error : val; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + if (nr == -1) { + task_thread_info(task)->abi_syscall = -1; + /* + * When the syscall number is set to -1, the syscall will be + * skipped. In this case the syscall return value has to be + * set explicitly, otherwise the first syscall argument is + * returned as the syscall return value. + */ + syscall_set_return_value(task, regs, -ENOSYS, 0); + return; + } + if ((IS_ENABLED(CONFIG_AEABI) && !IS_ENABLED(CONFIG_OABI_COMPAT))) { + task_thread_info(task)->abi_syscall = nr; + return; + } + task_thread_info(task)->abi_syscall = + (task_thread_info(task)->abi_syscall & ~__NR_SYSCALL_MASK) | + (nr & __NR_SYSCALL_MASK); +} + #define SYSCALL_MAX_ARGS 7 static inline void syscall_get_arguments(struct task_struct *task, diff --git a/arch/arm64/include/asm/syscall.h b/arch/arm64/include/asm/syscall.h index 76020b66286b..712daa90e643 100644 --- a/arch/arm64/include/asm/syscall.h +++ b/arch/arm64/include/asm/syscall.h @@ -61,6 +61,22 @@ static inline void syscall_set_return_value(struct task_struct *task, regs->regs[0] = val; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->syscallno = nr; + if (nr == -1) { + /* + * When the syscall number is set to -1, the syscall will be + * skipped. 
In this case the syscall return value has to be + * set explicitly, otherwise the first syscall argument is + * returned as the syscall return value. + */ + syscall_set_return_value(task, regs, -ENOSYS, 0); + } +} + #define SYSCALL_MAX_ARGS 6 static inline void syscall_get_arguments(struct task_struct *task, diff --git a/arch/hexagon/include/asm/syscall.h b/arch/hexagon/include/asm/syscall.h index 1024a6548d78..70637261817a 100644 --- a/arch/hexagon/include/asm/syscall.h +++ b/arch/hexagon/include/asm/syscall.h @@ -26,6 +26,13 @@ static inline long syscall_get_nr(struct task_struct *task, return regs->r06; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->r06 = nr; +} + static inline void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args) diff --git a/arch/loongarch/include/asm/syscall.h b/arch/loongarch/include/asm/syscall.h index ff415b3c0a8e..81d2733f7b94 100644 --- a/arch/loongarch/include/asm/syscall.h +++ b/arch/loongarch/include/asm/syscall.h @@ -26,6 +26,13 @@ static inline long syscall_get_nr(struct task_struct *task, return regs->regs[11]; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->regs[11] = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/m68k/include/asm/syscall.h b/arch/m68k/include/asm/syscall.h index d1453e850cdd..bf84b160c2eb 100644 --- a/arch/m68k/include/asm/syscall.h +++ b/arch/m68k/include/asm/syscall.h @@ -14,6 +14,13 @@ static inline int syscall_get_nr(struct task_struct *task, return regs->orig_d0; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->orig_d0 = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/microblaze/include/asm/syscall.h b/arch/microblaze/include/asm/syscall.h index 
5eb3f624cc59..b5b6b91fae3e 100644 --- a/arch/microblaze/include/asm/syscall.h +++ b/arch/microblaze/include/asm/syscall.h @@ -14,6 +14,13 @@ static inline long syscall_get_nr(struct task_struct *task, return regs->r12; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->r12 = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/mips/include/asm/syscall.h b/arch/mips/include/asm/syscall.h index f1926ce30d4b..d19e67e2aa6a 100644 --- a/arch/mips/include/asm/syscall.h +++ b/arch/mips/include/asm/syscall.h @@ -41,6 +41,21 @@ static inline long syscall_get_nr(struct task_struct *task, return task_thread_info(task)->syscall; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + /* + * New syscall number has to be assigned to regs[2] because + * it is loaded from there unconditionally after return from + * syscall_trace_enter() invocation. + * + * Consequently, if the syscall was indirect and nr != __NR_syscall, + * then after this assignment the syscall will cease to be indirect. 
+ */ + task_thread_info(task)->syscall = regs->regs[2] = nr; +} + static inline void mips_syscall_update_nr(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/nios2/include/asm/syscall.h b/arch/nios2/include/asm/syscall.h index 526449edd768..8e3eb1d689bb 100644 --- a/arch/nios2/include/asm/syscall.h +++ b/arch/nios2/include/asm/syscall.h @@ -15,6 +15,11 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return regs->r2; } +static inline void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) +{ + regs->r2 = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/openrisc/include/asm/syscall.h b/arch/openrisc/include/asm/syscall.h index e6383be2a195..5e037d9659c5 100644 --- a/arch/openrisc/include/asm/syscall.h +++ b/arch/openrisc/include/asm/syscall.h @@ -25,6 +25,12 @@ syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return regs->orig_gpr11; } +static inline void +syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) +{ + regs->orig_gpr11 = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/parisc/include/asm/syscall.h b/arch/parisc/include/asm/syscall.h index b146d0ae4c77..c11222798ab2 100644 --- a/arch/parisc/include/asm/syscall.h +++ b/arch/parisc/include/asm/syscall.h @@ -17,6 +17,13 @@ static inline long syscall_get_nr(struct task_struct *tsk, return regs->gr[20]; } +static inline void syscall_set_nr(struct task_struct *tsk, + struct pt_regs *regs, + int nr) +{ + regs->gr[20] = nr; +} + static inline void syscall_get_arguments(struct task_struct *tsk, struct pt_regs *regs, unsigned long *args) diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index b2715448a660..4b3c52ed6e9d 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -39,6 +39,16 @@ static inline int 
syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return -1; } +static inline void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) +{ + /* + * Unlike syscall_get_nr(), syscall_set_nr() can be called only when + * the target task is stopped for tracing on entering syscall, so + * there is no need to have the same check syscall_get_nr() has. + */ + regs->gpr[0] = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/riscv/include/asm/syscall.h b/arch/riscv/include/asm/syscall.h index da56417b6705..34313387f977 100644 --- a/arch/riscv/include/asm/syscall.h +++ b/arch/riscv/include/asm/syscall.h @@ -30,6 +30,13 @@ static inline int syscall_get_nr(struct task_struct *task, return regs->a7; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->a7 = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index b87d8bb2cbaa..bd4cb00ccd5e 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -24,6 +24,18 @@ static inline long syscall_get_nr(struct task_struct *task, (regs->int_code & 0xffff) : -1; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + /* + * Unlike syscall_get_nr(), syscall_set_nr() can be called only when + * the target task is stopped for tracing on entering syscall, so + * there is no need to have the same check syscall_get_nr() has. 
+ */ + regs->int_code = (regs->int_code & ~0xffff) | (nr & 0xffff); +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/sh/include/asm/syscall_32.h b/arch/sh/include/asm/syscall_32.h index cb51a7528384..7027d87d901d 100644 --- a/arch/sh/include/asm/syscall_32.h +++ b/arch/sh/include/asm/syscall_32.h @@ -15,6 +15,18 @@ static inline long syscall_get_nr(struct task_struct *task, return (regs->tra >= 0) ? regs->regs[3] : -1L; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + /* + * Unlike syscall_get_nr(), syscall_set_nr() can be called only when + * the target task is stopped for tracing on entering syscall, so + * there is no need to have the same check syscall_get_nr() has. + */ + regs->regs[3] = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/sparc/include/asm/syscall.h b/arch/sparc/include/asm/syscall.h index 62a5a78804c4..b0233924d323 100644 --- a/arch/sparc/include/asm/syscall.h +++ b/arch/sparc/include/asm/syscall.h @@ -25,6 +25,18 @@ static inline long syscall_get_nr(struct task_struct *task, return (syscall_p ? regs->u_regs[UREG_G1] : -1L); } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + /* + * Unlike syscall_get_nr(), syscall_set_nr() can be called only when + * the target task is stopped for tracing on entering syscall, so + * there is no need to have the same check syscall_get_nr() has. 
+ */ + regs->u_regs[UREG_G1] = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/um/include/asm/syscall-generic.h b/arch/um/include/asm/syscall-generic.h index 2984feb9d576..bcd73bcfe577 100644 --- a/arch/um/include/asm/syscall-generic.h +++ b/arch/um/include/asm/syscall-generic.h @@ -21,6 +21,11 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return PT_REGS_SYSCALL_NR(regs); } +static inline void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr) +{ + PT_REGS_SYSCALL_NR(regs) = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index b9c249dd9e3d..c10dbb74cd00 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -38,6 +38,13 @@ static inline int syscall_get_nr(struct task_struct *task, struct pt_regs *regs) return regs->orig_ax; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->orig_ax = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/xtensa/include/asm/syscall.h b/arch/xtensa/include/asm/syscall.h index f9a671cbf933..7db3b489c8ad 100644 --- a/arch/xtensa/include/asm/syscall.h +++ b/arch/xtensa/include/asm/syscall.h @@ -28,6 +28,13 @@ static inline long syscall_get_nr(struct task_struct *task, return regs->syscall; } +static inline void syscall_set_nr(struct task_struct *task, + struct pt_regs *regs, + int nr) +{ + regs->syscall = nr; +} + static inline void syscall_rollback(struct task_struct *task, struct pt_regs *regs) { diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 292b412f4e9a..c5a3ad53beec 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -37,6 +37,20 @@ struct pt_regs; */ int syscall_get_nr(struct 
task_struct *task, struct pt_regs *regs); +/** + * syscall_set_nr - change the system call a task is executing + * @task: task of interest, must be blocked + * @regs: task_pt_regs() of @task + * @nr: system call number + * + * Changes the system call number @task is about to execute. + * + * It's only valid to call this when @task is stopped for tracing on + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. + */ +void syscall_set_nr(struct task_struct *task, struct pt_regs *regs, int nr); + /** * syscall_rollback - roll back registers after an aborted system call * @task: task of interest, must be in system call exit tracing -- 2.51.0 From c354ec9cee909136e168f4f4900d56a491c4d6c5 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 3 Mar 2025 13:20:38 +0200 Subject: [PATCH 08/16] ptrace_get_syscall_info: factor out ptrace_get_syscall_info_op Move the code that calculates the type of the system call stop out of ptrace_get_syscall_info() into a separate function ptrace_get_syscall_info_op() which is going to be used later to implement PTRACE_SET_SYSCALL_INFO API. Link: https://lkml.kernel.org/r/20250303112038.GE24170@strace.io Signed-off-by: Dmitry V. Levin Reviewed-by: Oleg Nesterov Cc: Alexander Gordeev Cc: Alexey Gladkov (Intel) Cc: Andreas Larsson Cc: anton ivanov Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Brian Cain Cc: Charlie Jenkins Cc: Christian Borntraeger Cc: Christian Zankel Cc: Christophe Leroy Cc: Dave Hansen Cc: Davide Berardi Cc: David S. Miller Cc: Dinh Nguyen Cc: Eugene Syromiatnikov Cc: Eugene Syromyatnikov Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Maciej W. 
Rozycki Cc: Madhavan Srinivasan Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Frysinger Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Renzo Davoi Cc: Richard Weinberger Cc: Rich Felker Cc: Russel King Cc: Shuah Khan Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- kernel/ptrace.c | 58 +++++++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d5f89f9ef29f..22e7d74cf4cd 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -921,7 +921,6 @@ ptrace_get_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, unsigned long args[ARRAY_SIZE(info->entry.args)]; int i; - info->op = PTRACE_SYSCALL_INFO_ENTRY; info->entry.nr = syscall_get_nr(child, regs); syscall_get_arguments(child, regs, args); for (i = 0; i < ARRAY_SIZE(args); i++) @@ -943,7 +942,6 @@ ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, * diverge significantly enough. 
*/ ptrace_get_syscall_info_entry(child, regs, info); - info->op = PTRACE_SYSCALL_INFO_SECCOMP; info->seccomp.ret_data = child->ptrace_message; /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ @@ -954,7 +952,6 @@ static unsigned long ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, struct ptrace_syscall_info *info) { - info->op = PTRACE_SYSCALL_INFO_EXIT; info->exit.rval = syscall_get_error(child, regs); info->exit.is_error = !!info->exit.rval; if (!info->exit.is_error) @@ -965,19 +962,8 @@ ptrace_get_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, } static int -ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, - void __user *datavp) +ptrace_get_syscall_info_op(struct task_struct *child) { - struct pt_regs *regs = task_pt_regs(child); - struct ptrace_syscall_info info = { - .op = PTRACE_SYSCALL_INFO_NONE, - .arch = syscall_get_arch(child), - .instruction_pointer = instruction_pointer(regs), - .stack_pointer = user_stack_pointer(regs), - }; - unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); - unsigned long write_size; - /* * This does not need lock_task_sighand() to access * child->last_siginfo because ptrace_freeze_traced() @@ -988,18 +974,42 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, case SIGTRAP | 0x80: switch (child->ptrace_message) { case PTRACE_EVENTMSG_SYSCALL_ENTRY: - actual_size = ptrace_get_syscall_info_entry(child, regs, - &info); - break; + return PTRACE_SYSCALL_INFO_ENTRY; case PTRACE_EVENTMSG_SYSCALL_EXIT: - actual_size = ptrace_get_syscall_info_exit(child, regs, - &info); - break; + return PTRACE_SYSCALL_INFO_EXIT; + default: + return PTRACE_SYSCALL_INFO_NONE; } - break; case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): - actual_size = ptrace_get_syscall_info_seccomp(child, regs, - &info); + return PTRACE_SYSCALL_INFO_SECCOMP; + default: + return PTRACE_SYSCALL_INFO_NONE; + } +} + +static int 
+ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, + void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info = { + .op = ptrace_get_syscall_info_op(child), + .arch = syscall_get_arch(child), + .instruction_pointer = instruction_pointer(regs), + .stack_pointer = user_stack_pointer(regs), + }; + unsigned long actual_size = offsetof(struct ptrace_syscall_info, entry); + unsigned long write_size; + + switch (info.op) { + case PTRACE_SYSCALL_INFO_ENTRY: + actual_size = ptrace_get_syscall_info_entry(child, regs, &info); + break; + case PTRACE_SYSCALL_INFO_EXIT: + actual_size = ptrace_get_syscall_info_exit(child, regs, &info); + break; + case PTRACE_SYSCALL_INFO_SECCOMP: + actual_size = ptrace_get_syscall_info_seccomp(child, regs, &info); break; } -- 2.51.0 From 26bb32768fe6552de044f782a58b3272073fbfc0 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 3 Mar 2025 13:20:44 +0200 Subject: [PATCH 09/16] ptrace: introduce PTRACE_SET_SYSCALL_INFO request PTRACE_SET_SYSCALL_INFO is a generic ptrace API that complements PTRACE_GET_SYSCALL_INFO by letting the ptracer modify details of system calls the tracee is blocked in. This API allows ptracers to obtain and modify system call details in a straightforward and architecture-agnostic way, providing a consistent way of manipulating the system call number and arguments across architectures. As in case of PTRACE_GET_SYSCALL_INFO, PTRACE_SET_SYSCALL_INFO also does not aim to address numerous architecture-specific system call ABI peculiarities, like differences in the number of system call arguments for such system calls as pread64 and preadv. The current implementation supports changing only those bits of system call information that are used by strace system call tampering, namely, syscall number, syscall arguments, and syscall return value. 
Support of changing additional details returned by PTRACE_GET_SYSCALL_INFO, such as instruction pointer and stack pointer, could be added later if needed, by using struct ptrace_syscall_info.flags to specify the additional details that should be set. Currently, "flags" and "reserved" fields of struct ptrace_syscall_info must be initialized with zeroes; "arch", "instruction_pointer", and "stack_pointer" fields are currently ignored. PTRACE_SET_SYSCALL_INFO currently supports only PTRACE_SYSCALL_INFO_ENTRY, PTRACE_SYSCALL_INFO_EXIT, and PTRACE_SYSCALL_INFO_SECCOMP operations. Other operations could be added later if needed. Ideally, PTRACE_SET_SYSCALL_INFO should have been introduced along with PTRACE_GET_SYSCALL_INFO, but it didn't happen. The last straw that convinced me to implement PTRACE_SET_SYSCALL_INFO was apparent failure to provide an API of changing the first system call argument on riscv architecture. ptrace(2) man page: long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data); ... PTRACE_SET_SYSCALL_INFO Modify information about the system call that caused the stop. The "data" argument is a pointer to struct ptrace_syscall_info that specifies the system call information to be set. The "addr" argument should be set to sizeof(struct ptrace_syscall_info). Link: https://lore.kernel.org/all/59505464-c84a-403d-972f-d4b2055eeaac@gmail.com/ Link: https://lkml.kernel.org/r/20250303112044.GF24170@strace.io Signed-off-by: Dmitry V. Levin Reviewed-by: Alexey Gladkov Reviewed-by: Charlie Jenkins Tested-by: Charlie Jenkins Reviewed-by: Eugene Syromiatnikov Reviewed-by: Oleg Nesterov Cc: Alexander Gordeev Cc: Andreas Larsson Cc: anton ivanov Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Brian Cain Cc: Christian Borntraeger Cc: Christian Zankel Cc: Christophe Leroy Cc: Dave Hansen Cc: Davide Berardi Cc: David S. Miller Cc: Dinh Nguyen Cc: Eugene Syromyatnikov Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "H.
Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Maciej W. Rozycki Cc: Madhavan Srinivasan Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Frysinger Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Renzo Davoi Cc: Richard Weinberger Cc: Rich Felker Cc: Russel King Cc: Shuah Khan Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/uapi/linux/ptrace.h | 7 ++- kernel/ptrace.c | 121 +++++++++++++++++++++++++++++++++++- 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index 72c038fc71d0..5f8ef6156752 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -74,6 +74,7 @@ struct seccomp_metadata { }; #define PTRACE_GET_SYSCALL_INFO 0x420e +#define PTRACE_SET_SYSCALL_INFO 0x4212 #define PTRACE_SYSCALL_INFO_NONE 0 #define PTRACE_SYSCALL_INFO_ENTRY 1 #define PTRACE_SYSCALL_INFO_EXIT 2 @@ -81,7 +82,8 @@ struct seccomp_metadata { struct ptrace_syscall_info { __u8 op; /* PTRACE_SYSCALL_INFO_* */ - __u8 pad[3]; + __u8 reserved; + __u16 flags; __u32 arch; __u64 instruction_pointer; __u64 stack_pointer; @@ -98,6 +100,7 @@ struct ptrace_syscall_info { __u64 nr; __u64 args[6]; __u32 ret_data; + __u32 reserved2; } seccomp; }; }; @@ -142,6 +145,8 @@ struct ptrace_sud_config { __u64 len; }; +/* 0x4212 is PTRACE_SET_SYSCALL_INFO */ + /* * These values are stored in task->ptrace_message * by ptrace_stop to describe the current syscall-stop. 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 22e7d74cf4cd..75a84efad40f 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -944,7 +944,10 @@ ptrace_get_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, ptrace_get_syscall_info_entry(child, regs, info); info->seccomp.ret_data = child->ptrace_message; - /* ret_data is the last field in struct ptrace_syscall_info.seccomp */ + /* + * ret_data is the last non-reserved field + * in struct ptrace_syscall_info.seccomp + */ return offsetofend(struct ptrace_syscall_info, seccomp.ret_data); } @@ -1016,6 +1019,118 @@ ptrace_get_syscall_info(struct task_struct *child, unsigned long user_size, write_size = min(actual_size, user_size); return copy_to_user(datavp, &info, write_size) ? -EFAULT : actual_size; } + +static int +ptrace_set_syscall_info_entry(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + unsigned long args[ARRAY_SIZE(info->entry.args)]; + int nr = info->entry.nr; + int i; + + /* + * Check that the syscall number specified in info->entry.nr + * is either a value of type "int" or a sign-extended value + * of type "int". + */ + if (nr != info->entry.nr) + return -ERANGE; + + for (i = 0; i < ARRAY_SIZE(args); i++) { + args[i] = info->entry.args[i]; + /* + * Check that the syscall argument specified in + * info->entry.args[i] is either a value of type + * "unsigned long" or a sign-extended value of type "long". + */ + if (args[i] != info->entry.args[i]) + return -ERANGE; + } + + syscall_set_nr(child, regs, nr); + /* + * If the syscall number is set to -1, setting syscall arguments is not + * just pointless, it would also clobber the syscall return value on + * those architectures that share the same register both for the first + * argument of syscall and its return value. 
+ */ + if (nr != -1) + syscall_set_arguments(child, regs, args); + + return 0; +} + +static int +ptrace_set_syscall_info_seccomp(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + /* + * info->entry is currently a subset of info->seccomp, + * info->seccomp.ret_data is currently ignored. + */ + return ptrace_set_syscall_info_entry(child, regs, info); +} + +static int +ptrace_set_syscall_info_exit(struct task_struct *child, struct pt_regs *regs, + struct ptrace_syscall_info *info) +{ + long rval = info->exit.rval; + + /* + * Check that the return value specified in info->exit.rval + * is either a value of type "long" or a sign-extended value + * of type "long". + */ + if (rval != info->exit.rval) + return -ERANGE; + + if (info->exit.is_error) + syscall_set_return_value(child, regs, rval, 0); + else + syscall_set_return_value(child, regs, 0, rval); + + return 0; +} + +static int +ptrace_set_syscall_info(struct task_struct *child, unsigned long user_size, + const void __user *datavp) +{ + struct pt_regs *regs = task_pt_regs(child); + struct ptrace_syscall_info info; + + if (user_size < sizeof(info)) + return -EINVAL; + + /* + * The compatibility is tracked by info.op and info.flags: if user-space + * does not instruct us to use unknown extra bits from future versions + * of ptrace_syscall_info, we are not going to read them either. + */ + if (copy_from_user(&info, datavp, sizeof(info))) + return -EFAULT; + + /* Reserved for future use. */ + if (info.flags || info.reserved) + return -EINVAL; + + /* Changing the type of the system call stop is not supported yet. 
*/ + if (ptrace_get_syscall_info_op(child) != info.op) + return -EINVAL; + + switch (info.op) { + case PTRACE_SYSCALL_INFO_ENTRY: + return ptrace_set_syscall_info_entry(child, regs, &info); + case PTRACE_SYSCALL_INFO_EXIT: + return ptrace_set_syscall_info_exit(child, regs, &info); + case PTRACE_SYSCALL_INFO_SECCOMP: + return ptrace_set_syscall_info_seccomp(child, regs, &info); + default: + /* Other types of system call stops are not supported yet. */ + return -EINVAL; + } +} #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ int ptrace_request(struct task_struct *child, long request, @@ -1234,6 +1349,10 @@ int ptrace_request(struct task_struct *child, long request, case PTRACE_GET_SYSCALL_INFO: ret = ptrace_get_syscall_info(child, addr, datavp); break; + + case PTRACE_SET_SYSCALL_INFO: + ret = ptrace_set_syscall_info(child, addr, datavp); + break; #endif case PTRACE_SECCOMP_GET_FILTER: -- 2.51.0 From bc6fa711951185fa0fdf5974c50a1c4d0cd65be3 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 3 Mar 2025 13:20:52 +0200 Subject: [PATCH 10/16] selftests/ptrace: add a test case for PTRACE_SET_SYSCALL_INFO Check whether PTRACE_SET_SYSCALL_INFO semantics implemented in the kernel matches userspace expectations. Link: https://lkml.kernel.org/r/20250303112052.GG24170@strace.io Signed-off-by: Dmitry V. Levin Reviewed-by: Oleg Nesterov Cc: Alexander Gordeev Cc: Alexey Gladkov (Intel) Cc: Andreas Larsson Cc: anton ivanov Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Brian Cain Cc: Charlie Jenkins Cc: Christian Borntraeger Cc: Christian Zankel Cc: Christophe Leroy Cc: Dave Hansen Cc: Davide Berardi Cc: David S. Miller Cc: Dinh Nguyen Cc: Eugene Syromiatnikov Cc: Eugene Syromyatnikov Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Maciej W. 
Rozycki Cc: Madhavan Srinivasan Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Frysinger Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Renzo Davoi Cc: Richard Weinberger Cc: Rich Felker Cc: Russel King Cc: Shuah Khan Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- tools/testing/selftests/ptrace/Makefile | 2 +- .../selftests/ptrace/set_syscall_info.c | 519 ++++++++++++++++++ 2 files changed, 520 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/ptrace/set_syscall_info.c diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile index 1c631740a730..c5e0b76ba6ac 100644 --- a/tools/testing/selftests/ptrace/Makefile +++ b/tools/testing/selftests/ptrace/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -std=c99 -pthread -Wall $(KHDR_INCLUDES) -TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess get_set_sud +TEST_GEN_PROGS := get_syscall_info set_syscall_info peeksiginfo vmaccess get_set_sud include ../lib.mk diff --git a/tools/testing/selftests/ptrace/set_syscall_info.c b/tools/testing/selftests/ptrace/set_syscall_info.c new file mode 100644 index 000000000000..4198248ef874 --- /dev/null +++ b/tools/testing/selftests/ptrace/set_syscall_info.c @@ -0,0 +1,519 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2018-2025 Dmitry V. Levin + * All rights reserved. + * + * Check whether PTRACE_SET_SYSCALL_INFO semantics implemented in the kernel + * matches userspace expectations. + */ + +#include "../kselftest_harness.h" +#include +#include +#include +#include +#include +#include + +#if defined(_MIPS_SIM) && _MIPS_SIM == _MIPS_SIM_NABI32 +/* + * MIPS N32 is the only architecture where __kernel_ulong_t + * does not match the bitness of syscall arguments. 
+ */ +typedef unsigned long long kernel_ulong_t; +#else +typedef __kernel_ulong_t kernel_ulong_t; +#endif + +struct si_entry { + int nr; + kernel_ulong_t args[6]; +}; +struct si_exit { + unsigned int is_error; + int rval; +}; + +static unsigned int ptrace_stop; +static pid_t tracee_pid; + +static int +kill_tracee(pid_t pid) +{ + if (!pid) + return 0; + + int saved_errno = errno; + + int rc = kill(pid, SIGKILL); + + errno = saved_errno; + return rc; +} + +static long +sys_ptrace(int request, pid_t pid, unsigned long addr, unsigned long data) +{ + return syscall(__NR_ptrace, request, pid, addr, data); +} + +#define LOG_KILL_TRACEE(fmt, ...) \ + do { \ + kill_tracee(tracee_pid); \ + TH_LOG("wait #%d: " fmt, \ + ptrace_stop, ##__VA_ARGS__); \ + } while (0) + +static void +check_psi_entry(struct __test_metadata *_metadata, + const struct ptrace_syscall_info *info, + const struct si_entry *exp_entry, + const char *text) +{ + unsigned int i; + int exp_nr = exp_entry->nr; +#if defined __s390__ || defined __s390x__ + /* s390 is the only architecture that has 16-bit syscall numbers */ + exp_nr &= 0xffff; +#endif + + ASSERT_EQ(PTRACE_SYSCALL_INFO_ENTRY, info->op) { + LOG_KILL_TRACEE("%s: entry stop mismatch", text); + } + ASSERT_TRUE(info->arch) { + LOG_KILL_TRACEE("%s: entry stop mismatch", text); + } + ASSERT_TRUE(info->instruction_pointer) { + LOG_KILL_TRACEE("%s: entry stop mismatch", text); + } + ASSERT_TRUE(info->stack_pointer) { + LOG_KILL_TRACEE("%s: entry stop mismatch", text); + } + ASSERT_EQ(exp_nr, info->entry.nr) { + LOG_KILL_TRACEE("%s: syscall nr mismatch", text); + } + for (i = 0; i < ARRAY_SIZE(exp_entry->args); ++i) { + ASSERT_EQ(exp_entry->args[i], info->entry.args[i]) { + LOG_KILL_TRACEE("%s: syscall arg #%u mismatch", + text, i); + } + } +} + +static void +check_psi_exit(struct __test_metadata *_metadata, + const struct ptrace_syscall_info *info, + const struct si_exit *exp_exit, + const char *text) +{ + ASSERT_EQ(PTRACE_SYSCALL_INFO_EXIT, info->op) { + 
LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_TRUE(info->arch) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_TRUE(info->instruction_pointer) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_TRUE(info->stack_pointer) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_EQ(exp_exit->is_error, info->exit.is_error) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } + ASSERT_EQ(exp_exit->rval, info->exit.rval) { + LOG_KILL_TRACEE("%s: exit stop mismatch", text); + } +} + +TEST(set_syscall_info) +{ + const pid_t tracer_pid = getpid(); + const kernel_ulong_t dummy[] = { + (kernel_ulong_t) 0xdad0bef0bad0fed0ULL, + (kernel_ulong_t) 0xdad1bef1bad1fed1ULL, + (kernel_ulong_t) 0xdad2bef2bad2fed2ULL, + (kernel_ulong_t) 0xdad3bef3bad3fed3ULL, + (kernel_ulong_t) 0xdad4bef4bad4fed4ULL, + (kernel_ulong_t) 0xdad5bef5bad5fed5ULL, + }; + int splice_in[2], splice_out[2]; + + ASSERT_EQ(0, pipe(splice_in)); + ASSERT_EQ(0, pipe(splice_out)); + ASSERT_EQ(sizeof(dummy), write(splice_in[1], dummy, sizeof(dummy))); + + const struct { + struct si_entry entry[2]; + struct si_exit exit[2]; + } si[] = { + /* change scno, keep non-error rval */ + { + { + { + __NR_gettid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_getppid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, tracer_pid }, { 0, tracer_pid } + } + }, + + /* set scno to -1, keep error rval */ + { + { + { + __NR_chdir, + { + (uintptr_t) ".", + dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + -1, + { + (uintptr_t) ".", + dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 1, -ENOSYS }, { 1, -ENOSYS } + } + }, + + /* keep scno, change non-error rval */ + { + { + { + __NR_getppid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_getppid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, 
tracer_pid }, { 0, tracer_pid + 1 } + } + }, + + /* change arg1, keep non-error rval */ + { + { + { + __NR_chdir, + { + (uintptr_t) "", + dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_chdir, + { + (uintptr_t) ".", + dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, 0 }, { 0, 0 } + } + }, + + /* set scno to -1, change error rval to non-error */ + { + { + { + __NR_gettid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + -1, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 1, -ENOSYS }, { 0, tracer_pid } + } + }, + + /* change scno, change non-error rval to error */ + { + { + { + __NR_chdir, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_getppid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, tracer_pid }, { 1, -EISDIR } + } + }, + + /* change scno and all args, change non-error rval */ + { + { + { + __NR_gettid, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_splice, + { + splice_in[0], 0, splice_out[1], 0, + sizeof(dummy), SPLICE_F_NONBLOCK + } + } + }, { + { 0, sizeof(dummy) }, { 0, sizeof(dummy) + 1 } + } + }, + + /* change arg1, no exit stop */ + { + { + { + __NR_exit_group, + { + dummy[0], dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + }, { + __NR_exit_group, + { + 0, dummy[1], dummy[2], + dummy[3], dummy[4], dummy[5] + } + } + }, { + { 0, 0 }, { 0, 0 } + } + }, + }; + + long rc; + unsigned int i; + + tracee_pid = fork(); + + ASSERT_LE(0, tracee_pid) { + TH_LOG("fork: %m"); + } + + if (tracee_pid == 0) { + /* get the pid before PTRACE_TRACEME */ + tracee_pid = getpid(); + ASSERT_EQ(0, sys_ptrace(PTRACE_TRACEME, 0, 0, 0)) { + TH_LOG("PTRACE_TRACEME: %m"); + } + ASSERT_EQ(0, kill(tracee_pid, SIGSTOP)) { + /* cannot happen */ + TH_LOG("kill SIGSTOP: %m"); + } + for (i = 0; i < ARRAY_SIZE(si); ++i) { + rc = 
syscall(si[i].entry[0].nr, + si[i].entry[0].args[0], + si[i].entry[0].args[1], + si[i].entry[0].args[2], + si[i].entry[0].args[3], + si[i].entry[0].args[4], + si[i].entry[0].args[5]); + if (si[i].exit[1].is_error) { + if (rc != -1 || errno != -si[i].exit[1].rval) + break; + } else { + if (rc != si[i].exit[1].rval) + break; + } + } + /* + * Something went wrong, but in this state tracee + * cannot reliably issue syscalls, so just crash. + */ + *(volatile unsigned char *) (uintptr_t) i = 42; + /* unreachable */ + _exit(i + 1); + } + + for (ptrace_stop = 0; ; ++ptrace_stop) { + struct ptrace_syscall_info info = { + .op = 0xff /* invalid PTRACE_SYSCALL_INFO_* op */ + }; + const size_t size = sizeof(info); + const int expected_entry_size = + (void *) &info.entry.args[6] - (void *) &info; + const int expected_exit_size = + (void *) (&info.exit.is_error + 1) - + (void *) &info; + int status; + + ASSERT_EQ(tracee_pid, wait(&status)) { + /* cannot happen */ + LOG_KILL_TRACEE("wait: %m"); + } + if (WIFEXITED(status)) { + tracee_pid = 0; /* the tracee is no more */ + ASSERT_EQ(0, WEXITSTATUS(status)) { + LOG_KILL_TRACEE("unexpected exit status %u", + WEXITSTATUS(status)); + } + break; + } + ASSERT_FALSE(WIFSIGNALED(status)) { + tracee_pid = 0; /* the tracee is no more */ + LOG_KILL_TRACEE("unexpected signal %u", + WTERMSIG(status)); + } + ASSERT_TRUE(WIFSTOPPED(status)) { + /* cannot happen */ + LOG_KILL_TRACEE("unexpected wait status %#x", status); + } + + ASSERT_LT(ptrace_stop, ARRAY_SIZE(si) * 2) { + LOG_KILL_TRACEE("ptrace stop overflow"); + } + + switch (WSTOPSIG(status)) { + case SIGSTOP: + ASSERT_EQ(0, ptrace_stop) { + LOG_KILL_TRACEE("unexpected signal stop"); + } + ASSERT_EQ(0, sys_ptrace(PTRACE_SETOPTIONS, tracee_pid, + 0, PTRACE_O_TRACESYSGOOD)) { + LOG_KILL_TRACEE("PTRACE_SETOPTIONS: %m"); + } + break; + + case SIGTRAP | 0x80: + ASSERT_LT(0, ptrace_stop) { + LOG_KILL_TRACEE("unexpected syscall stop"); + } + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + 
tracee_pid, size, + (uintptr_t) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #1: %m"); + } + if (ptrace_stop & 1) { + /* entering syscall */ + const struct si_entry *exp_entry = + &si[ptrace_stop / 2].entry[0]; + const struct si_entry *set_entry = + &si[ptrace_stop / 2].entry[1]; + + /* check ptrace_syscall_info before the changes */ + ASSERT_EQ(expected_entry_size, rc) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #1" + ": entry stop mismatch"); + } + check_psi_entry(_metadata, &info, exp_entry, + "PTRACE_GET_SYSCALL_INFO #1"); + + /* apply the changes */ + info.entry.nr = set_entry->nr; + for (i = 0; i < ARRAY_SIZE(set_entry->args); ++i) + info.entry.args[i] = set_entry->args[i]; + ASSERT_EQ(0, sys_ptrace(PTRACE_SET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info)) { + LOG_KILL_TRACEE("PTRACE_SET_SYSCALL_INFO: %m"); + } + + /* check ptrace_syscall_info after the changes */ + memset(&info, 0, sizeof(info)); + info.op = 0xff; + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO: %m"); + } + ASSERT_EQ(expected_entry_size, rc) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #2" + ": entry stop mismatch"); + } + check_psi_entry(_metadata, &info, set_entry, + "PTRACE_GET_SYSCALL_INFO #2"); + } else { + /* exiting syscall */ + const struct si_exit *exp_exit = + &si[ptrace_stop / 2 - 1].exit[0]; + const struct si_exit *set_exit = + &si[ptrace_stop / 2 - 1].exit[1]; + + /* check ptrace_syscall_info before the changes */ + ASSERT_EQ(expected_exit_size, rc) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #1" + ": exit stop mismatch"); + } + check_psi_exit(_metadata, &info, exp_exit, + "PTRACE_GET_SYSCALL_INFO #1"); + + /* apply the changes */ + info.exit.is_error = set_exit->is_error; + info.exit.rval = set_exit->rval; + ASSERT_EQ(0, sys_ptrace(PTRACE_SET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info)) { + LOG_KILL_TRACEE("PTRACE_SET_SYSCALL_INFO: %m"); + } + + /* 
check ptrace_syscall_info after the changes */ + memset(&info, 0, sizeof(info)); + info.op = 0xff; + ASSERT_LT(0, (rc = sys_ptrace(PTRACE_GET_SYSCALL_INFO, + tracee_pid, size, + (uintptr_t) &info))) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #2: %m"); + } + ASSERT_EQ(expected_exit_size, rc) { + LOG_KILL_TRACEE("PTRACE_GET_SYSCALL_INFO #2" + ": exit stop mismatch"); + } + check_psi_exit(_metadata, &info, set_exit, + "PTRACE_GET_SYSCALL_INFO #2"); + } + break; + + default: + LOG_KILL_TRACEE("unexpected stop signal %u", + WSTOPSIG(status)); + abort(); + } + + ASSERT_EQ(0, sys_ptrace(PTRACE_SYSCALL, tracee_pid, 0, 0)) { + LOG_KILL_TRACEE("PTRACE_SYSCALL: %m"); + } + } + + ASSERT_EQ(ptrace_stop, ARRAY_SIZE(si) * 2); +} + +TEST_HARNESS_MAIN -- 2.51.0 From 7eeafde0ac05fa84a74d37af0efe5c6c5270bbef Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 25 Mar 2025 17:04:16 +0900 Subject: [PATCH 11/16] zsmalloc: cleanup headers includes Remove unused headers includes from zsmalloc and move pagemap.h and migrate.h includes into zpdesc header. Link: https://lkml.kernel.org/r/20250325080427.3449359-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/zpdesc.h | 3 +++ mm/zsmalloc.c | 12 +----------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/mm/zpdesc.h b/mm/zpdesc.h index fa47fece2237..57e7a4d6c6ca 100644 --- a/mm/zpdesc.h +++ b/mm/zpdesc.h @@ -7,6 +7,9 @@ #ifndef __MM_ZPDESC_H__ #define __MM_ZPDESC_H__ +#include +#include + /* * struct zpdesc - Memory descriptor for zpool memory. * @flags: Page flags, mostly unused by zsmalloc. 
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 513b08c7c941..999b513c7fdf 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -26,17 +26,10 @@ #include #include #include -#include #include #include #include #include -#include -#include -#include -#include -#include -#include #include #include #include @@ -44,11 +37,8 @@ #include #include #include -#include -#include -#include #include -#include +#include #include "zpdesc.h" #define ZSPAGE_MAGIC 0x58 -- 2.51.0 From a516403787e08119b70ce8bfff985272ef318a58 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 24 Mar 2025 06:53:26 +0000 Subject: [PATCH 12/16] fs/proc: extend the PAGEMAP_SCAN ioctl to report guard regions Patch series "fs/proc: extend the PAGEMAP_SCAN ioctl to report guard regions", v2. Introduce the PAGE_IS_GUARD flag in the PAGEMAP_SCAN ioctl to expose information about guard regions. This allows userspace tools, such as CRIU, to detect and handle guard regions. Currently, CRIU utilizes PAGEMAP_SCAN as a more efficient alternative to parsing /proc/pid/pagemap. Without this change, guard regions are incorrectly reported as swap-anon regions, leading CRIU to attempt dumping them and subsequently failing. The series includes updates to the documentation and selftests to reflect the new functionality. This patch (of 3): Introduce the PAGE_IS_GUARD flag in the PAGEMAP_SCAN ioctl to expose information about guard regions. This allows userspace tools, such as CRIU, to detect and handle guard regions. 
Link: https://lkml.kernel.org/r/20250324065328.107678-1-avagin@google.com Link: https://lkml.kernel.org/r/20250324065328.107678-2-avagin@google.com Signed-off-by: Andrei Vagin Acked-by: David Hildenbrand Reviewed-by: Lorenzo Stoakes Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/pagemap.rst | 1 + fs/proc/task_mmu.c | 17 ++++++++++------- include/uapi/linux/fs.h | 1 + 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index afce291649dd..e60e9211fd9b 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -250,6 +250,7 @@ Following flags about pages are currently supported: - ``PAGE_IS_PFNZERO`` - Page has zero PFN - ``PAGE_IS_HUGE`` - Page is PMD-mapped THP or Hugetlb backed - ``PAGE_IS_SOFT_DIRTY`` - Page is soft-dirty +- ``PAGE_IS_GUARD`` - Page is a part of a guard region The ``struct pm_scan_arg`` is used as the argument of the IOCTL. 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 994cde10e3f4..b9e4fbbdf6e6 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2087,7 +2087,8 @@ static int pagemap_release(struct inode *inode, struct file *file) #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ - PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \ + PAGE_IS_GUARD) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { @@ -2128,12 +2129,14 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; - if (p->masks_of_interest & PAGE_IS_FILE) { - swp = pte_to_swp_entry(pte); - if (is_pfn_swap_entry(swp) && - !folio_test_anon(pfn_swap_entry_folio(swp))) - categories |= PAGE_IS_FILE; - } + swp = pte_to_swp_entry(pte); + if (is_guard_swp_entry(swp)) + categories |= PAGE_IS_GUARD; + else if ((p->masks_of_interest & PAGE_IS_FILE) && + is_pfn_swap_entry(swp) && + !folio_test_anon(pfn_swap_entry_folio(swp))) + categories |= PAGE_IS_FILE; + if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index e762e1af650c..0098b0ce8ccb 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -361,6 +361,7 @@ typedef int __bitwise __kernel_rwf_t; #define PAGE_IS_PFNZERO (1 << 5) #define PAGE_IS_HUGE (1 << 6) #define PAGE_IS_SOFT_DIRTY (1 << 7) +#define PAGE_IS_GUARD (1 << 8) /* * struct page_region - Page region with flags -- 2.51.0 From 267bee0cd87a98832fd9da1976f0f53788b6a2b2 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 24 Mar 2025 06:53:27 +0000 Subject: [PATCH 13/16] tools headers UAPI: sync linux/fs.h with the kernel sources Required for a new PAGEMAP_SCAN test to verify guard region reporting. 
Link: https://lkml.kernel.org/r/20250324065328.107678-3-avagin@google.com Signed-off-by: Andrei Vagin Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/include/uapi/linux/fs.h | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index 8a27bc5c7a7f..24ddf7bc4f25 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -40,6 +40,15 @@ #define BLOCK_SIZE_BITS 10 #define BLOCK_SIZE (1< Date: Mon, 24 Mar 2025 06:53:28 +0000 Subject: [PATCH 14/16] selftests/mm: add PAGEMAP_SCAN guard region test Add a selftest to verify the PAGEMAP_SCAN ioctl correctly reports guard regions using the newly introduced PAGE_IS_GUARD flag. Link: https://lkml.kernel.org/r/20250324065328.107678-4-avagin@google.com Signed-off-by: Andrei Vagin Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/guard-regions.c | 57 ++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index eba43ead13ae..0cd9d236649d 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -2075,4 +2076,60 @@ TEST_F(guard_regions, pagemap) ASSERT_EQ(munmap(ptr, 10 * page_size), 0); } +/* + * Assert that PAGEMAP_SCAN correctly reports guard region ranges. 
+ */ +TEST_F(guard_regions, pagemap_scan) +{ + const unsigned long page_size = self->page_size; + struct page_region pm_regs[10]; + struct pm_scan_arg pm_scan_args = { + .size = sizeof(struct pm_scan_arg), + .category_anyof_mask = PAGE_IS_GUARD, + .return_mask = PAGE_IS_GUARD, + .vec = (long)&pm_regs, + .vec_len = ARRAY_SIZE(pm_regs), + }; + int proc_fd, i; + char *ptr; + + proc_fd = open("/proc/self/pagemap", O_RDONLY); + ASSERT_NE(proc_fd, -1); + + ptr = mmap_(self, variant, NULL, 10 * page_size, + PROT_READ | PROT_WRITE, 0, 0); + ASSERT_NE(ptr, MAP_FAILED); + + pm_scan_args.start = (long)ptr; + pm_scan_args.end = (long)ptr + 10 * page_size; + ASSERT_EQ(ioctl(proc_fd, PAGEMAP_SCAN, &pm_scan_args), 0); + ASSERT_EQ(pm_scan_args.walk_end, (long)ptr + 10 * page_size); + + /* Install a guard region in every other page. */ + for (i = 0; i < 10; i += 2) { + char *ptr_p = &ptr[i * page_size]; + + ASSERT_EQ(syscall(__NR_madvise, ptr_p, page_size, MADV_GUARD_INSTALL), 0); + } + + /* + * Assert ioctl() returns the count of located regions, where each + * region spans every other page within the range of 10 pages. + */ + ASSERT_EQ(ioctl(proc_fd, PAGEMAP_SCAN, &pm_scan_args), 5); + ASSERT_EQ(pm_scan_args.walk_end, (long)ptr + 10 * page_size); + + /* Re-read from pagemap, and assert guard regions are detected. */ + for (i = 0; i < 5; i++) { + long ptr_p = (long)&ptr[2 * i * page_size]; + + ASSERT_EQ(pm_regs[i].start, ptr_p); + ASSERT_EQ(pm_regs[i].end, ptr_p + page_size); + ASSERT_EQ(pm_regs[i].categories, PAGE_IS_GUARD); + } + + ASSERT_EQ(close(proc_fd), 0); + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + TEST_HARNESS_MAIN -- 2.51.0 From 979f3ef0f798d9b4fda4806d37fb1a264fc38566 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 21 Mar 2025 22:02:21 +1000 Subject: [PATCH 15/16] mm: fix parameter passed to page_mapcount_is_type() Patch series "Fix parameter passed to page_mapcount_is_type()", v2. Found by code inspection. 
There are two places where the parameter passed to page_mapcount_is_type() is (page->_mapcount), which is incorrect since it should be one more than the value, as explained in the comments to page_mapcount_is_type(): (a) page_has_type() in page-flags.h (b) __dump_folio() in mm/debug.c PATCH[1] fixes the parameter for (a) PATCH[2] fixes the parameter for (b) Note that the issue doesn't cause any visible impacts due to the safety gap introduced by PGTY_mapcount_underflow limit. So the tag 'Cc: stable@vger.kernel.org' isn't needed. This patch (of 2): As the comments of page_mapcount_is_type() indicate, the parameter passed to the function should be one more than page->_mapcount. However, page->_mapcount (equivalent to page->page_type) is passed to the function by commit 4ffca5a96678 ("mm: support only one page_type per page") where page_type_has_type() is replaced by page_mapcount_is_type(), but the parameter isn't adjusted. Fix it by replacing page_mapcount_is_type() with page_type_has_type() in page_has_type(). Note that the issue doesn't cause any visible impacts due to the safety gap introduced by PGTY_mapcount_underflow limit.
Link: https://lkml.kernel.org/r/20250321120222.1456770-1-gshan@redhat.com Link: https://lkml.kernel.org/r/20250321120222.1456770-2-gshan@redhat.com Fixes: 4ffca5a96678 ("mm: support only one page_type per page") Signed-off-by: Gavin Shan Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: gehao Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e6a21b62dcce..d3909cb1e576 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -982,7 +982,7 @@ static inline bool page_mapcount_is_type(unsigned int mapcount) static inline bool page_has_type(const struct page *page) { - return page_mapcount_is_type(data_race(page->page_type)); + return page_type_has_type(data_race(page->page_type)); } #define FOLIO_TYPE_OPS(lname, fname) \ -- 2.51.0 From 79049bb48a76333646d076e29d4f99fedefdaf0d Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 21 Mar 2025 22:02:22 +1000 Subject: [PATCH 16/16] mm/debug: fix parameter passed to page_mapcount_is_type() As the comments of page_mapcount_is_type() indicate, the parameter passed to the function should be one more than page->_mapcount. However, page->_mapcount is passed to the function by commit 4ffca5a96678 ("mm: support only one page_type per page") where page_type_has_type() is replaced by page_mapcount_is_type(), but the parameter isn't adjusted. Fix the parameter for page_mapcount_is_type() to be (page->_mapcount + 1). Note that the issue doesn't cause any visible impacts due to the safety gap introduced by PGTY_mapcount_underflow limit.
[akpm@linux-foundation.org: simplify __dump_folio(), per David] Link: https://lkml.kernel.org/r/20250321120222.1456770-3-gshan@redhat.com Fixes: 4ffca5a96678 ("mm: support only one page_type per page") Signed-off-by: Gavin Shan Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: gehao Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Signed-off-by: Andrew Morton --- mm/debug.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/debug.c b/mm/debug.c index db83e381a8ae..907382257062 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -71,10 +71,12 @@ static void __dump_folio(struct folio *folio, struct page *page, unsigned long pfn, unsigned long idx) { struct address_space *mapping = folio_mapping(folio); - int mapcount = atomic_read(&page->_mapcount); + int mapcount = atomic_read(&page->_mapcount) + 1; char *type = ""; - mapcount = page_mapcount_is_type(mapcount) ? 0 : mapcount + 1; + if (page_mapcount_is_type(mapcount)) + mapcount = 0; + pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n", folio_ref_count(folio), mapcount, mapping, folio->index + idx, pfn); -- 2.51.0