From 965b5dd1894f4525f38c1b5f99b0106a07dbb5db Mon Sep 17 00:00:00 2001
From: Tetsuo Handa
Date: Sat, 23 Nov 2024 22:28:34 +0900
Subject: [PATCH 01/16] ocfs2: free inode when ocfs2_get_init_inode() fails

syzbot is reporting busy inodes after unmount, because commit 9c89fe0af826
("ocfs2: Handle error from dquot_initialize()") forgot to call iput() when
new_inode() succeeded and dquot_initialize() failed.

Link: https://lkml.kernel.org/r/e68c0224-b7c6-4784-b4fa-a9fc8c675525@I-love.SAKURA.ne.jp
Fixes: 9c89fe0af826 ("ocfs2: Handle error from dquot_initialize()")
Signed-off-by: Tetsuo Handa
Reported-by: syzbot+0af00f6a2cba2058b5db@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=0af00f6a2cba2058b5db
Tested-by: syzbot+0af00f6a2cba2058b5db@syzkaller.appspotmail.com
Reviewed-by: Joseph Qi
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Cc: Changwei Ge
Cc: Jun Piao
Signed-off-by: Andrew Morton
---
 fs/ocfs2/namei.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 59c92353151a..5550f8afa438 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -200,8 +200,10 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
 	mode = mode_strip_sgid(&nop_mnt_idmap, dir, mode);
 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
 	status = dquot_initialize(inode);
-	if (status)
+	if (status) {
+		iput(inode);
 		return ERR_PTR(status);
+	}
 
 	return inode;
 }
-- 
2.51.0

From 4ae132c693896b0713db572676c90ffd855a4246 Mon Sep 17 00:00:00 2001
From: Mark Brown
Date: Wed, 27 Nov 2024 16:14:22 +0000
Subject: [PATCH 02/16] selftest: hugetlb_dio: fix test naming

The string logged when a test passes or fails is used by the selftest
framework to identify which test is being reported. The hugetlb_dio test
not only uses the same strings for every test that is run, but it also
uses different strings for test passes and failures, which means that
test automation is unable to follow what the test is doing at all.

Pull the existing duplicated logging of the number of free huge pages
before and after the test out of the conditional, and replace that and
the logging of the result with a single ksft_test_result() call which
incorporates the parameters passed into the test into the output.

Link: https://lkml.kernel.org/r/20241127-kselftest-mm-hugetlb-dio-names-v1-1-22aab01bf550@kernel.org
Fixes: fae1980347bf ("selftests: hugetlb_dio: fixup check for initial conditions to skip in the start")
Signed-off-by: Mark Brown
Reviewed-by: Muhammad Usama Anjum
Cc: Donet Tom
Cc: Ritesh Harjani (IBM)
Cc: Shuah Khan
Cc:
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/mm/hugetlb_dio.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c
index 432d5af15e66..db63abe5ee5e 100644
--- a/tools/testing/selftests/mm/hugetlb_dio.c
+++ b/tools/testing/selftests/mm/hugetlb_dio.c
@@ -76,19 +76,15 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off)
 	/* Get the free huge pages after unmap*/
 	free_hpage_a = get_free_hugepages();
 
+	ksft_print_msg("No. Free pages before allocation : %d\n", free_hpage_b);
+	ksft_print_msg("No. Free pages after munmap : %d\n", free_hpage_a);
+
 	/*
 	 * If the no. of free hugepages before allocation and after unmap does
 	 * not match - that means there could still be a page which is pinned.
 	 */
-	if (free_hpage_a != free_hpage_b) {
-		ksft_print_msg("No. Free pages before allocation : %d\n", free_hpage_b);
-		ksft_print_msg("No. Free pages after munmap : %d\n", free_hpage_a);
-		ksft_test_result_fail(": Huge pages not freed!\n");
-	} else {
-		ksft_print_msg("No. Free pages before allocation : %d\n", free_hpage_b);
-		ksft_print_msg("No. Free pages after munmap : %d\n", free_hpage_a);
-		ksft_test_result_pass(": Huge pages freed successfully !\n");
-	}
+	ksft_test_result(free_hpage_a == free_hpage_b,
+			 "free huge pages from %u-%u\n", start_off, end_off);
 }
 
 int main(void)
-- 
2.51.0

From 4a475c0a7eeb3368eca40fe7cb02d157eeddc77a Mon Sep 17 00:00:00 2001
From: Maximilian Heyne
Date: Wed, 27 Nov 2024 12:08:53 +0000
Subject: [PATCH 03/16] selftests/damon: add _damon_sysfs.py to TEST_FILES

When running selftests I encountered the following error message with
some damon tests:

  # Traceback (most recent call last):
  #   File "[...]/damon/./damos_quota.py", line 7, in
  #     import _damon_sysfs
  # ModuleNotFoundError: No module named '_damon_sysfs'

Fix this by adding the _damon_sysfs.py file to TEST_FILES so that it
will be available when running the respective damon selftests.

Link: https://lkml.kernel.org/r/20241127-picks-visitor-7416685b-mheyne@amazon.de
Fixes: 306abb63a8ca ("selftests/damon: implement a python module for test-purpose DAMON sysfs controls")
Signed-off-by: Maximilian Heyne
Reviewed-by: SeongJae Park
Cc: Shuah Khan
Cc:
Signed-off-by: Andrew Morton
---
 tools/testing/selftests/damon/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile
index 5b2a6a5dd1af..812f656260fb 100644
--- a/tools/testing/selftests/damon/Makefile
+++ b/tools/testing/selftests/damon/Makefile
@@ -6,7 +6,7 @@ TEST_GEN_FILES += debugfs_target_ids_read_before_terminate_race
 TEST_GEN_FILES += debugfs_target_ids_pid_leak
 TEST_GEN_FILES += access_memory access_memory_even
 
-TEST_FILES = _chk_dependency.sh _debugfs_common.sh
+TEST_FILES = _chk_dependency.sh _debugfs_common.sh _damon_sysfs.py
 
 # functionality tests
 TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh
-- 
2.51.0

From a220d6b95b1ae12c7626283d7609f0a1438e6437 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 26 Nov 2024 15:52:08 +0100
Subject: [PATCH 04/16] Revert "readahead: properly shorten readahead when
 falling back to do_page_cache_ra()"

This reverts commit 7c877586da3178974a8a94577b6045a48377ff25.

Anders and Philippe have reported that recent kernels occasionally hang
in the readahead code when used with NFS. The problem has been bisected
to 7c877586da3 ("readahead: properly shorten readahead when falling back
to do_page_cache_ra()"). The cause of the problem is that ra->size can
be shrunk by the read_pages() call and subsequently we end up calling
do_page_cache_ra() with a negative (read: huge positive) number of
pages. Let's revert 7c877586da3 for now until we can find a proper way
for the logic in read_pages() and page_cache_ra_order() to coexist. This
can lead to reduced readahead throughput due to readahead window
confusion, but that's better than outright hangs.

Link: https://lkml.kernel.org/r/20241126145208.985-1-jack@suse.cz
Fixes: 7c877586da31 ("readahead: properly shorten readahead when falling back to do_page_cache_ra()")
Reported-by: Anders Blomdell
Reported-by: Philippe Troin
Signed-off-by: Jan Kara
Tested-by: Philippe Troin
Cc: Matthew Wilcox
Cc:
Signed-off-by: Andrew Morton
---
 mm/readahead.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mm/readahead.c b/mm/readahead.c
index 8f1cf599b572..ea650b8b02fb 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -458,8 +458,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
 		struct file_ra_state *ra, unsigned int new_order)
 {
 	struct address_space *mapping = ractl->mapping;
-	pgoff_t start = readahead_index(ractl);
-	pgoff_t index = start;
+	pgoff_t index = readahead_index(ractl);
 	unsigned int min_order = mapping_min_folio_order(mapping);
 	pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
 	pgoff_t mark = index + ra->size - ra->async_size;
@@ -522,7 +521,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
 	if (!err)
 		return;
 fallback:
-	do_page_cache_ra(ractl, ra->size - (index - start), ra->async_size);
+	do_page_cache_ra(ractl, ra->size, ra->async_size);
 }
 
 static unsigned long ractl_max_pages(struct readahead_control *ractl,
-- 
2.51.0

From d699440f58ce9bd71103cc7b692e3ab76a20bfcd Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Mon, 25 Nov 2024 16:52:06 -0800
Subject: [PATCH 05/16] mm: fix vrealloc()'s KASAN poisoning logic

When vrealloc() reuses already allocated vmap_area, we need to
re-annotate poisoned and unpoisoned portions of underlying memory
according to the new size. This results in the KASAN splat recorded at
[1], a KASAN mis-report where there is no actual issue.

Note, hard-coding KASAN_VMALLOC_PROT_NORMAL might not be exactly
correct, but KASAN flag logic is pretty involved and spread out
throughout __vmalloc_node_range_noprof(), so I'm using the bare minimum
flag here and leaving the rest to mm people to refactor this logic and
reuse it here.

Link: https://lkml.kernel.org/r/20241126005206.3457974-1-andrii@kernel.org
Link: https://lore.kernel.org/bpf/67450f9b.050a0220.21d33d.0004.GAE@google.com/ [1]
Fixes: 3ddc2fefe6f3 ("mm: vmalloc: implement vrealloc()")
Signed-off-by: Andrii Nakryiko
Cc: Alexei Starovoitov
Cc: Christoph Hellwig
Cc: Michal Hocko
Cc: Uladzislau Rezki (Sony)
Cc: Vlastimil Babka
Cc:
Signed-off-by: Andrew Morton
---
 mm/vmalloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7ed39d104201..f009b21705c1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -4093,7 +4093,8 @@ void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
 		/* Zero out spare memory. */
 		if (want_init_on_alloc(flags))
 			memset((void *)p + size, 0, old_size - size);
-
+		kasan_poison_vmalloc(p + size, old_size - size);
+		kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL);
 		return (void *)p;
 	}
 
-- 
2.51.0

From 4de22b2a6a7477d84d9a01eb6b62a9117309d722 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Mon, 25 Nov 2024 20:17:18 +0000
Subject: [PATCH 06/16] mm: open-code PageTail in folio_flags() and
 const_folio_flags()

It is unsafe to call PageTail() in dump_page() as page_is_fake_head()
will almost certainly return true when called on a head page that is
copied to the stack. That will cause the VM_BUG_ON_PGFLAGS() in
const_folio_flags() to trigger when it shouldn't.

Fortunately, we don't need to call PageTail() here; it's fine to have a
pointer to a virtual alias of the page's flag word rather than the real
page's flag word.

Link: https://lkml.kernel.org/r/20241125201721.2963278-1-willy@infradead.org
Fixes: fae7d834c43c ("mm: add __dump_folio()")
Signed-off-by: Matthew Wilcox (Oracle)
Cc: Kees Cook
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/page-flags.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 2220bfec278e..cf46ac720802 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -306,7 +306,7 @@ static const unsigned long *const_folio_flags(const struct folio *folio,
 {
 	const struct page *page = &folio->page;
 
-	VM_BUG_ON_PGFLAGS(PageTail(page), page);
+	VM_BUG_ON_PGFLAGS(page->compound_head & 1, page);
 	VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
 	return &page[n].flags;
 }
@@ -315,7 +315,7 @@ static unsigned long *folio_flags(struct folio *folio, unsigned n)
 {
 	struct page *page = &folio->page;
 
-	VM_BUG_ON_PGFLAGS(PageTail(page), page);
+	VM_BUG_ON_PGFLAGS(page->compound_head & 1, page);
 	VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);
 	return &page[n].flags;
 }
-- 
2.51.0

From 6a7de1bf218d75f27f68d6a3f5ae1eb7332b941e Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Mon, 25 Nov 2024 20:17:19 +0000
Subject: [PATCH 07/16] mm: open-code page_folio() in dump_page()

page_folio() calls page_fixed_fake_head() which will misidentify this
page as being a fake head and load off the end of 'precise'. We may
have a pointer to a fake head, but that's OK because it contains the
right information for dump_page().

gcc-15 is smart enough to catch this with -Warray-bounds:

  In function 'page_fixed_fake_head',
      inlined from '_compound_head' at ../include/linux/page-flags.h:251:24,
      inlined from '__dump_page' at ../mm/debug.c:123:11:
  ../include/asm-generic/rwonce.h:44:26: warning: array subscript 9 is outside
  +array bounds of 'struct page[1]' [-Warray-bounds=]

Link: https://lkml.kernel.org/r/20241125201721.2963278-2-willy@infradead.org
Fixes: fae7d834c43c ("mm: add __dump_folio()")
Signed-off-by: Matthew Wilcox (Oracle)
Reported-by: Kees Cook
Cc:
Signed-off-by: Andrew Morton
---
 mm/debug.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mm/debug.c b/mm/debug.c
index aa57d3ffd4ed..95b6ab809c0e 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -124,19 +124,22 @@ static void __dump_page(const struct page *page)
 {
 	struct folio *foliop, folio;
 	struct page precise;
+	unsigned long head;
 	unsigned long pfn = page_to_pfn(page);
 	unsigned long idx, nr_pages = 1;
 	int loops = 5;
 
 again:
 	memcpy(&precise, page, sizeof(*page));
-	foliop = page_folio(&precise);
-	if (foliop == (struct folio *)&precise) {
+	head = precise.compound_head;
+	if ((head & 1) == 0) {
+		foliop = (struct folio *)&precise;
 		idx = 0;
 		if (!folio_test_large(foliop))
 			goto dump;
 		foliop = (struct folio *)page;
 	} else {
+		foliop = (struct folio *)(head - 1);
 		idx = folio_page_idx(foliop, page);
 	}
 
-- 
2.51.0

From 031e04bdc834cda3b054ef6b698503b2b97e8186 Mon Sep 17 00:00:00 2001
From: Marco Elver
Date: Fri, 22 Nov 2024 16:39:47 +0100
Subject: [PATCH 08/16] stackdepot: fix stack_depot_save_flags() in NMI context

Per documentation, stack_depot_save_flags() was meant to be usable from
NMI context if STACK_DEPOT_FLAG_CAN_ALLOC is unset.
However, it still would try to take the pool_lock in an attempt to save
a stack trace in the current pool (if space is available). This could
result in deadlock if an NMI is handled while pool_lock is already held.

To avoid deadlock, only try to take the lock in NMI context and give up
if unsuccessful. The documentation is fixed to clearly convey this.

Link: https://lkml.kernel.org/r/Z0CcyfbPqmxJ9uJH@elver.google.com
Link: https://lkml.kernel.org/r/20241122154051.3914732-1-elver@google.com
Fixes: 4434a56ec209 ("stackdepot: make fast paths lock-less again")
Signed-off-by: Marco Elver
Reported-by: Sebastian Andrzej Siewior
Reviewed-by: Sebastian Andrzej Siewior
Cc: Alexander Potapenko
Cc: Andrey Konovalov
Cc: Dmitry Vyukov
Cc: Oscar Salvador
Cc: Vlastimil Babka
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/stackdepot.h |  6 +++---
 lib/stackdepot.c           | 10 +++++++++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index e9ec32fb97d4..2cc21ffcdaf9 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -147,7 +147,7 @@ static inline int stack_depot_early_init(void) { return 0; }
  * If the provided stack trace comes from the interrupt context, only the part
  * up to the interrupt entry is saved.
  *
- * Context: Any context, but setting STACK_DEPOT_FLAG_CAN_ALLOC is required if
+ * Context: Any context, but unsetting STACK_DEPOT_FLAG_CAN_ALLOC is required if
  *          alloc_pages() cannot be used from the current context. Currently
  *          this is the case for contexts where neither %GFP_ATOMIC nor
  *          %GFP_NOWAIT can be used (NMI, raw_spin_lock).
@@ -156,7 +156,7 @@ static inline int stack_depot_early_init(void) { return 0; }
  */
 depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 					    unsigned int nr_entries,
-					    gfp_t gfp_flags,
+					    gfp_t alloc_flags,
 					    depot_flags_t depot_flags);
 
 /**
@@ -175,7 +175,7 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
  * Return: Handle of the stack trace stored in depot, 0 on failure
  */
 depot_stack_handle_t stack_depot_save(unsigned long *entries,
-				      unsigned int nr_entries, gfp_t gfp_flags);
+				      unsigned int nr_entries, gfp_t alloc_flags);
 
 /**
  * __stack_depot_get_stack_record - Get a pointer to a stack_record struct
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 5ed34cc963fc..245d5b416699 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -630,7 +630,15 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 		prealloc = page_address(page);
 	}
 
-	raw_spin_lock_irqsave(&pool_lock, flags);
+	if (in_nmi()) {
+		/* We can never allocate in NMI context. */
+		WARN_ON_ONCE(can_alloc);
+		/* Best effort; bail if we fail to take the lock. */
+		if (!raw_spin_trylock_irqsave(&pool_lock, flags))
+			goto exit;
+	} else {
+		raw_spin_lock_irqsave(&pool_lock, flags);
+	}
 	printk_deferred_enter();
 
 	/* Try to find again, to avoid concurrently inserting duplicates. */
-- 
2.51.0

From 914eec5e980171bc128e7e24f7a22aa1d803570e Mon Sep 17 00:00:00 2001
From: Wengang Wang
Date: Tue, 19 Nov 2024 09:45:00 -0800
Subject: [PATCH 09/16] ocfs2: update seq_file index in ocfs2_dlm_seq_next

The following INFO level message was seen:

  seq_file: buggy .next function ocfs2_dlm_seq_next [ocfs2] did not
  update position index

Fix: Update *pos (so m->index) to make seq_read_iter() happy, though the
index itself makes no sense to ocfs2_dlm_seq_next.

Link: https://lkml.kernel.org/r/20241119174500.9198-1-wen.gang.wang@oracle.com
Signed-off-by: Wengang Wang
Reviewed-by: Joseph Qi
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Cc: Changwei Ge
Cc: Jun Piao
Cc:
Signed-off-by: Andrew Morton
---
 fs/ocfs2/dlmglue.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 60df52e4c1f8..764ecbd5ad41 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3110,6 +3110,7 @@ static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
 	struct ocfs2_lock_res *iter = v;
 	struct ocfs2_lock_res *dummy = &priv->p_iter_res;
 
+	(*pos)++;
 	spin_lock(&ocfs2_dlm_tracking_lock);
 	iter = ocfs2_dlm_next_res(iter, priv);
 	list_del_init(&dummy->l_debug_list);
-- 
2.51.0

From 51f43d5d82ed2ba3f9a3f9a2390c52f28e42af32 Mon Sep 17 00:00:00 2001
From: David Wang <00107082@163.com>
Date: Fri, 29 Nov 2024 10:52:13 +0800
Subject: [PATCH 10/16] mm/codetag: swap tags when migrate pages

Current solution to adjust codetag references during page migration is
done in 3 steps:

1. sets the codetag reference of the old page as empty (not pointing to
   any codetag);
2. subtracts counters of the new page to compensate for its own
   allocation;
3. sets codetag reference of the new page to point to the codetag of
   the old page.

This does not work if CONFIG_MEM_ALLOC_PROFILING_DEBUG=n because
set_codetag_empty() becomes NOOP. Instead, let's simply swap codetag
references so that the new page is referencing the old codetag and the
old page is referencing the new codetag. This way accounting stays
valid and the logic makes more sense.

Link: https://lkml.kernel.org/r/20241129025213.34836-1-00107082@163.com
Fixes: e0a955bf7f61 ("mm/codetag: add pgalloc_tag_copy()")
Signed-off-by: David Wang <00107082@163.com>
Closes: https://lore.kernel.org/lkml/20241124074318.399027-1-00107082@163.com/
Acked-by: Suren Baghdasaryan
Suggested-by: Suren Baghdasaryan
Acked-by: Yu Zhao
Cc: Kent Overstreet
Signed-off-by: Andrew Morton
---
 include/linux/pgalloc_tag.h |  4 ++--
 lib/alloc_tag.c             | 36 ++++++++++++++++++++++--------------
 mm/migrate.c                |  2 +-
 3 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index 0e43ab653ab6..3469c4b20105 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -231,7 +231,7 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr)
 }
 
 void pgalloc_tag_split(struct folio *folio, int old_order, int new_order);
-void pgalloc_tag_copy(struct folio *new, struct folio *old);
+void pgalloc_tag_swap(struct folio *new, struct folio *old);
 
 void __init alloc_tag_sec_init(void);
 
@@ -245,7 +245,7 @@ static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL
 static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {}
 static inline void alloc_tag_sec_init(void) {}
 static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {}
-static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) {}
+static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {}
 
 #endif /* CONFIG_MEM_ALLOC_PROFILING */
 
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 2414a7ee7ec7..35f7560a309a 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -189,26 +189,34 @@ void pgalloc_tag_split(struct folio *folio, int old_order, int new_order)
 	}
 }
 
-void pgalloc_tag_copy(struct folio *new, struct folio *old)
+void pgalloc_tag_swap(struct folio *new, struct folio *old)
 {
-	union pgtag_ref_handle handle;
-	union codetag_ref ref;
-	struct alloc_tag *tag;
+	union pgtag_ref_handle handle_old, handle_new;
+	union codetag_ref ref_old, ref_new;
+	struct alloc_tag *tag_old, *tag_new;
 
-	tag = pgalloc_tag_get(&old->page);
-	if (!tag)
+	tag_old = pgalloc_tag_get(&old->page);
+	if (!tag_old)
+		return;
+	tag_new = pgalloc_tag_get(&new->page);
+	if (!tag_new)
 		return;
 
-	if (!get_page_tag_ref(&new->page, &ref, &handle))
+	if (!get_page_tag_ref(&old->page, &ref_old, &handle_old))
 		return;
+	if (!get_page_tag_ref(&new->page, &ref_new, &handle_new)) {
+		put_page_tag_ref(handle_old);
+		return;
+	}
+
+	/* swap tags */
+	__alloc_tag_ref_set(&ref_old, tag_new);
+	update_page_tag_ref(handle_old, &ref_old);
+	__alloc_tag_ref_set(&ref_new, tag_old);
+	update_page_tag_ref(handle_new, &ref_new);
 
-	/* Clear the old ref to the original allocation tag. */
-	clear_page_tag_ref(&old->page);
-	/* Decrement the counters of the tag on get_new_folio. */
-	alloc_tag_sub(&ref, folio_size(new));
-	__alloc_tag_ref_set(&ref, tag);
-	update_page_tag_ref(handle, &ref);
-	put_page_tag_ref(handle);
+	put_page_tag_ref(handle_old);
+	put_page_tag_ref(handle_new);
 }
 
 static void shutdown_mem_profiling(bool remove_file)
diff --git a/mm/migrate.c b/mm/migrate.c
index 2ce6b4b814df..cc68583c86f9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -745,7 +745,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
 		folio_set_readahead(newfolio);
 
 	folio_copy_owner(newfolio, folio);
-	pgalloc_tag_copy(newfolio, folio);
+	pgalloc_tag_swap(newfolio, folio);
 
 	mem_cgroup_migrate(folio, newfolio);
 }
-- 
2.51.0

From 89dd878282881306c38f7e354e7614fca98cb9a6 Mon Sep 17 00:00:00 2001
From: John Sperbeck
Date: Thu, 28 Nov 2024 12:39:59 -0800
Subject: [PATCH 11/16] mm: memcg: declare do_memsw_account inline

In commit 66d60c428b23 ("mm: memcg: move legacy memcg event code into
memcontrol-v1.c"), the static do_memsw_account() function was moved
from a .c file to a .h file. Unfortunately, the traditional inline
keyword wasn't added.
If a file (e.g., a unit test) includes the .h file, but doesn't refer to
do_memsw_account(), it will get a warning like:

  mm/memcontrol-v1.h:41:13: warning: unused function 'do_memsw_account' [-Wunused-function]
     41 | static bool do_memsw_account(void)
        |             ^~~~~~~~~~~~~~~~

Link: https://lkml.kernel.org/r/20241128203959.726527-1-jsperbeck@google.com
Fixes: 66d60c428b23 ("mm: memcg: move legacy memcg event code into memcontrol-v1.c")
Signed-off-by: John Sperbeck
Acked-by: Roman Gushchin
Cc: Johannes Weiner
Cc: Michal Hocko
Cc: Muchun Song
Cc: Shakeel Butt
Cc:
Signed-off-by: Andrew Morton
---
 mm/memcontrol-v1.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 0e3b82951d91..144d71b65907 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -38,7 +38,7 @@ void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n);
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
 /* Whether legacy memory+swap accounting is active */
-static bool do_memsw_account(void)
+static inline bool do_memsw_account(void)
 {
 	return !cgroup_subsys_on_dfl(memory_cgrp_subsys);
 }
-- 
2.51.0

From 249608ee47132cab3b1adacd9e463548f57bd316 Mon Sep 17 00:00:00 2001
From: Kalesh Singh
Date: Mon, 18 Nov 2024 13:46:48 -0800
Subject: [PATCH 12/16] mm: respect mmap hint address when aligning for THP

Commit efa7df3e3bb5 ("mm: align larger anonymous mappings on THP
boundaries") updated __get_unmapped_area() to align the start address
for the VMA to a PMD boundary if CONFIG_TRANSPARENT_HUGEPAGE=y.

It does this by effectively looking up a region that is of size
request_size + PMD_SIZE, and aligning up the start to a PMD boundary.

Commit 4ef9ad19e176 ("mm: huge_memory: don't force huge page alignment
on 32 bit") opted out of this for 32bit due to regressions in mmap base
randomization.

Commit d4148aeab412 ("mm, mmap: limit THP alignment of anonymous
mappings to PMD-aligned sizes") restricted this to only mmap sizes that
are multiples of the PMD_SIZE due to reported regressions in some
performance benchmarks -- which seemed mostly due to the reduced
spatial locality of related mappings due to the forced PMD-alignment.

Another unintended side effect has emerged: When a user specifies an
mmap hint address, the THP alignment logic modifies the behavior,
potentially ignoring the hint even if a sufficiently large gap exists
at the requested hint location.

Example Scenario:

Consider the following simplified virtual address (VA) space:

    ...
    0x200000-0x400000 --- VMA A
    0x400000-0x600000 --- Hole
    0x600000-0x800000 --- VMA B
    ...

A call to mmap() with hint=0x400000 and len=0x200000 behaves
differently:

  - Before THP alignment: The requested region (size 0x200000) fits into
    the gap at 0x400000, so the hint is respected.

  - After alignment: The logic searches for a region of size 0x400000
    (len + PMD_SIZE) starting at 0x400000. This search fails due to the
    mapping at 0x600000 (VMA B), and the hint is ignored, falling back
    to arch_get_unmapped_area[_topdown]().

In general the hint is effectively ignored if there is any existing
mapping in the range:

    [mmap_hint + mmap_size, mmap_hint + mmap_size + PMD_SIZE)

This changes the semantics of the mmap hint from "Respect the hint if a
sufficiently large gap exists at the requested location" to "Respect the
hint only if an additional PMD-sized gap exists beyond the requested
size".

This has performance implications for allocators that allocate their
heap using mmap, but try to keep it "as contiguous as possible" by using
the end of the existing heap as the address hint. With the new behavior
it's more likely to get a much less contiguous heap, adding extra
fragmentation and performance overhead.

To restore the expected behavior, don't use
thp_get_unmapped_area_vmflags() when the user provided a hint address,
for anonymous mappings.

Note: As Yang Shi pointed out, the issue still remains for filesystems
which are using thp_get_unmapped_area() for their get_unmapped_area()
op. It is unclear which workloads would regress if we ignore THP
alignment when the hint address is provided for such file-backed
mappings -- so this fix will be handled separately.

Link: https://lkml.kernel.org/r/20241118214650.3667577-1-kaleshsingh@google.com
Fixes: efa7df3e3bb5 ("mm: align larger anonymous mappings on THP boundaries")
Signed-off-by: Kalesh Singh
Reviewed-by: Rik van Riel
Reviewed-by: Vlastimil Babka
Reviewed-by: David Hildenbrand
Cc: Kefeng Wang
Cc: Vlastimil Babka
Cc: Yang Shi
Cc: Rik van Riel
Cc: Ryan Roberts
Cc: Suren Baghdasaryan
Cc: Minchan Kim
Cc: Hans Boehm
Cc: Lokesh Gidra
Cc:
Signed-off-by: Andrew Morton
---
 mm/mmap.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/mmap.c b/mm/mmap.c
index 386429f7db5a..d32b7e701058 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -889,6 +889,7 @@ __get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	if (get_area) {
 		addr = get_area(file, addr, len, pgoff, flags);
 	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)
+		   && !addr /* no hint */
 		   && IS_ALIGNED(len, PMD_SIZE)) {
 		/* Ensures that larger anonymous mappings are THP aligned. */
 		addr = thp_get_unmapped_area_vmflags(file, addr, len,
-- 
2.51.0

From cbb70e45348769172cb24cca2d2b6437f4c9240b Mon Sep 17 00:00:00 2001
From: Lorenzo Stoakes
Date: Mon, 18 Nov 2024 17:54:14 +0000
Subject: [PATCH 13/16] mm: correct typo in MMAP_STATE() macro

We mistakenly refer to len rather than len_ here. The only existing
caller passes len to the len_ parameter, so this has no impact on the
code, but it is obviously incorrect to do this, so fix it.

Link: https://lkml.kernel.org/r/20241118175414.390827-1-lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes
Reviewed-by: Liam R. Howlett
Reviewed-by: Wei Yang
Cc: Jann Horn
Cc: Vlastimil Babka
Signed-off-by: Andrew Morton
---
 mm/vma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vma.c b/mm/vma.c
index 8a454a7bbc80..8e31b7e25aeb 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -35,7 +35,7 @@ struct mmap_state {
 		.mm = mm_,					\
 		.vmi = vmi_,					\
 		.addr = addr_,					\
-		.end = (addr_) + len,				\
+		.end = (addr_) + (len_),			\
 		.pgoff = pgoff_,				\
 		.pglen = PHYS_PFN(len_),			\
 		.flags = flags_,				\
-- 
2.51.0

From d89c8ec0546184267cb211b579514ebaf8916100 Mon Sep 17 00:00:00 2001
From: Randy Dunlap
Date: Fri, 29 Nov 2024 18:24:06 -0800
Subject: [PATCH 14/16] scatterlist: fix incorrect func name in kernel-doc

Fix a kernel-doc warning by making the kernel-doc function description
match the function name:

  include/linux/scatterlist.h:323: warning: expecting prototype for sg_unmark_bus_address().
  Prototype was for sg_dma_unmark_bus_address() instead

Link: https://lkml.kernel.org/r/20241130022406.537973-1-rdunlap@infradead.org
Fixes: 42399301203e ("lib/scatterlist: add flag for indicating P2PDMA segments in an SGL")
Signed-off-by: Randy Dunlap
Cc: Logan Gunthorpe
Cc: Christoph Hellwig
Signed-off-by: Andrew Morton
---
 include/linux/scatterlist.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index c5e2239b550e..d836e7440ee8 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -313,7 +313,7 @@ static inline void sg_dma_mark_bus_address(struct scatterlist *sg)
 }
 
 /**
- * sg_unmark_bus_address - Unmark the scatterlist entry as a bus address
+ * sg_dma_unmark_bus_address - Unmark the scatterlist entry as a bus address
  * @sg: SG entry
  *
  * Description:
-- 
2.51.0

From 3203b3ab0fcf22132caadd72caebfad47bf0dd2b Mon Sep 17 00:00:00 2001
From: David Hildenbrand
Date: Fri, 29 Nov 2024 13:53:03 +0100
Subject: [PATCH 15/16] mm/filemap: don't call folio_test_locked() without a
 reference in next_uptodate_folio()

The folio can get freed + buddy-merged + reallocated in the meantime,
resulting in us calling folio_test_locked() possibly on a tail page.
This makes const_folio_flags() trigger its VM_BUG_ON_PGFLAGS() when
stumbling over the tail page.

Could this result in other issues? Doesn't look like it. False
positives and false negatives don't really matter, because such a folio
would get skipped either way when detecting that it has been reallocated
in the meantime.

Fix it by performing the folio_test_locked() check after grabbing a
reference. If this ever becomes a real problem, we could add a special
helper that racily checks if the bit is set, even on tail pages ... but
let's hope that's not required, so we can just handle it cleaner: work
on the folio after we hold a reference.

Do we really need the folio_test_locked() check if we are going to
trylock briefly after? Well, we can at least avoid a xas_reload().

It's a bit unclear which exact change introduced that issue. Likely,
ever since we made PG_locked obey the PF_NO_TAIL policy it could have
been triggered in some way.

Link: https://lkml.kernel.org/r/20241129125303.4033164-1-david@redhat.com
Fixes: 48c935ad88f5 ("page-flags: define PG_locked behavior on compound pages")
Signed-off-by: David Hildenbrand
Reported-by: syzbot+9f9a7f73fb079b2387a6@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/lkml/674184c9.050a0220.1cc393.0001.GAE@google.com/
Acked-by: Kirill A. Shutemov
Cc: "Matthew Wilcox (Oracle)"
Cc: Hillf Danton
Signed-off-by: Andrew Morton
---
 mm/filemap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 7c76a123ba18..f61cf51c2238 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3501,10 +3501,10 @@ static struct folio *next_uptodate_folio(struct xa_state *xas,
 			continue;
 		if (xa_is_value(folio))
 			continue;
-		if (folio_test_locked(folio))
-			continue;
 		if (!folio_try_get(folio))
 			continue;
+		if (folio_test_locked(folio))
+			goto skip;
 		/* Has the page moved or been split? */
 		if (unlikely(folio != xas_reload(xas)))
 			goto skip;
-- 
2.51.0

From 5c3793604f91123bf49bc792ce697a0bef4c173c Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Sun, 17 Nov 2024 03:38:13 -0800
Subject: [PATCH 16/16] lib: stackinit: hide never-taken branch from compiler

The never-taken branch leads to an invalid bounds condition, which is
by design.
To avoid the unwanted warning from the compiler, hide the variable from
the optimizer.

  ../lib/stackinit_kunit.c: In function 'do_nothing_u16_zero':
  ../lib/stackinit_kunit.c:51:49: error: array subscript 1 is outside array bounds of 'u16[0]' {aka 'short unsigned int[]'} [-Werror=array-bounds=]
     51 | #define DO_NOTHING_RETURN_SCALAR(ptr)	*(ptr)
        |                                      	^~~~~~
  ../lib/stackinit_kunit.c:219:24: note: in expansion of macro 'DO_NOTHING_RETURN_SCALAR'
    219 |	return DO_NOTHING_RETURN_ ## which(ptr + 1);	\
        |	       ^~~~~~~~~~~~~~~~~~

Link: https://lkml.kernel.org/r/20241117113813.work.735-kees@kernel.org
Signed-off-by: Kees Cook
Cc:
Signed-off-by: Andrew Morton
---
 lib/stackinit_kunit.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/stackinit_kunit.c b/lib/stackinit_kunit.c
index c14c6f8e6308..c40818ec9c18 100644
--- a/lib/stackinit_kunit.c
+++ b/lib/stackinit_kunit.c
@@ -212,6 +212,7 @@ static noinline void test_ ## name (struct kunit *test)	\
 static noinline DO_NOTHING_TYPE_ ## which(var_type)		\
 do_nothing_ ## name(var_type *ptr)				\
 {								\
+	OPTIMIZER_HIDE_VAR(ptr);				\
 	/* Will always be true, but compiler doesn't know. */	\
 	if ((unsigned long)ptr > 0x2)				\
 		return DO_NOTHING_RETURN_ ## which(ptr);	\
-- 
2.51.0