From a32bf5bb7933fde6f39747499f8ec232b5b5400f Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 7 Jan 2025 14:25:53 +0000 Subject: [PATCH 01/16] selftests/mm: set allocated memory to non-zero content in cow test After commit b1f202060afe ("mm: remap unused subpages to shared zeropage when splitting isolated thp"), cow test cases involving swapping out THPs via madvise(MADV_PAGEOUT) started to be skipped due to the subsequent check via pagemap determining that the memory was not actually swapped out. Logs similar to this were emitted: ... # [RUN] Basic COW after fork() ... with swapped-out, PTE-mapped THP (16 kB) ok 2 # SKIP MADV_PAGEOUT did not work, is swap enabled? # [RUN] Basic COW after fork() ... with single PTE of swapped-out THP (16 kB) ok 3 # SKIP MADV_PAGEOUT did not work, is swap enabled? # [RUN] Basic COW after fork() ... with swapped-out, PTE-mapped THP (32 kB) ok 4 # SKIP MADV_PAGEOUT did not work, is swap enabled? ... The commit in question introduces the behaviour of scanning THPs and if their content is predominantly zero, it splits them and replaces the pages which are wholly zero with the zero page. These cow test cases were getting caught up in this. So let's avoid that by filling the contents of all allocated memory with a non-zero value. With this in place, the tests are passing again. Link: https://lkml.kernel.org/r/20250107142555.1870101-1-ryan.roberts@arm.com Fixes: b1f202060afe ("mm: remap unused subpages to shared zeropage when splitting isolated thp") Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Usama Arif Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 32c6ccc2a6be..1238e1c5aae1 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -758,7 +758,7 @@ static void do_run_with_base_page(test_fn fn, bool swapout) } /* Populate a base page. */ - memset(mem, 0, pagesize); + memset(mem, 1, pagesize); if (swapout) { madvise(mem, pagesize, MADV_PAGEOUT); @@ -824,12 +824,12 @@ static void do_run_with_thp(test_fn fn, enum thp_run thp_run, size_t thpsize) * Try to populate a THP. Touch the first sub-page and test if * we get the last sub-page populated automatically. */ - mem[0] = 0; + mem[0] = 1; if (!pagemap_is_populated(pagemap_fd, mem + thpsize - pagesize)) { ksft_test_result_skip("Did not get a THP populated\n"); goto munmap; } - memset(mem, 0, thpsize); + memset(mem, 1, thpsize); size = thpsize; switch (thp_run) { @@ -1012,7 +1012,7 @@ static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) } /* Populate an huge page. */ - memset(mem, 0, hugetlbsize); + memset(mem, 1, hugetlbsize); /* * We need a total of two hugetlb pages to handle COW/unsharing -- 2.51.0 From 212fe1c0df4a150fb6298db2cfff267ceaba5402 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 7 Jan 2025 14:54:46 +0800 Subject: [PATCH 02/16] zram: fix potential UAF of zram table If zram_meta_alloc failed early, it frees allocated zram->table without setting it NULL. Which will potentially cause zram_meta_free to access the table if user reset an failed and uninitialized device. 
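For illustration, a minimal standalone userspace model of the bug pattern (the names "dev", "meta_alloc" and "meta_free" below are stand-ins for zram, zram_meta_alloc and zram_meta_free, not the driver code itself): freeing the table on the allocation error path without clearing the pointer lets a later reset of the failed device free it a second time.

/*
 * Userspace sketch of the dangling-pointer pattern; compile and run as-is.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct dev {
        int *table;
        void *pool;
};

static bool meta_alloc(struct dev *d, size_t n)
{
        d->table = calloc(n, sizeof(*d->table));
        if (!d->table)
                return false;

        d->pool = NULL;                 /* models zs_create_pool() failing */
        if (!d->pool) {
                free(d->table);
                d->table = NULL;        /* the fix: without this, d->table dangles */
                return false;
        }
        return true;
}

static void meta_free(struct dev *d)
{
        free(d->table);                 /* double free / UAF if the pointer dangled */
        d->table = NULL;
}

int main(void)
{
        struct dev d = { 0 };

        if (!meta_alloc(&d, 16))
                fprintf(stderr, "meta_alloc failed\n");
        meta_free(&d);                  /* models a user-triggered reset of the failed device */
        return 0;
}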
Link: https://lkml.kernel.org/r/20250107065446.86928-1-ryncsn@gmail.com Fixes: 74363ec674cb ("zram: fix uninitialized ZRAM not releasing backing device") Signed-off-by: Kairui Song Reviewed-by: Sergey Senozhatsky Cc: Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 45df5eeabc5e..7903a4da40ac 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1468,6 +1468,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) zram->mem_pool = zs_create_pool(zram->disk->disk_name); if (!zram->mem_pool) { vfree(zram->table); + zram->table = NULL; return false; } -- 2.51.0 From 9fd8fcf171dcc39d2a8ecf221388820fb5fbc00e Mon Sep 17 00:00:00 2001 From: Koichiro Den Date: Wed, 8 Jan 2025 13:28:07 +0900 Subject: [PATCH 03/16] vmstat: disable vmstat_work on vmstat_cpu_down_prep() The upstream commit adcfb264c3ed ("vmstat: disable vmstat_work on vmstat_cpu_down_prep()") introduced another warning during the boot phase so was soon reverted on upstream by commit cd6313beaeae ("Revert "vmstat: disable vmstat_work on vmstat_cpu_down_prep()""). This commit resolves it and reattempts the original fix. Even after mm/vmstat:online teardown, shepherd may still queue work for the dying cpu until the cpu is removed from online mask. While it's quite rare, this means that after unbind_workers() unbinds a per-cpu kworker, it potentially runs vmstat_update for the dying CPU on an irrelevant cpu before entering atomic AP states. When CONFIG_DEBUG_PREEMPT=y, it results in the following error with the backtrace. BUG: using smp_processor_id() in preemptible [00000000] code: \ kworker/7:3/1702 caller is refresh_cpu_vm_stats+0x235/0x5f0 CPU: 0 UID: 0 PID: 1702 Comm: kworker/7:3 Tainted: G Tainted: [N]=TEST Workqueue: mm_percpu_wq vmstat_update Call Trace: dump_stack_lvl+0x8d/0xb0 check_preemption_disabled+0xce/0xe0 refresh_cpu_vm_stats+0x235/0x5f0 vmstat_update+0x17/0xa0 process_one_work+0x869/0x1aa0 worker_thread+0x5e5/0x1100 kthread+0x29e/0x380 ret_from_fork+0x2d/0x70 ret_from_fork_asm+0x1a/0x30 So, for mm/vmstat:online, disable vmstat_work reliably on teardown and symmetrically enable it on startup. For secondary CPUs during CPU hotplug scenarios, ensure the delayed work is disabled immediately after the initialization. These CPUs are not yet online when start_shepherd_timer() runs on boot CPU. vmstat_cpu_online() will enable the work for them. Link: https://lkml.kernel.org/r/20250108042807.3429745-1-koichiro.den@canonical.com Signed-off-by: Huacai Chen Signed-off-by: Koichiro Den Suggested-by: Huacai Chen Tested-by: Charalampos Mitrodimas Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- mm/vmstat.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 4d016314a56c..16bfe1c694dd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -2122,10 +2122,20 @@ static void __init start_shepherd_timer(void) { int cpu; - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); + /* + * For secondary CPUs during CPU hotplug scenarios, + * vmstat_cpu_online() will enable the work. + * mm/vmstat:online enables and disables vmstat_work + * symmetrically during CPU hotplug events. 
+ */ + if (!cpu_online(cpu)) + disable_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + } + schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); } @@ -2148,13 +2158,14 @@ static int vmstat_cpu_online(unsigned int cpu) if (!node_state(cpu_to_node(cpu), N_CPU)) { node_set_state(cpu_to_node(cpu), N_CPU); } + enable_delayed_work(&per_cpu(vmstat_work, cpu)); return 0; } static int vmstat_cpu_down_prep(unsigned int cpu) { - cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); + disable_delayed_work_sync(&per_cpu(vmstat_work, cpu)); return 0; } -- 2.51.0 From bd3d56ffa2c450364acf02663ba88996da37079d Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Thu, 9 Jan 2025 00:05:39 -0600 Subject: [PATCH 04/16] mm: vmscan : pgdemote vmstat is not getting updated when MGLRU is enabled. When MGLRU is enabled, the pgdemote_kswapd, pgdemote_direct, and pgdemote_khugepaged stats in vmstat are not being updated. Commit f77f0c751478 ("mm,memcg: provide per-cgroup counters for NUMA balancing operations") moved the pgdemote vmstat update from demote_folio_list() to shrink_inactive_list(), which is in the normal LRU path. As a result, the pgdemote stats are updated correctly for the normal LRU but not for MGLRU. To address this, we have added the pgdemote stat update in the evict_folios() function, which is in the MGLRU path. With this patch, the pgdemote stats will now be updated correctly when MGLRU is enabled. Without this patch vmstat output when MGLRU is enabled ====================================================== pgdemote_kswapd 0 pgdemote_direct 0 pgdemote_khugepaged 0 With this patch vmstat output when MGLRU is enabled =================================================== pgdemote_kswapd 43234 pgdemote_direct 4691 pgdemote_khugepaged 0 Link: https://lkml.kernel.org/r/20250109060540.451261-1-donettom@linux.ibm.com Fixes: f77f0c751478 ("mm,memcg: provide per-cgroup counters for NUMA balancing operations") Signed-off-by: Donet Tom Acked-by: Yu Zhao Tested-by: Li Zhijian Reviewed-by: Li Zhijian Cc: Aneesh Kumar K.V (Arm) Cc: David Rientjes Cc: Johannes Weiner Cc: Kaiyang Zhao Cc: Michal Hocko Cc: Muchun Song Cc: Ritesh Harjani (IBM) Cc: Roman Gushchin Cc: Shakeel Butt Cc: Wei Xu Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9a859b7d18d7..b1ec5ece067e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4642,6 +4642,9 @@ retry: reset_batch_size(walk); } + __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(), + stat.nr_demoted); + item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, reclaimed); -- 2.51.0 From 1c47c57818ad73d2d09ddbcb4839708aab5ff2e3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 10 Jan 2025 16:32:57 +0000 Subject: [PATCH 05/16] mm: fix assertion in folio_end_read() We only need to assert that the uptodate flag is clear if we're going to set it. This hasn't been a problem before now because we have only used folio_end_read() when completing with an error, but it's convenient to use it in squashfs if we discover the folio is already uptodate. 
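As a standalone sketch of the relaxed check (a toy model using a plain flags word, not the real folio code): the uptodate bit only has to be clear when the completion is actually going to set it, so finishing with success == false on an already-uptodate folio no longer trips the assertion.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define F_LOCKED        (1u << 0)
#define F_UPTODATE      (1u << 1)

static void end_read(unsigned int *flags, bool success)
{
        assert(*flags & F_LOCKED);
        /* was: assert(!(*flags & F_UPTODATE)); */
        assert(!(success && (*flags & F_UPTODATE)));

        if (success)
                *flags |= F_UPTODATE;
        *flags &= ~F_LOCKED;            /* drop the lock in the same step */
}

int main(void)
{
        /* the read completes on a folio that is locked but already uptodate */
        unsigned int flags = F_LOCKED | F_UPTODATE;

        end_read(&flags, false);        /* allowed with the relaxed assertion */
        printf("flags: %#x\n", flags);
        return 0;
}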
Link: https://lkml.kernel.org/r/20250110163300.3346321-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Phillip Lougher Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 118fa1e0bafe..4f476411a9a2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1523,7 +1523,7 @@ void folio_end_read(struct folio *folio, bool success) /* Must be in bottom byte for x86 to work */ BUILD_BUG_ON(PG_uptodate > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); + VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio); if (likely(success)) mask |= 1 << PG_uptodate; -- 2.51.0 From cbc5dde0a461240046e8a41c43d7c3b76d5db952 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 10 Jan 2025 10:28:21 -0500 Subject: [PATCH 06/16] fs/proc: fix softlockup in __read_vmcore (part 2) Since commit 5cbcb62dddf5 ("fs/proc: fix softlockup in __read_vmcore") the number of softlockups in __read_vmcore at kdump time have gone down, but they still happen sometimes. In a memory constrained environment like the kdump image, a softlockup is not just a harmless message, but it can interfere with things like RCU freeing memory, causing the crashdump to get stuck. The second loop in __read_vmcore has a lot more opportunities for natural sleep points, like scheduling out while waiting for a data write to happen, but apparently that is not always enough. Add a cond_resched() to the second loop in __read_vmcore to (hopefully) get rid of the softlockups. Link: https://lkml.kernel.org/r/20250110102821.2a37581b@fangorn Fixes: 5cbcb62dddf5 ("fs/proc: fix softlockup in __read_vmcore") Signed-off-by: Rik van Riel Reported-by: Breno Leitao Cc: Baoquan He Cc: Dave Young Cc: Vivek Goyal Cc: Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 3d8a82cee63e..658bf199d424 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -404,6 +404,8 @@ static ssize_t __read_vmcore(struct iov_iter *iter, loff_t *fpos) if (!iov_iter_count(iter)) return acc; } + + cond_resched(); } return acc; -- 2.51.0 From 002ebb925e2fcf150b0e346a960060ea4789ff01 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 25 Nov 2024 02:41:56 +0000 Subject: [PATCH 07/16] maple_tree: use mas_next_slot() directly The loop condition makes sure (mas.last < max), so we can directly use mas_next_slot() here. Since no other use of mas_next_entry(), it is removed. Link: https://lkml.kernel.org/r/20241125024156.26093-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 047397136f15..36e603645a30 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4745,29 +4745,6 @@ again: return entry; } -/* - * mas_next_entry() - Internal function to get the next entry. - * @mas: The maple state - * @limit: The maximum range start. - * - * Set the @mas->node to the next entry and the range_start to - * the beginning value for the entry. Does not check beyond @limit. - * Sets @mas->index and @mas->last to the range, Does not update @mas->index and - * @mas->last on overflow. - * Restarts on dead nodes. - * - * Return: the next entry or %NULL. 
- */ -static inline void *mas_next_entry(struct ma_state *mas, unsigned long limit) -{ - if (mas->last >= limit) { - mas->status = ma_overflow; - return NULL; - } - - return mas_next_slot(mas, limit, false); -} - /* * mas_rev_awalk() - Internal function. Reverse allocation walk. Find the * highest gap address of a given size in a given node and descend. @@ -6938,7 +6915,7 @@ retry: goto unlock; while (mas_is_active(&mas) && (mas.last < max)) { - entry = mas_next_entry(&mas, max); + entry = mas_next_slot(&mas, max, false); if (likely(entry && !xa_is_zero(entry))) break; } -- 2.51.0 From a882dd92de83f7d1634dbde6f1d065d6ac73546e Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Wed, 27 Nov 2024 13:53:29 +0000 Subject: [PATCH 08/16] mm/zswap: add LRU_STOP to comment about dropping the lru lock This function has been able to return LRU_STOP since commit b49547ade38a ("mm/zswap: stop lru list shrinking when encounter warm region"). To reduce confusion, update the comment to also list LRU_STOP as an option. Link: https://lkml.kernel.org/r/20241127-lru-stop-comment-v1-1-f54a7cba9429@google.com Signed-off-by: Alice Ryhl Acked-by: Johannes Weiner Reviewed-by: Chengming Zhou Cc: Alice Ryhl Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 30f5a27a6862..167ae641379f 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1186,7 +1186,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o /* * It's safe to drop the lock here because we return either - * LRU_REMOVED_RETRY or LRU_RETRY. + * LRU_REMOVED_RETRY, LRU_RETRY or LRU_STOP. */ spin_unlock(&l->lock); -- 2.51.0 From bfc1d1782984903457b2707d05a35b24ce5fbea1 Mon Sep 17 00:00:00 2001 From: Donet Tom Date: Tue, 26 Nov 2024 09:56:54 -0600 Subject: [PATCH 09/16] mm: migrate: remove unused argument vma from migrate_misplaced_folio() Commit ee86814b0562 ("mm/migrate: move NUMA hinting fault folio isolation + checks under PTL") removed the code that had used the vma argument in migrate_misplaced_folio. Since the vma argument was no longer used in migrate_misplaced_folio, this patch removes it. 
Link: https://lkml.kernel.org/r/20241126155655.466186-1-donettom@linux.ibm.com Signed-off-by: Donet Tom Reviewed-by: Baolin Wang Reviewed-by: Zi Yan Acked-by: David Hildenbrand Cc: Ritesh Harjani (IBM) Signed-off-by: Andrew Morton --- include/linux/migrate.h | 6 ++---- mm/huge_memory.c | 2 +- mm/memory.c | 2 +- mm/migrate.c | 3 +-- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 002e49b2ebd9..29919faea2f1 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -144,16 +144,14 @@ const struct movable_operations *page_movable_ops(struct page *page) #ifdef CONFIG_NUMA_BALANCING int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node); -int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, - int node); +int migrate_misplaced_folio(struct folio *folio, int node); #else static inline int migrate_misplaced_folio_prepare(struct folio *folio, struct vm_area_struct *vma, int node) { return -EAGAIN; /* can't migrate now */ } -static inline int migrate_misplaced_folio(struct folio *folio, - struct vm_area_struct *vma, int node) +static inline int migrate_misplaced_folio(struct folio *folio, int node) { return -EAGAIN; /* can't migrate now */ } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index db64116a4f84..45901dc6710c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2003,7 +2003,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) spin_unlock(vmf->ptl); writable = false; - if (!migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { flags |= TNF_MIGRATED; nid = target_nid; task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); diff --git a/mm/memory.c b/mm/memory.c index 398c031be9ba..78b6741ae593 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5625,7 +5625,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) ignore_writable = true; /* Migrate to the requested node */ - if (!migrate_misplaced_folio(folio, vma, target_nid)) { + if (!migrate_misplaced_folio(folio, target_nid)) { nid = target_nid; flags |= TNF_MIGRATED; task_numa_fault(last_cpupid, nid, nr_pages, flags); diff --git a/mm/migrate.c b/mm/migrate.c index cc68583c86f9..e9e00d1d1d19 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2683,8 +2683,7 @@ int migrate_misplaced_folio_prepare(struct folio *folio, * elevated reference count on the folio. This function will un-isolate the * folio, dereferencing the folio before returning. */ -int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, - int node) +int migrate_misplaced_folio(struct folio *folio, int node) { pg_data_t *pgdat = NODE_DATA(node); int nr_remaining; -- 2.51.0 From d4056386aefdcd0637f9bb2bd52279af043f0381 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:33 +0000 Subject: [PATCH 10/16] mm/page_alloc: cache page_zone() result in free_unref_page() Patch series "Allocate and free frozen pages", v3. Slab does not need to use the page refcount at all, and it can avoid an atomic operation on page free. Hugetlb wants to delay setting the refcount until it has assembled a complete gigantic page. We already have the ability to freeze a page (safely reduce its reference count to 0), so this patchset adds APIs to allocate and free pages which are in a frozen state. 
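For reference, a rough standalone model of what "freezing" means here, loosely mirroring page_ref_freeze()/page_ref_unfreeze() from include/linux/page_ref.h (illustrative userspace code, not the kernel implementation): the refcount is dropped straight to 0 only if it still has the expected value, i.e. only if nobody else holds a reference.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page {
        atomic_int refcount;
};

/* drop the count to 0 only if it is exactly "expected" (no other users) */
static bool ref_freeze(struct fake_page *p, int expected)
{
        return atomic_compare_exchange_strong(&p->refcount, &expected, 0);
}

static void ref_unfreeze(struct fake_page *p, int count)
{
        atomic_store(&p->refcount, count);      /* publish the page again */
}

int main(void)
{
        struct fake_page page = { .refcount = 1 };

        if (ref_freeze(&page, 1)) {
                /*
                 * The page is "frozen": its refcount is 0, so speculative
                 * lookups cannot grab a reference and the owner may rework it
                 * freely (e.g. assemble a gigantic page) before unfreezing.
                 */
                ref_unfreeze(&page, 1);
        }
        printf("refcount: %d\n", atomic_load(&page.refcount));
        return 0;
}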
This patchset is also a step towards the Glorious Future in which struct page doesn't have a refcount; the users which need a refcount will have one in their per-allocation memdesc. This patch (of 15): Save 17 bytes of text by calculating page_zone() once instead of twice. Link: https://lkml.kernel.org/r/20241125210149.2976098-1-willy@infradead.org Link: https://lkml.kernel.org/r/20241125210149.2976098-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Miaohe Lin Reviewed-by: Muchun Song Acked-by: Mel Gorman Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cae7b93864c2..a44f9ff04b1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2666,16 +2666,16 @@ void free_unref_page(struct page *page, unsigned int order) * get those areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ + zone = page_zone(page); migratetype = get_pfnblock_migratetype(page, pfn); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(page_zone(page), page, pfn, order, FPI_NONE); + free_one_page(zone, page, pfn, order, FPI_NONE); return; } migratetype = MIGRATE_MOVABLE; } - zone = page_zone(page); pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { -- 2.51.0 From 38558b2460d7881a3de3bdc31a23fa7034384d00 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:34 +0000 Subject: [PATCH 11/16] mm: make alloc_pages_mpol() static All callers outside mempolicy.c now use folio_alloc_mpol() thanks to Kefeng's cleanups, so we can remove this as a visible symbol. And also remove the alloc_hooks for alloc_pages_mpol(), since all users in mempolicy.c are using the nonprof version. 
Link: https://lkml.kernel.org/r/20241125210149.2976098-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 -------- mm/mempolicy.c | 8 ++++---- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b0fe9f62d15b..c96d5d7f7b89 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -300,8 +300,6 @@ static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); -struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, - struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); @@ -312,11 +310,6 @@ static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order { return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); } -static inline struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, - struct mempolicy *mpol, pgoff_t ilx, int nid) -{ - return alloc_pages_noprof(gfp, order); -} static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node_noprof(gfp, order, numa_node_id()); @@ -331,7 +324,6 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde #endif #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) -#define alloc_pages_mpol(...) alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) #define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 162407fbf2bc..e092aff55e2d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2222,7 +2222,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, +static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { nodemask_t *nodemask; @@ -2285,7 +2285,7 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *pol, pgoff_t ilx, int nid) { - return page_rmappable_folio(alloc_pages_mpol_noprof(gfp | __GFP_COMP, + return page_rmappable_folio(alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, nid)); } @@ -2300,7 +2300,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the * VMA to prevent it from going away. Should be used for all allocations * for folios that will be mapped into user space, excepting hugetlbfs, and - * excepting where direct use of alloc_pages_mpol() is more appropriate. + * excepting where direct use of folio_alloc_mpol() is more appropriate. * * Return: The folio on success or NULL if allocation fails. 
*/ @@ -2346,7 +2346,7 @@ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); - return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX, + return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX, numa_node_id()); } EXPORT_SYMBOL(alloc_pages_noprof); -- 2.51.0 From 520128a1d1f06657cf52d7d87252ea9a10729256 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:35 +0000 Subject: [PATCH 12/16] mm/page_alloc: export free_frozen_pages() instead of free_unref_page() We already have the concept of "frozen pages" (eg page_ref_freeze()), so let's not complicate things by also having the concept of "unref pages". Link: https://lkml.kernel.org/r/20241125210149.2976098-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: William Kucharski Reviewed-by: Miaohe Lin Reviewed-by: Muchun Song Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/internal.h | 2 +- mm/page_alloc.c | 18 +++++++++--------- mm/page_frag_cache.c | 6 +++--- mm/swap.c | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 9826f7dce607..b650a7cb7b46 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -741,7 +741,7 @@ extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; -void free_unref_page(struct page *page, unsigned int order); +void free_frozen_pages(struct page *page, unsigned int order); void free_unref_folios(struct folio_batch *fbatch); extern void zone_pcp_reset(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a44f9ff04b1a..03f2491d13d2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2592,9 +2592,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return high; } -static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, - struct page *page, int migratetype, - unsigned int order) +static void free_frozen_page_commit(struct zone *zone, + struct per_cpu_pages *pcp, struct page *page, int migratetype, + unsigned int order) { int high, batch; int pindex; @@ -2643,7 +2643,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, /* * Free a pcp page */ -void free_unref_page(struct page *page, unsigned int order) +void free_frozen_pages(struct page *page, unsigned int order) { unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; @@ -2679,7 +2679,7 @@ void free_unref_page(struct page *page, unsigned int order) pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, migratetype, order); + free_frozen_page_commit(zone, pcp, page, migratetype, order); pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, FPI_NONE); @@ -2743,7 +2743,7 @@ void free_unref_folios(struct folio_batch *folios) /* * Free isolated pages directly to the - * allocator, see comment in free_unref_page. + * allocator, see comment in free_frozen_pages. 
*/ if (is_migrate_isolate(migratetype)) { free_one_page(zone, &folio->page, pfn, @@ -2774,7 +2774,7 @@ void free_unref_folios(struct folio_batch *folios) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(&folio->page); - free_unref_page_commit(zone, pcp, &folio->page, migratetype, + free_frozen_page_commit(zone, pcp, &folio->page, migratetype, order); } @@ -4837,11 +4837,11 @@ void __free_pages(struct page *page, unsigned int order) struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) - free_unref_page(page, order); + free_frozen_pages(page, order); else if (!head) { pgalloc_tag_sub_pages(tag, (1 << order) - 1); while (order-- > 0) - free_unref_page(page + (1 << order), order); + free_frozen_pages(page + (1 << order), order); } } EXPORT_SYMBOL(__free_pages); diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c index 3f7a203d35c6..d2423f30577e 100644 --- a/mm/page_frag_cache.c +++ b/mm/page_frag_cache.c @@ -86,7 +86,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) - free_unref_page(page, compound_order(page)); + free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(__page_frag_cache_drain); @@ -138,7 +138,7 @@ refill: goto refill; if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) { - free_unref_page(page, + free_frozen_pages(page, encoded_page_decode_order(encoded_page)); goto refill; } @@ -166,6 +166,6 @@ void page_frag_free(void *addr) struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) - free_unref_page(page, compound_order(page)); + free_frozen_pages(page, compound_order(page)); } EXPORT_SYMBOL(page_frag_free); diff --git a/mm/swap.c b/mm/swap.c index 10decd9dffa1..3a01acfd5a89 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -109,7 +109,7 @@ void __folio_put(struct folio *folio) page_cache_release(folio); folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); - free_unref_page(&folio->page, folio_order(folio)); + free_frozen_pages(&folio->page, folio_order(folio)); } EXPORT_SYMBOL(__folio_put); -- 2.51.0 From 8fd10a892a8db797fffb59a9a60bce23a56eef46 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:36 +0000 Subject: [PATCH 13/16] mm/page_alloc: move set_page_refcounted() to callers of post_alloc_hook() In preparation for allocating frozen pages, stop initialising the page refcount in post_alloc_hook(). Link: https://lkml.kernel.org/r/20241125210149.2976098-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Miaohe Lin Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/compaction.c | 2 ++ mm/internal.h | 3 +-- mm/page_alloc.c | 3 ++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index a2b16b08cbbf..07bd22789f07 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -83,6 +83,7 @@ static inline bool is_via_compact_memory(int order) { return false; } static struct page *mark_allocated_noprof(struct page *page, unsigned int order, gfp_t gfp_flags) { post_alloc_hook(page, order, __GFP_MOVABLE); + set_page_refcounted(page); return page; } #define mark_allocated(...) 
alloc_hooks(mark_allocated_noprof(__VA_ARGS__)) @@ -1868,6 +1869,7 @@ again: dst = (struct folio *)freepage; post_alloc_hook(&dst->page, order, __GFP_MOVABLE); + set_page_refcounted(&dst->page); if (order) prep_compound_page(&dst->page, order); cc->nr_freepages -= 1 << order; diff --git a/mm/internal.h b/mm/internal.h index b650a7cb7b46..9c941af5bdb6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -735,8 +735,7 @@ static inline void prep_compound_tail(struct page *head, int tail_idx) extern void prep_compound_page(struct page *page, unsigned int order); -extern void post_alloc_hook(struct page *page, unsigned int order, - gfp_t gfp_flags); +void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern bool free_pages_prepare(struct page *page, unsigned int order); extern int user_min_free_kbytes; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03f2491d13d2..a3072047c705 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1508,7 +1508,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order, int i; set_page_private(page, 0); - set_page_refcounted(page); arch_alloc_page(page, order); debug_pagealloc_map_pages(page, 1 << order); @@ -1564,6 +1563,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags unsigned int alloc_flags) { post_alloc_hook(page, order, gfp_flags); + set_page_refcounted(page); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -6360,6 +6360,7 @@ static void split_free_pages(struct list_head *list) int i; post_alloc_hook(page, order, __GFP_MOVABLE); + set_page_refcounted(page); if (!order) continue; -- 2.51.0 From ee66e9c34fd3aeab3a7c2cda300f852aa5363609 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:37 +0000 Subject: [PATCH 14/16] mm/page_alloc: move set_page_refcounted() to callers of prep_new_page() In preparation for allocating frozen pages, stop initialising the page refcount in prep_new_page(). 
Link: https://lkml.kernel.org/r/20241125210149.2976098-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Acked-by: David Hildenbrand Reviewed-by: Vlastimil Babka Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a3072047c705..08cc1e0bd950 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1563,7 +1563,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags unsigned int alloc_flags) { post_alloc_hook(page, order, gfp_flags); - set_page_refcounted(page); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -3474,6 +3473,7 @@ try_this_zone: gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); + set_page_refcounted(page); /* * If this is a high-order atomic allocation then check @@ -3698,8 +3698,10 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, count_vm_event(COMPACTSTALL); /* Prep a captured page if available */ - if (page) + if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); + set_page_refcounted(page); + } /* Try get a page from the freelist if available */ if (!page) @@ -4678,6 +4680,7 @@ retry_this_zone: nr_account++; prep_new_page(page, 0, gfp, 0); + set_page_refcounted(page); if (page_list) list_add(&page->lru, page_list); else @@ -6500,6 +6503,7 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end, check_new_pages(head, order); prep_new_page(head, order, gfp_mask, 0); + set_page_refcounted(head); } else { ret = -EINVAL; WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n", -- 2.51.0 From efabfe1420f5245938871a9578160f32277c8e21 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:38 +0000 Subject: [PATCH 15/16] mm/page_alloc: move set_page_refcounted() to callers of get_page_from_freelist() In preparation for allocating frozen pages, stop initialising the page refcount in get_page_from_freelist(). 
Link: https://lkml.kernel.org/r/20241125210149.2976098-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 08cc1e0bd950..2786117a50ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3473,7 +3473,6 @@ try_this_zone: gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); - set_page_refcounted(page); /* * If this is a high-order atomic allocation then check @@ -3568,6 +3567,8 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); + if (page) + set_page_refcounted(page); return page; } @@ -3606,8 +3607,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) & ~__GFP_DIRECT_RECLAIM, order, ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); - if (page) + if (page) { + set_page_refcounted(page); goto out; + } /* Coredumps can quickly deplete all memory reserves */ if (current->flags & PF_DUMPCORE) @@ -3698,10 +3701,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, count_vm_event(COMPACTSTALL); /* Prep a captured page if available */ - if (page) { + if (page) prep_new_page(page, order, gfp_mask, alloc_flags); - set_page_refcounted(page); - } /* Try get a page from the freelist if available */ if (!page) @@ -3710,6 +3711,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, if (page) { struct zone *zone = page_zone(page); + set_page_refcounted(page); zone->compact_blockskip_flush = false; compaction_defer_reset(zone, order, true); count_vm_event(COMPACTSUCCESS); @@ -3968,6 +3970,7 @@ retry: drained = true; goto retry; } + set_page_refcounted(page); out: psi_memstall_leave(&pflags); @@ -4288,8 +4291,10 @@ restart: * that first */ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - if (page) + if (page) { + set_page_refcounted(page); goto got_pg; + } /* * For costly allocations, try direct compaction first, as it's likely @@ -4369,8 +4374,10 @@ retry: /* Attempt with potentially adjusted zonelist and alloc_flags */ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - if (page) + if (page) { + set_page_refcounted(page); goto got_pg; + } /* Caller is not willing to reclaim, we can't balance anything */ if (!can_direct_reclaim) @@ -4754,8 +4761,10 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, /* First allocation attempt */ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); - if (likely(page)) + if (likely(page)) { + set_page_refcounted(page); goto out; + } alloc_gfp = gfp; ac.spread_dirty_pages = false; -- 2.51.0 From 4c9017cc4c59627720b198e22757e17f67dc3590 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 25 Nov 2024 21:01:39 +0000 Subject: [PATCH 16/16] mm/page_alloc: move set_page_refcounted() to callers of __alloc_pages_cpuset_fallback() In preparation for allocating frozen pages, stop initialising the page refcount in __alloc_pages_cpuset_fallback(). 
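As a simplified standalone sketch of where patches 13-16 leave the allocation path (the names below are stand-ins, not the mm/page_alloc.c functions): the inner helpers now hand back pages with the refcount untouched, and only the outermost entry point sets it, which is what lets a future frozen-page entry point skip that step entirely.

#include <stdio.h>

struct fake_page {
        int refcount;
};

static struct fake_page pool[4];
static int next_free;

/* stands in for prep_new_page()/post_alloc_hook(): no refcount init here now */
static struct fake_page *inner_alloc(void)
{
        if (next_free >= 4)
                return NULL;
        return &pool[next_free++];      /* refcount is still 0 */
}

/* stands in for the outermost allocation entry point */
static struct fake_page *alloc_page_refcounted(void)
{
        struct fake_page *p = inner_alloc();

        if (p)
                p->refcount = 1;        /* the refcount setup hoisted to the caller */
        return p;
}

/* a frozen-page entry point would simply return inner_alloc()'s result */

int main(void)
{
        struct fake_page *p = alloc_page_refcounted();

        printf("refcount after a normal allocation: %d\n", p ? p->refcount : -1);
        return 0;
}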
Link: https://lkml.kernel.org/r/20241125210149.2976098-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Zi Yan Reviewed-by: Vlastimil Babka Cc: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Mel Gorman Cc: Miaohe Lin Cc: Muchun Song Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/page_alloc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2786117a50ee..40ffc7f41b73 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3566,9 +3566,6 @@ __alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, if (!page) page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac); - - if (page) - set_page_refcounted(page); return page; } @@ -3655,6 +3652,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_NOFAIL) page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_NO_WATERMARKS, ac); + if (page) + set_page_refcounted(page); } out: mutex_unlock(&oom_lock); @@ -4483,8 +4482,10 @@ nopage: * the situation worse. */ page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac); - if (page) + if (page) { + set_page_refcounted(page); goto got_pg; + } cond_resched(); goto retry; -- 2.51.0